From fd99c983711d77967b86a3669057f55edbf11eb8 Mon Sep 17 00:00:00 2001
From: rising-star92
Date: Fri, 21 Feb 2020 16:03:21 +0900
Subject: [PATCH] Standardize code style using Black

This PR proposes to add [Black](https://github.com/psf/black) to Koalas.

- `dev/lint-python` now checks whether the code is formatted with Black.
- Adds a `dev/reformat` script that reformats the code with Black.
- Keeps the 100-character line length and ignores `E231` and `E203` in pycodestyle,
  since Black's automatic formatting violates these two rules. See also
  https://github.com/psf/black/issues/429 and https://github.com/psf/black/issues/1202.
- Updates the contribution guide.

Resolve #755

Co-authored-by: Deepyaman Datta
---
 .github/workflows/master.yml | 4 +
 .travis.yml | 4 +
 databricks/__init__.py | 2 +-
 databricks/conftest.py | 20 +-
 databricks/koalas/__init__.py | 71 +-
 databricks/koalas/base.py | 94 +-
 databricks/koalas/config.py | 130 +-
 databricks/koalas/datetimes.py | 194 +-
 databricks/koalas/exceptions.py | 55 +-
 databricks/koalas/frame.py | 1843 ++++++----
 databricks/koalas/generic.py | 179 +-
 databricks/koalas/groupby.py | 427 ++-
 databricks/koalas/indexes.py | 285 +-
 databricks/koalas/indexing.py | 238 +-
 databricks/koalas/internal.py | 261 +-
 databricks/koalas/missing/__init__.py | 22 +-
 databricks/koalas/missing/common.py | 44 +-
 databricks/koalas/missing/frame.py | 126 +-
 databricks/koalas/missing/groupby.py | 134 +-
 databricks/koalas/missing/indexes.py | 220 +-
 databricks/koalas/missing/series.py | 140 +-
 databricks/koalas/missing/window.py | 30 +-
 databricks/koalas/ml.py | 19 +-
 databricks/koalas/mlflow.py | 11 +-
 databricks/koalas/namespace.py | 572 ++--
 databricks/koalas/numpy_compat.py | 194 +-
 databricks/koalas/plot.py | 830 +++--
 databricks/koalas/series.py | 663 ++--
 databricks/koalas/sql.py | 18 +-
 databricks/koalas/strings.py | 373 +--
 databricks/koalas/testing/utils.py | 53 +-
 databricks/koalas/tests/test_config.py | 100 +-
 databricks/koalas/tests/test_csv.py | 204 +-
 databricks/koalas/tests/test_dataframe.py | 2805 ++++++++++-------
 .../koalas/tests/test_dataframe_conversion.py | 126 +-
 .../koalas/tests/test_dataframe_spark_io.py | 138 +-
 databricks/koalas/tests/test_default_index.py | 19 +-
 databricks/koalas/tests/test_expanding.py | 41 +-
 databricks/koalas/tests/test_frame_plot.py | 210 +-
 databricks/koalas/tests/test_groupby.py | 1668 ++++----
 databricks/koalas/tests/test_indexes.py | 636 ++--
 databricks/koalas/tests/test_indexing.py | 547 ++--
 databricks/koalas/tests/test_internal.py | 31 +-
 databricks/koalas/tests/test_namespace.py | 88 +-
 databricks/koalas/tests/test_numpy_compat.py | 50 +-
 .../koalas/tests/test_ops_on_diff_frames.py | 524 +--
 databricks/koalas/tests/test_repr.py | 12 +-
 databricks/koalas/tests/test_reshape.py | 197 +-
 databricks/koalas/tests/test_rolling.py | 45 +-
 databricks/koalas/tests/test_series.py | 948 +++---
 .../koalas/tests/test_series_conversion.py | 17 +-
 .../koalas/tests/test_series_datetime.py | 68 +-
 databricks/koalas/tests/test_series_plot.py | 218 +-
 databricks/koalas/tests/test_series_string.py | 184 +-
 databricks/koalas/tests/test_sql.py | 11 +-
 databricks/koalas/tests/test_stats.py | 94 +-
 databricks/koalas/tests/test_utils.py | 31 +-
 databricks/koalas/tests/test_window.py | 319 +-
 databricks/koalas/typedef.py | 80 +-
 databricks/koalas/usage_logging/__init__.py | 111 +-
 .../koalas/usage_logging/usage_logger.py | 76 +-
 databricks/koalas/utils.py | 155 +-
 databricks/koalas/version.py | 2 +-
 databricks/koalas/window.py | 97 +-
 dev/lint-python | 33 +-
dev/reformat | 25 + dev/tox.ini | 2 +- docs/source/development/contributing.rst | 2 +- requirements-dev.txt | 3 + 69 files changed, 9988 insertions(+), 7185 deletions(-) create mode 100755 dev/reformat diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml index 03340ba..c889fc2 100644 --- a/.github/workflows/master.yml +++ b/.github/workflows/master.yml @@ -52,6 +52,10 @@ jobs: ./dev/download_travis_dependencies.sh sudo apt-get install xclip pip install setuptools + # Currently PIP with Python 3.5 removes Black in the requirements-dev.txt file + # as Black only works with Python 3.6+. This is hacky but we will drop + # Python 3.5 soon so it's fine. + sed -i '/black/d' requirements-dev.txt pip install -r requirements-dev.txt pip install pandas==$PANDAS_VERSION pyarrow==$PYARROW_VERSION pip list diff --git a/.travis.yml b/.travis.yml index 0ad930a..c25a954 100644 --- a/.travis.yml +++ b/.travis.yml @@ -67,8 +67,12 @@ install: # Test PyPI installation at Python 3.5. This is also because # one of the dependency requires Python 3.6 Conda specifically. + # Currently PIP with Python 3.5 removes Black in the requirements-dev.txt file + # as Black only works with Python 3.6+. This is hacky but we will drop + # Python 3.5 soon so it's fine. - | if [[ $TRAVIS_PYTHON_VERSION == "3.5" ]]; then + sed -i '/black/d' requirements-dev.txt && \ pip install -r requirements-dev.txt && \ pip install pandas==$PANDAS_VERSION pyarrow==$PYARROW_VERSION && \ pip list; diff --git a/databricks/__init__.py b/databricks/__init__.py index 7aef67c..1933dce 100644 --- a/databricks/__init__.py +++ b/databricks/__init__.py @@ -15,4 +15,4 @@ # # https://packaging.python.org/guides/packaging-namespace-packages/#pkgutil-style-namespace-packages -__path__ = __import__('pkgutil').extend_path(__path__, __name__) # type: ignore +__path__ = __import__("pkgutil").extend_path(__path__, __name__) # type: ignore diff --git a/databricks/conftest.py b/databricks/conftest.py index ecc0101..afa209e 100644 --- a/databricks/conftest.py +++ b/databricks/conftest.py @@ -32,9 +32,7 @@ from databricks.koalas import utils -shared_conf = { - "spark.sql.shuffle.partitions": "4" -} +shared_conf = {"spark.sql.shuffle.partitions": "4"} # Initialize Spark session that should be used in doctests or unittests. # Delta requires Spark 2.4.2+. See # https://github.com/delta-io/delta#compatibility-with-apache-spark-versions. @@ -48,7 +46,7 @@ session = utils.default_session(shared_conf) -@pytest.fixture(scope='session', autouse=True) +@pytest.fixture(scope="session", autouse=True) def session_termination(): yield # Share one session across all the tests. 
Repeating starting and stopping sessions and contexts @@ -58,33 +56,33 @@ def session_termination(): @pytest.fixture(autouse=True) def add_ks(doctest_namespace): - doctest_namespace['ks'] = koalas + doctest_namespace["ks"] = koalas @pytest.fixture(autouse=True) def add_pd(doctest_namespace): if os.getenv("PANDAS_VERSION", None) is not None: assert pd.__version__ == os.getenv("PANDAS_VERSION") - doctest_namespace['pd'] = pd + doctest_namespace["pd"] = pd @pytest.fixture(autouse=True) def add_pa(doctest_namespace): if os.getenv("PYARROW_VERSION", None) is not None: assert pa.__version__ == os.getenv("PYARROW_VERSION") - doctest_namespace['pa'] = pa + doctest_namespace["pa"] = pa @pytest.fixture(autouse=True) def add_np(doctest_namespace): - doctest_namespace['np'] = numpy + doctest_namespace["np"] = numpy @pytest.fixture(autouse=True) def add_path(doctest_namespace): path = tempfile.mkdtemp() atexit.register(lambda: shutil.rmtree(path, ignore_errors=True)) - doctest_namespace['path'] = path + doctest_namespace["path"] = path @pytest.fixture(autouse=True) @@ -92,12 +90,12 @@ def add_db(doctest_namespace): db_name = "db%s" % str(uuid.uuid4()).replace("-", "") session.sql("CREATE DATABASE %s" % db_name) atexit.register(lambda: session.sql("DROP DATABASE IF EXISTS %s CASCADE" % db_name)) - doctest_namespace['db'] = db_name + doctest_namespace["db"] = db_name @pytest.fixture(autouse=os.getenv("KOALAS_USAGE_LOGGER", None) is not None) def add_caplog(caplog): - with caplog.at_level(logging.INFO, logger='databricks.koalas.usage_logger'): + with caplog.at_level(logging.INFO, logger="databricks.koalas.usage_logger"): yield diff --git a/databricks/koalas/__init__.py b/databricks/koalas/__init__.py index 6b96f56..e12439f 100644 --- a/databricks/koalas/__init__.py +++ b/databricks/koalas/__init__.py @@ -21,18 +21,23 @@ def assert_pyspark_version(): import logging + pyspark_ver = None try: import pyspark except ImportError: - raise ImportError('Unable to import pyspark - consider doing a pip install with [spark] ' - 'extra to install pyspark with pip') + raise ImportError( + "Unable to import pyspark - consider doing a pip install with [spark] " + "extra to install pyspark with pip" + ) else: - pyspark_ver = getattr(pyspark, '__version__') - if pyspark_ver is None or pyspark_ver < '2.4': + pyspark_ver = getattr(pyspark, "__version__") + if pyspark_ver is None or pyspark_ver < "2.4": logging.warning( - 'Found pyspark version "{}" installed. pyspark>=2.4.0 is recommended.' - .format(pyspark_ver if pyspark_ver is not None else '')) + 'Found pyspark version "{}" installed. pyspark>=2.4.0 is recommended.'.format( + pyspark_ver if pyspark_ver is not None else "" + ) + ) assert_pyspark_version() @@ -40,8 +45,9 @@ def assert_pyspark_version(): import pyspark import pyarrow -if LooseVersion(pyarrow.__version__) >= LooseVersion("0.15") and \ - LooseVersion(pyspark.__version__) < LooseVersion("3.0"): +if LooseVersion(pyarrow.__version__) >= LooseVersion("0.15") and LooseVersion( + pyspark.__version__ +) < LooseVersion("3.0"): # This is required to support PyArrow 0.15 in PySpark versions lower than 3.0. # See SPARK-29367. 
os.environ["ARROW_PRE_0_15_IPC_FORMAT"] = "1" @@ -53,10 +59,31 @@ def assert_pyspark_version(): from databricks.koalas.config import get_option, set_option, reset_option, options from databricks.koalas.groupby import NamedAgg -__all__ = ['read_csv', 'read_parquet', 'to_datetime', 'from_pandas', - 'get_dummies', 'DataFrame', 'Series', 'Index', 'MultiIndex', 'pandas_wraps', - 'sql', 'range', 'concat', 'melt', 'get_option', 'set_option', 'reset_option', - 'read_sql_table', 'read_sql_query', 'read_sql', 'options', 'option_context', 'NamedAgg'] +__all__ = [ + "read_csv", + "read_parquet", + "to_datetime", + "from_pandas", + "get_dummies", + "DataFrame", + "Series", + "Index", + "MultiIndex", + "pandas_wraps", + "sql", + "range", + "concat", + "melt", + "get_option", + "set_option", + "reset_option", + "read_sql_table", + "read_sql_query", + "read_sql", + "options", + "option_context", + "NamedAgg", +] def _auto_patch(): @@ -68,21 +95,29 @@ def _auto_patch(): if logger_module is not None: try: from databricks.koalas import usage_logging + usage_logging.attach(logger_module) except Exception as e: from pyspark.util import _exception_message - logger = logging.getLogger('databricks.koalas.usage_logger') - logger.warning('Tried to attach usage logger `{}`, but an exception was raised: {}' - .format(logger_module, _exception_message(e))) + + logger = logging.getLogger("databricks.koalas.usage_logger") + logger.warning( + "Tried to attach usage logger `{}`, but an exception was raised: {}".format( + logger_module, _exception_message(e) + ) + ) # Autopatching is on by default. x = os.getenv("SPARK_KOALAS_AUTOPATCH", "true") if x.lower() in ("true", "1", "enabled"): - logger = logging.getLogger('spark') - logger.info("Patching spark automatically. You can disable it by setting " - "SPARK_KOALAS_AUTOPATCH=false in your environment") + logger = logging.getLogger("spark") + logger.info( + "Patching spark automatically. You can disable it by setting " + "SPARK_KOALAS_AUTOPATCH=false in your environment" + ) from pyspark.sql import dataframe as df + df.DataFrame.to_koalas = DataFrame.to_koalas diff --git a/databricks/koalas/base.py b/databricks/koalas/base.py index e3b3881..fb89e7f 100644 --- a/databricks/koalas/base.py +++ b/databricks/koalas/base.py @@ -30,8 +30,11 @@ from databricks import koalas as ks # For running doctests and reference resolution in PyCharm. from databricks.koalas import numpy_compat -from databricks.koalas.internal import (_InternalFrame, NATURAL_ORDER_COLUMN_NAME, - SPARK_DEFAULT_INDEX_NAME) +from databricks.koalas.internal import ( + _InternalFrame, + NATURAL_ORDER_COLUMN_NAME, + SPARK_DEFAULT_INDEX_NAME, +) from databricks.koalas.typedef import pandas_wraps, spark_type_to_pandas_dtype from databricks.koalas.utils import align_diff_series, scol_for, validate_axis from databricks.koalas.frame import DataFrame @@ -41,8 +44,10 @@ def booleanize_null(left_scol, scol, f): """ Booleanize Null in Spark Column """ - comp_ops = [getattr(spark.Column, '__{}__'.format(comp_op)) - for comp_op in ['eq', 'ne', 'lt', 'le', 'ge', 'gt']] + comp_ops = [ + getattr(spark.Column, "__{}__".format(comp_op)) + for comp_op in ["eq", "ne", "lt", "le", "ge", "gt"] + ] if f in comp_ops: # if `f` is "!=", fill null with True otherwise False @@ -69,6 +74,7 @@ def _column_op(f): :param self: Koalas Series :param args: arguments that the function `f` takes. """ + @wraps(f) def wrapper(self, *args): # It is possible for the function `f` takes other arguments than Spark Column. 
@@ -102,10 +108,11 @@ def wrapper(self, *args): for arg in args: # TODO: This is a quick hack to support NumPy type. We should revisit this. if isinstance(self.spark_type, LongType) and isinstance(arg, np.timedelta64): - new_args.append(float(arg / np.timedelta64(1, 's'))) + new_args.append(float(arg / np.timedelta64(1, "s"))) else: new_args.append(arg) return _column_op(f)(self, *new_args) + return wrapper @@ -114,9 +121,7 @@ def _wrap_accessor_spark(accessor, fn, return_type=None): Wrap an accessor property or method, e.g., Series.dt.date with a spark function. """ if return_type: - return _column_op( - lambda col: fn(col).cast(return_type) - )(accessor._data) + return _column_op(lambda col: fn(col).cast(return_type))(accessor._data) else: return _column_op(fn)(accessor._data) @@ -144,6 +149,7 @@ class IndexOpsMixin(object): def _with_new_scol(self, scol: spark.Column) -> IndexOpsMixin Creates new object with the new column """ + def __init__(self, internal: _InternalFrame, kdf): assert internal is not None assert kdf is not None and isinstance(kdf, DataFrame) @@ -166,7 +172,7 @@ def __add__(self, other): elif isinstance(other, str): return _column_op(F.concat)(self, F.lit(other)) else: - raise TypeError('string addition can only be applied to string series or literals.') + raise TypeError("string addition can only be applied to string series or literals.") else: return _column_op(spark.Column.__add__)(self, other) @@ -175,8 +181,8 @@ def __sub__(self, other): # behaviors. Pandas returns 'timedelta64[ns]' from 'datetime64[ns]'s subtraction. if isinstance(other, IndexOpsMixin) and isinstance(self.spark_type, TimestampType): if not isinstance(other.spark_type, TimestampType): - raise TypeError('datetime subtraction can only be applied to datetime series.') - return self.astype('bigint') - other.astype('bigint') + raise TypeError("datetime subtraction can only be applied to datetime series.") + return self.astype("bigint") - other.astype("bigint") else: return _column_op(spark.Column.__sub__)(self, other) @@ -199,11 +205,13 @@ def __radd__(self, other): def __floordiv__(self, other): return self._with_new_scol( - F.floor(_numpy_column_op(spark.Column.__div__)(self, other)._scol)) + F.floor(_numpy_column_op(spark.Column.__div__)(self, other)._scol) + ) def __rfloordiv__(self, other): return self._with_new_scol( - F.floor(_numpy_column_op(spark.Column.__rdiv__)(self, other)._scol)) + F.floor(_numpy_column_op(spark.Column.__rdiv__)(self, other)._scol) + ) __rmod__ = _column_op(spark.Column.__rmod__) __pow__ = _column_op(spark.Column.__pow__) @@ -229,12 +237,14 @@ def __rfloordiv__(self, other): def __array_ufunc__(self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any): # Try dunder methods first. result = numpy_compat.maybe_dispatch_ufunc_to_dunder_op( - self, ufunc, method, *inputs, **kwargs) + self, ufunc, method, *inputs, **kwargs + ) # After that, we try with PySpark APIs. 
if result is NotImplemented: result = numpy_compat.maybe_dispatch_ufunc_to_spark_func( - self, ufunc, method, *inputs, **kwargs) + self, ufunc, method, *inputs, **kwargs + ) if result is not NotImplemented: return result @@ -533,6 +543,7 @@ def astype(self, dtype): Int64Index([1, 2], dtype='int64', name='a') """ from databricks.koalas.typedef import as_spark_type + spark_type = as_spark_type(dtype) if not spark_type: raise ValueError("Type {} not understood".format(dtype)) @@ -583,9 +594,10 @@ def isin(self, values): Index([True, False, True, False, True, False], dtype='object', name='a') """ if not is_list_like(values): - raise TypeError("only list-like objects are allowed to be passed" - " to isin(), you passed a [{values_type}]" - .format(values_type=type(values).__name__)) + raise TypeError( + "only list-like objects are allowed to be passed" + " to isin(), you passed a [{values_type}]".format(values_type=type(values).__name__) + ) return self._with_new_scol(self._scol.isin(list(values))).rename(self.name) @@ -617,6 +629,7 @@ def isnull(self): Index([False, False, True], dtype='object', name='a') """ from databricks.koalas.indexes import MultiIndex + if isinstance(self, MultiIndex): raise NotImplementedError("isna is not defined for MultiIndex") if isinstance(self.spark_type, (FloatType, DoubleType)): @@ -661,6 +674,7 @@ def notnull(self): Index([True, True, False], dtype='object', name='a') """ from databricks.koalas.indexes import MultiIndex + if isinstance(self, MultiIndex): raise NotImplementedError("notna is not defined for MultiIndex") return (~self.isnull()).rename(self.name) @@ -724,7 +738,7 @@ def all(self, axis: Union[int, str] = 0) -> bool: # any and every was added as of Spark 3.0 # ret = sdf.select(F.expr("every(CAST(`%s` AS BOOLEAN))" % sdf.columns[0])).collect()[0][0] # Here we use min as its alternative: - ret = sdf.select(F.min(F.coalesce(col.cast('boolean'), F.lit(True)))).collect()[0][0] + ret = sdf.select(F.min(F.coalesce(col.cast("boolean"), F.lit(True)))).collect()[0][0] if ret is None: return True else: @@ -787,7 +801,7 @@ def any(self, axis: Union[int, str] = 0) -> bool: # any and every was added as of Spark 3.0 # ret = sdf.select(F.expr("any(CAST(`%s` AS BOOLEAN))" % sdf.columns[0])).collect()[0][0] # Here we use max as its alternative: - ret = sdf.select(F.max(F.coalesce(col.cast('boolean'), F.lit(False)))).collect()[0][0] + ret = sdf.select(F.max(F.coalesce(col.cast("boolean"), F.lit(False)))).collect()[0][0] if ret is None: return False else: @@ -845,11 +859,14 @@ def shift(self, periods=1, fill_value=None): def _shift(self, periods, fill_value, part_cols=()): if not isinstance(periods, int): - raise ValueError('periods should be an int; however, got [%s]' % type(periods)) + raise ValueError("periods should be an int; however, got [%s]" % type(periods)) col = self._scol - window = Window.partitionBy(*part_cols).orderBy(NATURAL_ORDER_COLUMN_NAME) \ + window = ( + Window.partitionBy(*part_cols) + .orderBy(NATURAL_ORDER_COLUMN_NAME) .rowsBetween(-periods, -periods) + ) lag_col = F.lag(col, periods).over(window) col = F.when(lag_col.isNull() | F.isnan(lag_col), fill_value).otherwise(lag_col) return self._with_new_scol(col).rename(self.name) @@ -1018,25 +1035,27 @@ def value_counts(self, normalize=False, sort=True, ascending=False, bins=None, d sdf = sdf_dropna.groupby(scol_for(sdf_dropna, column_name).alias(index_name)).count() if sort: if ascending: - sdf = sdf.orderBy(F.col('count')) + sdf = sdf.orderBy(F.col("count")) else: - sdf = 
sdf.orderBy(F.col('count').desc()) + sdf = sdf.orderBy(F.col("count").desc()) if normalize: sum = sdf_dropna.count() - sdf = sdf.withColumn('count', F.col('count') / F.lit(sum)) + sdf = sdf.withColumn("count", F.col("count") / F.lit(sum)) column_labels = self._internal.column_labels if (column_labels[0] is None) or (None in column_labels[0]): - internal = _InternalFrame(sdf=sdf, - index_map=[(index_name, None)], - column_scols=[scol_for(sdf, 'count')]) + internal = _InternalFrame( + sdf=sdf, index_map=[(index_name, None)], column_scols=[scol_for(sdf, "count")] + ) else: - internal = _InternalFrame(sdf=sdf, - index_map=[(index_name, None)], - column_labels=column_labels, - column_scols=[scol_for(sdf, 'count')], - column_label_names=self._internal.column_label_names) + internal = _InternalFrame( + sdf=sdf, + index_map=[(index_name, None)], + column_labels=column_labels, + column_scols=[scol_for(sdf, "count")], + column_label_names=self._internal.column_label_names, + ) return _col(DataFrame(internal)) @@ -1100,6 +1119,9 @@ def _nunique(self, dropna=True, approx=False, rsd=0.05): if dropna: return count_fn(self._scol).alias(colname) else: - return (count_fn(self._scol) + - F.when(F.count(F.when(self._scol.isNull(), 1) - .otherwise(None)) >= 1, 1).otherwise(0)).alias(colname) + return ( + count_fn(self._scol) + + F.when(F.count(F.when(self._scol.isNull(), 1).otherwise(None)) >= 1, 1).otherwise( + 0 + ) + ).alias(colname) diff --git a/databricks/koalas/config.py b/databricks/koalas/config.py index b16c76f..1d6dd1a 100644 --- a/databricks/koalas/config.py +++ b/databricks/koalas/config.py @@ -26,7 +26,7 @@ from databricks.koalas.utils import default_session -__all__ = ['get_option', 'set_option', 'reset_option', 'options', 'option_context'] +__all__ = ["get_option", "set_option", "reset_option", "options", "option_context"] class Option: @@ -81,13 +81,14 @@ class Option: """ def __init__( - self, - *, - key: str, - doc: str, - default: Any, - types: Union[Tuple[type, ...], type] = str, - check_func: Tuple[Callable[[Any], bool], str] = (lambda v: True, "")): + self, + *, + key: str, + doc: str, + default: Any, + types: Union[Tuple[type, ...], type] = str, + check_func: Tuple[Callable[[Any], bool], str] = (lambda v: True, "") + ): self.key = key self.doc = doc self.default = default @@ -99,8 +100,10 @@ def validate(self, v: Any) -> None: Validate the given value and throw an exception with related information such as key. """ if not isinstance(v, self.types): - raise ValueError("The value for option '%s' was %s; however, expected types are " - "[%s]." % (self.key, type(v), str(self.types))) + raise ValueError( + "The value for option '%s' was %s; however, expected types are " + "[%s]." % (self.key, type(v), str(self.types)) + ) if not self.check_func[0](v): raise ValueError(self.check_func[1]) @@ -114,100 +117,114 @@ def validate(self, v: Any) -> None: # >>> show_options() _options = [ Option( - key='display.max_rows', + key="display.max_rows", doc=( "This sets the maximum number of rows koalas should output when printing out " "various output. For example, this value determines the number of rows to be " "shown at the repr() in a dataframe. Set `None` to unlimit the input length. " - "Default is 1000."), + "Default is 1000." 
+ ), default=1000, types=(int, type(None)), check_func=( lambda v: v is None or v >= 0, - "'display.max_rows' should be greater than or equal to 0.")), - + "'display.max_rows' should be greater than or equal to 0.", + ), + ), Option( - key='compute.max_rows', + key="compute.max_rows", doc=( "'compute.max_rows' sets the limit of the current DataFrame. Set `None` to unlimit " "the input length. When the limit is set, it is executed by the shortcut by " "collecting the data into driver side, and then using pandas API. If the limit is " - "unset, the operation is executed by PySpark. Default is 1000."), + "unset, the operation is executed by PySpark. Default is 1000." + ), default=1000, types=(int, type(None)), check_func=( lambda v: v is None or v >= 0, - "'compute.max_rows' should be greater than or equal to 0.")), - + "'compute.max_rows' should be greater than or equal to 0.", + ), + ), Option( - key='compute.shortcut_limit', + key="compute.shortcut_limit", doc=( "'compute.shortcut_limit' sets the limit for a shortcut. " "It computes specified number of rows and use its schema. When the dataframe " - "length is larger than this limit, Koalas uses PySpark to compute."), + "length is larger than this limit, Koalas uses PySpark to compute." + ), default=1000, types=int, check_func=( - lambda v: v >= 0, "'compute.shortcut_limit' should be greater than or equal to 0.")), - + lambda v: v >= 0, + "'compute.shortcut_limit' should be greater than or equal to 0.", + ), + ), Option( - key='compute.ops_on_diff_frames', + key="compute.ops_on_diff_frames", doc=( "This determines whether or not to operate between two different dataframes. " "For example, 'combine_frames' function internally performs a join operation which " "can be expensive in general. So, if `compute.ops_on_diff_frames` variable is not " - "True, that method throws an exception."), + "True, that method throws an exception." + ), default=False, - types=bool), - + types=bool, + ), Option( - key='compute.default_index_type', - doc=( - "This sets the default index type: sequence, distributed and distributed-sequence."), - default='sequence', + key="compute.default_index_type", + doc=("This sets the default index type: sequence, distributed and distributed-sequence."), + default="sequence", types=str, check_func=( - lambda v: v in ('sequence', 'distributed', 'distributed-sequence'), - "Index type should be one of 'sequence', 'distributed', 'distributed-sequence'.")), - + lambda v: v in ("sequence", "distributed", "distributed-sequence"), + "Index type should be one of 'sequence', 'distributed', 'distributed-sequence'.", + ), + ), Option( - key='compute.ordered_head', + key="compute.ordered_head", doc=( "'compute.ordered_head' sets whether or not to operate head with natural ordering. " "Koalas does not guarantee the row ordering so `head` could return some rows from " "distributed partitions. If 'compute.ordered_head' is set to True, Koalas performs " - "natural ordering beforehand, but it will cause a performance overhead."), + "natural ordering beforehand, but it will cause a performance overhead." + ), default=False, - types=bool), - + types=bool, + ), Option( - key='plotting.max_rows', + key="plotting.max_rows", doc=( "'plotting.max_rows' sets the visual limit on top-n-based plots such as `plot.bar` " "and `plot.pie`. If it is set to 1000, the first 1000 data points will be used " - "for plotting. Default is 1000."), + "for plotting. Default is 1000." 
+ ), default=1000, types=int, check_func=( lambda v: v is v >= 0, - "'plotting.max_rows' should be greater than or equal to 0.")), - + "'plotting.max_rows' should be greater than or equal to 0.", + ), + ), Option( - key='plotting.sample_ratio', + key="plotting.sample_ratio", doc=( "'plotting.sample_ratio' sets the proportion of data that will be plotted for sample-" "based plots such as `plot.line` and `plot.area`. " - "This option defaults to 'plotting.max_rows' option."), + "This option defaults to 'plotting.max_rows' option." + ), default=None, types=(float, type(None)), check_func=( lambda v: v is None or 1 >= v >= 0, - "'plotting.sample_ratio' should be 1 >= value >= 0.")), + "'plotting.sample_ratio' should be 1 >= value >= 0.", + ), + ), ] # type: List[Option] _options_dict = dict(zip((option.key for option in _options), _options)) # type: Dict[str, Option] -_key_format = 'koalas.{}'.format +_key_format = "koalas.{}".format class OptionError(AttributeError, KeyError): @@ -330,7 +347,7 @@ def option_context(*args): 1000 1000 """ if len(args) == 0 or len(args) % 2 != 0: - raise ValueError('Need to invoke as option_context(pat, val, [(pat, val), ...]).') + raise ValueError("Need to invoke as option_context(pat, val, [(pat, val), ...]).") opts = dict(zip(args[::2], args[1::2])) orig_opts = {key: get_option(key) for key in opts} try: @@ -346,7 +363,9 @@ def _check_option(key: str) -> None: if key not in _options_dict: raise OptionError( "No such option: '{}'. Available options are [{}]".format( - key, ", ".join(list(_options_dict.keys())))) + key, ", ".join(list(_options_dict.keys())) + ) + ) class DictWrapper: @@ -364,13 +383,16 @@ def __setattr__(self, key, val): canonical_key = prefix + key candidates = [ - k for k in d.keys() if all(x in k.split(".") for x in canonical_key.split("."))] + k for k in d.keys() if all(x in k.split(".") for x in canonical_key.split(".")) + ] if len(candidates) == 1 and candidates[0] == canonical_key: return set_option(canonical_key, val) else: raise OptionError( "No such option: '{}'. Available options are [{}]".format( - key, ", ".join(list(_options_dict.keys())))) + key, ", ".join(list(_options_dict.keys())) + ) + ) def __getattr__(self, key): prefix = object.__getattribute__(self, "prefix") @@ -380,13 +402,16 @@ def __getattr__(self, key): canonical_key = prefix + key candidates = [ - k for k in d.keys() if all(x in k.split(".") for x in canonical_key.split("."))] + k for k in d.keys() if all(x in k.split(".") for x in canonical_key.split(".")) + ] if len(candidates) == 1 and candidates[0] == canonical_key: return get_option(canonical_key) elif len(candidates) == 0: raise OptionError( "No such option: '{}'. Available options are [{}]".format( - key, ", ".join(list(_options_dict.keys())))) + key, ", ".join(list(_options_dict.keys())) + ) + ) else: return DictWrapper(d, canonical_key) @@ -398,8 +423,7 @@ def __dir__(self): candidates = d.keys() offset = 0 else: - candidates = [ - k for k in d.keys() if all(x in k.split(".") for x in prefix.split("."))] + candidates = [k for k in d.keys() if all(x in k.split(".") for x in prefix.split("."))] offset = len(prefix) + 1 # prefix (e.g. "compute.") to trim. 
return [c[offset:] for c in candidates] diff --git a/databricks/koalas/datetimes.py b/databricks/koalas/datetimes.py index cc172a5..865db05 100644 --- a/databricks/koalas/datetimes.py +++ b/databricks/koalas/datetimes.py @@ -21,9 +21,7 @@ import pandas as pd import pyspark.sql.functions as F -from pyspark.sql.types import ( - DateType, TimestampType, LongType, StringType, BooleanType -) +from pyspark.sql.types import DateType, TimestampType, LongType, StringType, BooleanType from databricks.koalas.base import _wrap_accessor_pandas, _wrap_accessor_spark @@ -34,104 +32,100 @@ class DatetimeMethods(object): """Date/Time methods for Koalas Series""" - def __init__(self, series: 'ks.Series'): + + def __init__(self, series: "ks.Series"): if not isinstance(series.spark_type, (DateType, TimestampType)): - raise ValueError( - "Cannot call DatetimeMethods on type {}" - .format(series.spark_type)) + raise ValueError("Cannot call DatetimeMethods on type {}".format(series.spark_type)) self._data = series self.name = self._data.name # Properties @property - def date(self) -> 'ks.Series': + def date(self) -> "ks.Series": """ Returns a Series of python datetime.date objects (namely, the date part of Timestamps without timezone information). """ # TODO: Hit a weird exception # syntax error in attribute name: `to_date(`start_date`)` with alias - return _wrap_accessor_spark( - self, lambda col: F.to_date(col)).alias(self.name) + return _wrap_accessor_spark(self, lambda col: F.to_date(col)).alias(self.name) @property - def time(self) -> 'ks.Series': + def time(self) -> "ks.Series": raise NotImplementedError() @property - def timetz(self) -> 'ks.Series': + def timetz(self) -> "ks.Series": raise NotImplementedError() @property - def year(self) -> 'ks.Series': + def year(self) -> "ks.Series": """ The year of the datetime. """ return _wrap_accessor_spark(self, F.year, LongType()).alias(self.name) @property - def month(self) -> 'ks.Series': + def month(self) -> "ks.Series": """ The month of the timestamp as January = 1 December = 12. """ return _wrap_accessor_spark(self, F.month, LongType()).alias(self.name) @property - def day(self) -> 'ks.Series': + def day(self) -> "ks.Series": """ The days of the datetime. """ - return _wrap_accessor_spark( - self, F.dayofmonth, LongType()).alias(self.name) + return _wrap_accessor_spark(self, F.dayofmonth, LongType()).alias(self.name) @property - def hour(self) -> 'ks.Series': + def hour(self) -> "ks.Series": """ The hours of the datetime. """ return _wrap_accessor_spark(self, F.hour, LongType()).alias(self.name) @property - def minute(self) -> 'ks.Series': + def minute(self) -> "ks.Series": """ The minutes of the datetime. """ return _wrap_accessor_spark(self, F.minute, LongType()).alias(self.name) @property - def second(self) -> 'ks.Series': + def second(self) -> "ks.Series": """ The seconds of the datetime. """ return _wrap_accessor_spark(self, F.second, LongType()).alias(self.name) @property - def microsecond(self) -> 'ks.Series': + def microsecond(self) -> "ks.Series": """ The microseconds of the datetime. """ - return _wrap_accessor_pandas( - self, lambda x: x.dt.microsecond, LongType()).alias(self.name) + return _wrap_accessor_pandas(self, lambda x: x.dt.microsecond, LongType()).alias(self.name) @property - def nanosecond(self) -> 'ks.Series': + def nanosecond(self) -> "ks.Series": raise NotImplementedError() @property - def week(self) -> 'ks.Series': + def week(self) -> "ks.Series": """ The week ordinal of the year. 
""" return _wrap_accessor_spark(self, F.weekofyear, LongType()).alias(self.name) @property - def weekofyear(self) -> 'ks.Series': + def weekofyear(self) -> "ks.Series": return self.week weekofyear.__doc__ = week.__doc__ @property - def dayofweek(self) -> 'ks.Series': + def dayofweek(self) -> "ks.Series": """ The day of the week with Monday=0, Sunday=6. @@ -166,33 +160,36 @@ def dayofweek(self) -> 'ks.Series': 2017-01-08 6 Name: 0, dtype: int64 """ - return _wrap_accessor_pandas( - self, lambda s: s.dt.dayofweek, LongType()).alias(self._data.name) + return _wrap_accessor_pandas(self, lambda s: s.dt.dayofweek, LongType()).alias( + self._data.name + ) @property - def weekday(self) -> 'ks.Series': + def weekday(self) -> "ks.Series": return self.dayofweek weekday.__doc__ = dayofweek.__doc__ @property - def dayofyear(self) -> 'ks.Series': + def dayofyear(self) -> "ks.Series": """ The ordinal day of the year. """ - return _wrap_accessor_pandas( - self, lambda s: s.dt.dayofyear, LongType()).alias(self._data.name) + return _wrap_accessor_pandas(self, lambda s: s.dt.dayofyear, LongType()).alias( + self._data.name + ) @property - def quarter(self) -> 'ks.Series': + def quarter(self) -> "ks.Series": """ The quarter of the date. """ - return _wrap_accessor_pandas( - self, lambda s: s.dt.quarter, LongType()).alias(self._data.name) + return _wrap_accessor_pandas(self, lambda s: s.dt.quarter, LongType()).alias( + self._data.name + ) @property - def is_month_start(self) -> 'ks.Series': + def is_month_start(self) -> "ks.Series": """ Indicates whether the date is the first day of the month. @@ -224,11 +221,12 @@ def is_month_start(self) -> 'ks.Series': 2 True Name: 0, dtype: bool """ - return _wrap_accessor_pandas( - self, lambda s: s.dt.is_month_start, BooleanType()).alias(self._data.name) + return _wrap_accessor_pandas(self, lambda s: s.dt.is_month_start, BooleanType()).alias( + self._data.name + ) @property - def is_month_end(self) -> 'ks.Series': + def is_month_end(self) -> "ks.Series": """ Indicates whether the date is the last day of the month. @@ -260,11 +258,12 @@ def is_month_end(self) -> 'ks.Series': 2 False Name: 0, dtype: bool """ - return _wrap_accessor_pandas( - self, lambda s: s.dt.is_month_end, BooleanType()).alias(self._data.name) + return _wrap_accessor_pandas(self, lambda s: s.dt.is_month_end, BooleanType()).alias( + self._data.name + ) @property - def is_quarter_start(self) -> 'ks.Series': + def is_quarter_start(self) -> "ks.Series": """ Indicator for whether the date is the first day of a quarter. @@ -307,11 +306,12 @@ def is_quarter_start(self) -> 'ks.Series': 3 False Name: dates, dtype: bool """ - return _wrap_accessor_pandas( - self, lambda s: s.dt.is_quarter_start, BooleanType()).alias(self._data.name) + return _wrap_accessor_pandas(self, lambda s: s.dt.is_quarter_start, BooleanType()).alias( + self._data.name + ) @property - def is_quarter_end(self) -> 'ks.Series': + def is_quarter_end(self) -> "ks.Series": """ Indicator for whether the date is the last day of a quarter. @@ -354,11 +354,12 @@ def is_quarter_end(self) -> 'ks.Series': 3 False Name: dates, dtype: bool """ - return _wrap_accessor_pandas( - self, lambda s: s.dt.is_quarter_end, BooleanType()).alias(self._data.name) + return _wrap_accessor_pandas(self, lambda s: s.dt.is_quarter_end, BooleanType()).alias( + self._data.name + ) @property - def is_year_start(self) -> 'ks.Series': + def is_year_start(self) -> "ks.Series": """ Indicate whether the date is the first day of a year. 
@@ -390,11 +391,12 @@ def is_year_start(self) -> 'ks.Series': 2 True Name: 0, dtype: bool """ - return _wrap_accessor_pandas( - self, lambda s: s.dt.is_year_start, BooleanType()).alias(self._data.name) + return _wrap_accessor_pandas(self, lambda s: s.dt.is_year_start, BooleanType()).alias( + self._data.name + ) @property - def is_year_end(self) -> 'ks.Series': + def is_year_end(self) -> "ks.Series": """ Indicate whether the date is the last day of the year. @@ -426,11 +428,12 @@ def is_year_end(self) -> 'ks.Series': 2 False Name: 0, dtype: bool """ - return _wrap_accessor_pandas( - self, lambda s: s.dt.is_year_end, BooleanType()).alias(self._data.name) + return _wrap_accessor_pandas(self, lambda s: s.dt.is_year_end, BooleanType()).alias( + self._data.name + ) @property - def is_leap_year(self) -> 'ks.Series': + def is_leap_year(self) -> "ks.Series": """ Boolean indicator if the date belongs to a leap year. @@ -462,41 +465,42 @@ def is_leap_year(self) -> 'ks.Series': 2 False Name: 0, dtype: bool """ - return _wrap_accessor_pandas( - self, lambda s: s.dt.is_leap_year, BooleanType() - ).alias(self._data.name) + return _wrap_accessor_pandas(self, lambda s: s.dt.is_leap_year, BooleanType()).alias( + self._data.name + ) @property - def daysinmonth(self) -> 'ks.Series': + def daysinmonth(self) -> "ks.Series": """ The number of days in the month. """ - return _wrap_accessor_pandas( - self, lambda s: s.dt.daysinmonth, LongType()).alias(self._data.name) + return _wrap_accessor_pandas(self, lambda s: s.dt.daysinmonth, LongType()).alias( + self._data.name + ) @property - def days_in_month(self) -> 'ks.Series': + def days_in_month(self) -> "ks.Series": return self.daysinmonth days_in_month.__doc__ = daysinmonth.__doc__ # Methods - def tz_localize(self, tz) -> 'ks.Series': + def tz_localize(self, tz) -> "ks.Series": """ Localize tz-naive Datetime column to tz-aware Datetime column. """ # Neither tz-naive or tz-aware datetime exists in Spark raise NotImplementedError() - def tz_convert(self, tz) -> 'ks.Series': + def tz_convert(self, tz) -> "ks.Series": """ Convert tz-aware Datetime column from one time zone to another. """ # tz-aware datetime doesn't exist in Spark raise NotImplementedError() - def normalize(self) -> 'ks.Series': + def normalize(self) -> "ks.Series": """ Convert times to midnight. @@ -528,13 +532,11 @@ def normalize(self) -> 'ks.Series': 2 2012-03-31 Name: 0, dtype: datetime64[ns] """ - return _wrap_accessor_pandas( - self, - lambda x: x.dt.normalize(), - TimestampType() - ).alias(self.name) + return _wrap_accessor_pandas(self, lambda x: x.dt.normalize(), TimestampType()).alias( + self.name + ) - def strftime(self, date_format) -> 'ks.Series': + def strftime(self, date_format) -> "ks.Series": """ Convert to a string Series using specified date_format. @@ -577,12 +579,10 @@ def strftime(self, date_format) -> 'ks.Series': Name: 0, dtype: object """ return _wrap_accessor_pandas( - self, - lambda x: x.dt.strftime(date_format), - StringType() + self, lambda x: x.dt.strftime(date_format), StringType() ).alias(self.name) - def round(self, freq, *args, **kwargs) -> 'ks.Series': + def round(self, freq, *args, **kwargs) -> "ks.Series": """ Perform round operation on the data to the specified freq. 
@@ -632,12 +632,10 @@ def round(self, freq, *args, **kwargs) -> 'ks.Series': Name: 0, dtype: datetime64[ns] """ return _wrap_accessor_pandas( - self, - lambda x: x.dt.round(freq, *args, **kwargs), - TimestampType() + self, lambda x: x.dt.round(freq, *args, **kwargs), TimestampType() ).alias(self.name) - def floor(self, freq, *args, **kwargs) -> 'ks.Series': + def floor(self, freq, *args, **kwargs) -> "ks.Series": """ Perform floor operation on the data to the specified freq. @@ -686,13 +684,11 @@ def floor(self, freq, *args, **kwargs) -> 'ks.Series': 2 2018-01-01 12:00:00 Name: 0, dtype: datetime64[ns] """ - return _wrap_accessor_pandas( - self, - lambda x: x.dt.floor(freq), - TimestampType() - ).alias(self.name) + return _wrap_accessor_pandas(self, lambda x: x.dt.floor(freq), TimestampType()).alias( + self.name + ) - def ceil(self, freq, *args, **kwargs) -> 'ks.Series': + def ceil(self, freq, *args, **kwargs) -> "ks.Series": """ Perform ceil operation on the data to the specified freq. @@ -741,13 +737,11 @@ def ceil(self, freq, *args, **kwargs) -> 'ks.Series': 2 2018-01-01 13:00:00 Name: 0, dtype: datetime64[ns] """ - return _wrap_accessor_pandas( - self, - lambda x: x.dt.ceil(freq), - TimestampType() - ).alias(self.name) + return _wrap_accessor_pandas(self, lambda x: x.dt.ceil(freq), TimestampType()).alias( + self.name + ) - def month_name(self, locale=None) -> 'ks.Series': + def month_name(self, locale=None) -> "ks.Series": """ Return the month names of the series with specified locale. @@ -777,13 +771,11 @@ def month_name(self, locale=None) -> 'ks.Series': 2 March Name: 0, dtype: object """ - return _wrap_accessor_pandas( - self, - lambda x: x.dt.month_name(locale), - StringType() - ).alias(self.name) + return _wrap_accessor_pandas(self, lambda x: x.dt.month_name(locale), StringType()).alias( + self.name + ) - def day_name(self, locale=None) -> 'ks.Series': + def day_name(self, locale=None) -> "ks.Series": """ Return the day names of the series with specified locale. 
@@ -813,8 +805,6 @@ def day_name(self, locale=None) -> 'ks.Series': 2 Wednesday Name: 0, dtype: object """ - return _wrap_accessor_pandas( - self, - lambda x: x.dt.day_name(locale), - StringType() - ).alias(self.name) + return _wrap_accessor_pandas(self, lambda x: x.dt.day_name(locale), StringType()).alias( + self.name + ) diff --git a/databricks/koalas/exceptions.py b/databricks/koalas/exceptions.py index 58f2cd9..96268db 100644 --- a/databricks/koalas/exceptions.py +++ b/databricks/koalas/exceptions.py @@ -25,19 +25,21 @@ class SparkPandasIndexingError(Exception): def code_change_hint(pandas_function, spark_target_function): if pandas_function is not None and spark_target_function is not None: - return "You are trying to use pandas function {}, use spark function {}" \ - .format(pandas_function, spark_target_function) + return "You are trying to use pandas function {}, use spark function {}".format( + pandas_function, spark_target_function + ) elif pandas_function is not None and spark_target_function is None: - return ("You are trying to use pandas function {}, checkout the spark " - "user guide to find a relevant function").format(pandas_function) + return ( + "You are trying to use pandas function {}, checkout the spark " + "user guide to find a relevant function" + ).format(pandas_function) elif pandas_function is None and spark_target_function is not None: return "Use spark function {}".format(spark_target_function) - else: # both none + else: # both none return "Checkout the spark user guide to find a relevant function" class SparkPandasNotImplementedError(NotImplementedError): - def __init__(self, pandas_function=None, spark_target_function=None, description=""): self.pandas_source = pandas_function self.spark_target = spark_target_function @@ -50,39 +52,50 @@ def __init__(self, pandas_function=None, spark_target_function=None, description class PandasNotImplementedError(NotImplementedError): - - def __init__(self, class_name, method_name=None, arg_name=None, property_name=None, - deprecated=False, reason=""): + def __init__( + self, + class_name, + method_name=None, + arg_name=None, + property_name=None, + deprecated=False, + reason="", + ): assert (method_name is None) != (property_name is None) self.class_name = class_name self.method_name = method_name self.arg_name = arg_name if method_name is not None: if arg_name is not None: - msg = "The method `{0}.{1}()` does not support `{2}` parameter. {3}" \ - .format(class_name, method_name, arg_name, reason) + msg = "The method `{0}.{1}()` does not support `{2}` parameter. {3}".format( + class_name, method_name, arg_name, reason + ) else: if deprecated: - msg = ("The method `{0}.{1}()` is deprecated in pandas and will therefore " + - "not be supported in Koalas. {2}") \ - .format(class_name, method_name, reason) + msg = ( + "The method `{0}.{1}()` is deprecated in pandas and will therefore " + + "not be supported in Koalas. {2}" + ).format(class_name, method_name, reason) else: if reason == "": reason = " yet." else: reason = ". " + reason - msg = "The method `{0}.{1}()` is not implemented{2}" \ - .format(class_name, method_name, reason) + msg = "The method `{0}.{1}()` is not implemented{2}".format( + class_name, method_name, reason + ) else: if deprecated: - msg = ("The property `{0}.{1}()` is deprecated in pandas and will therefore " + - "not be supported in Koalas. 
{2}") \ - .format(class_name, property_name, reason) + msg = ( + "The property `{0}.{1}()` is deprecated in pandas and will therefore " + + "not be supported in Koalas. {2}" + ).format(class_name, property_name, reason) else: if reason == "": reason = " yet." else: reason = ". " + reason - msg = "The property `{0}.{1}()` is not implemented{2}" \ - .format(class_name, property_name, reason) + msg = "The property `{0}.{1}()` is not implemented{2}".format( + class_name, property_name, reason + ) super(NotImplementedError, self).__init__(msg) diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py index d3e9a2c..0209a2a 100644 --- a/databricks/koalas/frame.py +++ b/databricks/koalas/frame.py @@ -31,7 +31,8 @@ import numpy as np import pandas as pd from pandas.api.types import is_list_like, is_dict_like -if LooseVersion(pd.__version__) >= LooseVersion('0.24'): + +if LooseVersion(pd.__version__) >= LooseVersion("0.24"): from pandas.core.dtypes.common import infer_dtype_from_object else: from pandas.core.dtypes.common import _get_dtype_from_object as infer_dtype_from_object @@ -40,17 +41,35 @@ from pyspark import sql as spark from pyspark.sql import functions as F, Column from pyspark.sql.functions import pandas_udf -from pyspark.sql.types import (BooleanType, ByteType, DecimalType, DoubleType, FloatType, - IntegerType, LongType, NumericType, ShortType, StructType, - StructField) +from pyspark.sql.types import ( + BooleanType, + ByteType, + DecimalType, + DoubleType, + FloatType, + IntegerType, + LongType, + NumericType, + ShortType, + StructType, + StructField, +) from pyspark.sql.window import Window from databricks import koalas as ks # For running doctests and reference resolution in PyCharm. -from databricks.koalas.utils import (validate_arguments_and_invoke_function, align_diff_frames, - validate_bool_kwarg) +from databricks.koalas.utils import ( + validate_arguments_and_invoke_function, + align_diff_frames, + validate_bool_kwarg, +) from databricks.koalas.generic import _Frame -from databricks.koalas.internal import (_InternalFrame, HIDDEN_COLUMNS, NATURAL_ORDER_COLUMN_NAME, - SPARK_INDEX_NAME_FORMAT, SPARK_DEFAULT_INDEX_NAME) +from databricks.koalas.internal import ( + _InternalFrame, + HIDDEN_COLUMNS, + NATURAL_ORDER_COLUMN_NAME, + SPARK_INDEX_NAME_FORMAT, + SPARK_DEFAULT_INDEX_NAME, +) from databricks.koalas.missing.frame import _MissingPandasLikeDataFrame from databricks.koalas.ml import corr from databricks.koalas.utils import column_labels_level, name_like_string, scol_for, validate_axis @@ -63,7 +82,8 @@ # Two patterns basically seek the footer string from Pandas' REPR_PATTERN = re.compile(r"\n\n\[(?P[0-9]+) rows x (?P[0-9]+) columns\]$") REPR_HTML_PATTERN = re.compile( - r"\n\(?P[0-9]+) rows × (?P[0-9]+) columns\<\/p\>\n\<\/div\>$") + r"\n\(?P[0-9]+) rows × (?P[0-9]+) columns\<\/p\>\n\<\/div\>$" +) _flex_doc_FRAME = """ @@ -256,7 +276,7 @@ rectangle 16.0 2.348543e+108 """ -T = TypeVar('T') +T = TypeVar("T") if (3, 5) <= sys.version_info < (3, 7): @@ -350,6 +370,7 @@ class DataFrame(_Frame, Generic[T]): 3 8 7 9 1 0 4 2 5 4 3 9 """ + def __init__(self, data=None, index=None, columns=None, dtype=None, copy=False): if isinstance(data, _InternalFrame): assert index is None @@ -452,7 +473,7 @@ def _reduce_for_stat_function(self, sfun, name, axis=None, numeric_only=False): col_type = self._internal.spark_type_for(label) is_numeric_or_boolean = isinstance(col_type, (NumericType, BooleanType)) - min_or_max = sfun.__name__ in ('min', 'max') + min_or_max = sfun.__name__ in 
("min", "max") keep_column = not numeric_only or is_numeric_or_boolean or min_or_max if keep_column: @@ -460,7 +481,7 @@ def _reduce_for_stat_function(self, sfun, name, axis=None, numeric_only=False): # Stat functions cannot be used with boolean values by default # Thus, cast to integer (true to 1 and false to 0) # Exclude the min and max methods though since those work with booleans - col_sdf = col_sdf.cast('integer') + col_sdf = col_sdf.cast("integer") if num_args == 1: # Only pass in the column if sfun accepts only one arg col_sdf = sfun(col_sdf) @@ -512,15 +533,18 @@ def _apply_series_op(self, op): # Arithmetic Operators def _map_series_op(self, op, other): from databricks.koalas.base import IndexOpsMixin - if not isinstance(other, DataFrame) and (isinstance(other, IndexOpsMixin) or - is_sequence(other)): + + if not isinstance(other, DataFrame) and ( + isinstance(other, IndexOpsMixin) or is_sequence(other) + ): raise ValueError( "%s with a sequence is currently not supported; " - "however, got %s." % (op, type(other))) + "however, got %s." % (op, type(other)) + ) if isinstance(other, DataFrame) and self is not other: if self._internal.column_labels_level != other._internal.column_labels_level: - raise ValueError('cannot join with no overlapping index names') + raise ValueError("cannot join with no overlapping index names") # Different DataFrames def apply_op(kdf, this_column_labels, that_column_labels): @@ -600,28 +624,22 @@ def kde(self, bw_method=None, ind=None, **kwds): kde.__doc__ = KoalasFramePlotMethods.kde.__doc__ add.__doc__ = _flex_doc_FRAME.format( - desc='Addition', - op_name='+', - equiv='dataframe + other', - reverse='radd') + desc="Addition", op_name="+", equiv="dataframe + other", reverse="radd" + ) def radd(self, other): return other + self radd.__doc__ = _flex_doc_FRAME.format( - desc='Addition', - op_name="+", - equiv="other + dataframe", - reverse='add') + desc="Addition", op_name="+", equiv="other + dataframe", reverse="add" + ) def div(self, other): return self / other div.__doc__ = _flex_doc_FRAME.format( - desc='Floating division', - op_name="/", - equiv="dataframe / other", - reverse='rdiv') + desc="Floating division", op_name="/", equiv="dataframe / other", reverse="rdiv" + ) divide = div @@ -629,37 +647,29 @@ def rdiv(self, other): return other / self rdiv.__doc__ = _flex_doc_FRAME.format( - desc='Floating division', - op_name="/", - equiv="other / dataframe", - reverse='div') + desc="Floating division", op_name="/", equiv="other / dataframe", reverse="div" + ) def truediv(self, other): return self / other truediv.__doc__ = _flex_doc_FRAME.format( - desc='Floating division', - op_name="/", - equiv="dataframe / other", - reverse='rtruediv') + desc="Floating division", op_name="/", equiv="dataframe / other", reverse="rtruediv" + ) def rtruediv(self, other): return other / self rtruediv.__doc__ = _flex_doc_FRAME.format( - desc='Floating division', - op_name="/", - equiv="other / dataframe", - reverse='truediv') + desc="Floating division", op_name="/", equiv="other / dataframe", reverse="truediv" + ) def mul(self, other): return self * other mul.__doc__ = _flex_doc_FRAME.format( - desc='Multiplication', - op_name="*", - equiv="dataframe * other", - reverse='rmul') + desc="Multiplication", op_name="*", equiv="dataframe * other", reverse="rmul" + ) multiply = mul @@ -667,19 +677,15 @@ def rmul(self, other): return other * self rmul.__doc__ = _flex_doc_FRAME.format( - desc='Multiplication', - op_name="*", - equiv="other * dataframe", - reverse='mul') + 
desc="Multiplication", op_name="*", equiv="other * dataframe", reverse="mul" + ) def sub(self, other): return self - other sub.__doc__ = _flex_doc_FRAME.format( - desc='Subtraction', - op_name="-", - equiv="dataframe - other", - reverse='rsub') + desc="Subtraction", op_name="-", equiv="dataframe - other", reverse="rsub" + ) subtract = sub @@ -687,64 +693,50 @@ def rsub(self, other): return other - self rsub.__doc__ = _flex_doc_FRAME.format( - desc='Subtraction', - op_name="-", - equiv="other - dataframe", - reverse='sub') + desc="Subtraction", op_name="-", equiv="other - dataframe", reverse="sub" + ) def mod(self, other): return self % other mod.__doc__ = _flex_doc_FRAME.format( - desc='Modulo', - op_name='%', - equiv='dataframe % other', - reverse='rmod') + desc="Modulo", op_name="%", equiv="dataframe % other", reverse="rmod" + ) def rmod(self, other): return other % self rmod.__doc__ = _flex_doc_FRAME.format( - desc='Modulo', - op_name='%', - equiv='other % dataframe', - reverse='mod') + desc="Modulo", op_name="%", equiv="other % dataframe", reverse="mod" + ) def pow(self, other): return self ** other pow.__doc__ = _flex_doc_FRAME.format( - desc='Exponential power of series', - op_name='**', - equiv='dataframe ** other', - reverse='rpow') + desc="Exponential power of series", op_name="**", equiv="dataframe ** other", reverse="rpow" + ) def rpow(self, other): return other ** self rpow.__doc__ = _flex_doc_FRAME.format( - desc='Exponential power', - op_name='**', - equiv='other ** dataframe', - reverse='pow') + desc="Exponential power", op_name="**", equiv="other ** dataframe", reverse="pow" + ) def floordiv(self, other): return self // other floordiv.__doc__ = _flex_doc_FRAME.format( - desc='Integer division', - op_name='//', - equiv='dataframe // other', - reverse='rfloordiv') + desc="Integer division", op_name="//", equiv="dataframe // other", reverse="rfloordiv" + ) def rfloordiv(self, other): return other // self rfloordiv.__doc__ = _flex_doc_FRAME.format( - desc='Integer division', - op_name='//', - equiv='other // dataframe', - reverse='floordiv') + desc="Integer division", op_name="//", equiv="other // dataframe", reverse="floordiv" + ) # Comparison Operators def __eq__(self, other): @@ -990,19 +982,26 @@ def aggregate(self, func: Union[List[str], Dict[str, List[str]]]): if isinstance(func, list): if all((isinstance(f, str) for f in func)): - func = dict([ - (column, func) for column in self.columns]) + func = dict([(column, func) for column in self.columns]) else: - raise ValueError("If the given function is a list, it " - "should only contains function names as strings.") - - if not isinstance(func, dict) or \ - not all(isinstance(key, str) and - (isinstance(value, str) or - isinstance(value, list) and all(isinstance(v, str) for v in value)) - for key, value in func.items()): - raise ValueError("aggs must be a dict mapping from column name (string) to aggregate " - "functions (list of strings).") + raise ValueError( + "If the given function is a list, it " + "should only contains function names as strings." + ) + + if not isinstance(func, dict) or not all( + isinstance(key, str) + and ( + isinstance(value, str) + or isinstance(value, list) + and all(isinstance(v, str) for v in value) + ) + for key, value in func.items() + ): + raise ValueError( + "aggs must be a dict mapping from column name (string) to aggregate " + "functions (list of strings)." 
+ ) kdf = DataFrame(GroupBy._spark_groupby(self, func)) # type: DataFrame @@ -1028,7 +1027,7 @@ def aggregate(self, func: Union[List[str], Dict[str, List[str]]]): agg = aggregate - def corr(self, method='pearson'): + def corr(self, method="pearson"): """ Compute pairwise correlation of columns, excluding NA/null values. @@ -1161,8 +1160,11 @@ def iterrows(self): internal_data_columns = self._internal.data_columns def extract_kv_from_spark_row(row): - k = row[internal_index_columns[0]] if len(internal_index_columns) == 1 else tuple( - row[c] for c in internal_index_columns) + k = ( + row[internal_index_columns[0]] + if len(internal_index_columns) == 1 + else tuple(row[c] for c in internal_index_columns) + ) v = [row[c] for c in internal_data_columns] return k, v @@ -1246,13 +1248,34 @@ def to_clipboard(self, excel=True, sep=None, **kwargs): args = locals() kdf = self return validate_arguments_and_invoke_function( - kdf._to_internal_pandas(), self.to_clipboard, pd.DataFrame.to_clipboard, args) - - def to_html(self, buf=None, columns=None, col_space=None, header=True, index=True, - na_rep='NaN', formatters=None, float_format=None, sparsify=None, index_names=True, - justify=None, max_rows=None, max_cols=None, show_dimensions=False, decimal='.', - bold_rows=True, classes=None, escape=True, notebook=False, border=None, - table_id=None, render_links=False): + kdf._to_internal_pandas(), self.to_clipboard, pd.DataFrame.to_clipboard, args + ) + + def to_html( + self, + buf=None, + columns=None, + col_space=None, + header=True, + index=True, + na_rep="NaN", + formatters=None, + float_format=None, + sparsify=None, + index_names=True, + justify=None, + max_rows=None, + max_cols=None, + show_dimensions=False, + decimal=".", + bold_rows=True, + classes=None, + escape=True, + notebook=False, + border=None, + table_id=None, + render_links=False, + ): """ Render a DataFrame as an HTML table. @@ -1345,13 +1368,28 @@ def to_html(self, buf=None, columns=None, col_space=None, header=True, index=Tru kdf = self return validate_arguments_and_invoke_function( - kdf._to_internal_pandas(), self.to_html, pd.DataFrame.to_html, args) - - def to_string(self, buf=None, columns=None, col_space=None, header=True, - index=True, na_rep='NaN', formatters=None, float_format=None, - sparsify=None, index_names=True, justify=None, - max_rows=None, max_cols=None, show_dimensions=False, - decimal='.', line_width=None): + kdf._to_internal_pandas(), self.to_html, pd.DataFrame.to_html, args + ) + + def to_string( + self, + buf=None, + columns=None, + col_space=None, + header=True, + index=True, + na_rep="NaN", + formatters=None, + float_format=None, + sparsify=None, + index_names=True, + justify=None, + max_rows=None, + max_cols=None, + show_dimensions=False, + decimal=".", + line_width=None, + ): """ Render a DataFrame to a console-friendly tabular output. @@ -1445,9 +1483,10 @@ def to_string(self, buf=None, columns=None, col_space=None, header=True, kdf = self return validate_arguments_and_invoke_function( - kdf._to_internal_pandas(), self.to_string, pd.DataFrame.to_string, args) + kdf._to_internal_pandas(), self.to_string, pd.DataFrame.to_string, args + ) - def to_dict(self, orient='dict', into=dict): + def to_dict(self, orient="dict", into=dict): """ Convert the DataFrame to a dictionary. 
@@ -1541,12 +1580,31 @@ def to_dict(self, orient='dict', into=dict): args = locals() kdf = self return validate_arguments_and_invoke_function( - kdf._to_internal_pandas(), self.to_dict, pd.DataFrame.to_dict, args) - - def to_latex(self, buf=None, columns=None, col_space=None, header=True, index=True, - na_rep='NaN', formatters=None, float_format=None, sparsify=None, index_names=True, - bold_rows=False, column_format=None, longtable=None, escape=None, encoding=None, - decimal='.', multicolumn=None, multicolumn_format=None, multirow=None): + kdf._to_internal_pandas(), self.to_dict, pd.DataFrame.to_dict, args + ) + + def to_latex( + self, + buf=None, + columns=None, + col_space=None, + header=True, + index=True, + na_rep="NaN", + formatters=None, + float_format=None, + sparsify=None, + index_names=True, + bold_rows=False, + column_format=None, + longtable=None, + escape=None, + encoding=None, + decimal=".", + multicolumn=None, + multicolumn_format=None, + multirow=None, + ): r""" Render an object to a LaTeX tabular environment table. @@ -1640,7 +1698,8 @@ def to_latex(self, buf=None, columns=None, col_space=None, header=True, index=Tr args = locals() kdf = self return validate_arguments_and_invoke_function( - kdf._to_internal_pandas(), self.to_latex, pd.DataFrame.to_latex, args) + kdf._to_internal_pandas(), self.to_latex, pd.DataFrame.to_latex, args + ) # TODO: enable doctests once we drop Spark 2.3.x (due to type coercion logic # when creating arrays) @@ -1753,8 +1812,10 @@ def transpose(self): "Current DataFrame has more then the given limit {0} rows. " "Please set 'compute.max_rows' by using 'databricks.koalas.config.set_option' " "to retrieve to retrieve more than {0} rows. Note that, before changing the " - "'compute.max_rows', this operation is considerably expensive." - .format(max_compute_count)) + "'compute.max_rows', this operation is considerably expensive.".format( + max_compute_count + ) + ) return DataFrame(pdf.transpose()) # Explode the data to be pairs. @@ -1784,36 +1845,55 @@ def transpose(self): # |{"a":["y3","z3"]}| a| x2| 2| # |{"a":["y3","z3"]}| b| x3| 1| # +-----------------+-----------------+-----------------+-----+ - pairs = F.explode(F.array(*[ - F.struct( - [F.lit(col).alias(SPARK_INDEX_NAME_FORMAT(i)) for i, col in enumerate(label)] + - [self[label]._scol.alias("value")] - ) for label in self._internal.column_labels])) + pairs = F.explode( + F.array( + *[ + F.struct( + [ + F.lit(col).alias(SPARK_INDEX_NAME_FORMAT(i)) + for i, col in enumerate(label) + ] + + [self[label]._scol.alias("value")] + ) + for label in self._internal.column_labels + ] + ) + ) exploded_df = self._sdf.withColumn("pairs", pairs).select( - [F.to_json(F.struct(F.array([scol.cast('string') - for scol in self._internal.index_scols]) - .alias('a'))).alias('index'), - F.col("pairs.*")]) + [ + F.to_json( + F.struct( + F.array([scol.cast("string") for scol in self._internal.index_scols]).alias( + "a" + ) + ) + ).alias("index"), + F.col("pairs.*"), + ] + ) # After that, executes pivot with key and its index column. # Note that index column should contain unique values since column names # should be unique. 
- internal_index_columns = [SPARK_INDEX_NAME_FORMAT(i) - for i in range(self._internal.column_labels_level)] - pivoted_df = exploded_df.groupBy(internal_index_columns).pivot('index') + internal_index_columns = [ + SPARK_INDEX_NAME_FORMAT(i) for i in range(self._internal.column_labels_level) + ] + pivoted_df = exploded_df.groupBy(internal_index_columns).pivot("index") transposed_df = pivoted_df.agg(F.first(F.col("value"))) - new_data_columns = list(filter(lambda x: x not in internal_index_columns, - transposed_df.columns)) + new_data_columns = list( + filter(lambda x: x not in internal_index_columns, transposed_df.columns) + ) internal = self._internal.copy( sdf=transposed_df, index_map=[(col, None) for col in internal_index_columns], - column_labels=[tuple(json.loads(col)['a']) for col in new_data_columns], + column_labels=[tuple(json.loads(col)["a"]) for col in new_data_columns], column_scols=[scol_for(transposed_df, col) for col in new_data_columns], - column_label_names=None) + column_label_names=None, + ) return DataFrame(internal) @@ -1918,7 +1998,8 @@ def map_in_pandas(self, func): if not isinstance(applied, pd.DataFrame): raise ValueError( "The given function should return a frame; however, " - "the return type was %s." % type(applied)) + "the return type was %s." % type(applied) + ) kdf = ks.DataFrame(applied) if len(pdf) <= limit: return kdf @@ -1926,8 +2007,8 @@ def map_in_pandas(self, func): return_schema = kdf._internal._sdf.drop(*HIDDEN_COLUMNS).schema sdf = GroupBy._spark_group_map_apply( - self, func, (F.spark_partition_id(),), - return_schema, retain_index=True) + self, func, (F.spark_partition_id(),), return_schema, retain_index=True + ) # If schema is inferred, we can restore indexes too. internal = kdf._internal.with_new_sdf(sdf) @@ -1937,11 +2018,12 @@ def map_in_pandas(self, func): if not is_return_dataframe: raise TypeError( "The given function should specify a frame as its type " - "hints; however, the return type was %s." % return_sig) + "hints; however, the return type was %s." % return_sig + ) sdf = GroupBy._spark_group_map_apply( - self, func, (F.spark_partition_id(),), - return_schema, retain_index=False) + self, func, (F.spark_partition_id(),), return_schema, retain_index=False + ) # Otherwise, it loses index. internal = _InternalFrame(sdf=sdf, index_map=None) @@ -2128,8 +2210,8 @@ def apply_func(pdf): return_schema = kdf._internal._sdf.drop(*HIDDEN_COLUMNS).schema sdf = GroupBy._spark_group_map_apply( - self, apply_func, (F.spark_partition_id(),), - return_schema, retain_index=True) + self, apply_func, (F.spark_partition_id(),), return_schema, retain_index=True + ) # If schema is inferred, we can restore indexes too. internal = kdf._internal.with_new_sdf(sdf) @@ -2142,7 +2224,8 @@ def apply_func(pdf): raise TypeError( "The given function should specify a scalar or a series as its type " "hints when axis is 0 or 'index'; however, the return type " - "was %s" % return_sig) + "was %s" % return_sig + ) fields_types = zip(self.columns, [return_schema] * len(self.columns)) return_schema = StructType([StructField(c, t) for c, t in fields_types]) elif require_column_axis: @@ -2150,15 +2233,16 @@ def apply_func(pdf): raise TypeError( "The given function should specify a scalar or a frame as its type " "hints when axis is 1 or 'column'; however, the return type " - "was %s" % return_sig) + "was %s" % return_sig + ) else: # any axis is fine. 
should_return_series = True return_schema = StructType([StructField("0", return_schema)]) sdf = GroupBy._spark_group_map_apply( - self, apply_func, (F.spark_partition_id(),), - return_schema, retain_index=False) + self, apply_func, (F.spark_partition_id(),), return_schema, retain_index=False + ) # Otherwise, it loses index. internal = _InternalFrame(sdf=sdf, index_map=None) @@ -2268,10 +2352,11 @@ def transform(self, func): applied = [] for input_label, output_label in zip( - self._internal.column_labels, kdf._internal.column_labels): + self._internal.column_labels, kdf._internal.column_labels + ): wrapped = ks.pandas_wraps( - func, - return_col=as_python_type(kdf[output_label].spark_type)) + func, return_col=as_python_type(kdf[output_label].spark_type) + ) applied.append(wrapped(self[input_label]).rename(input_label)) internal = self._internal.with_new_columns(applied) @@ -2424,11 +2509,14 @@ class locomotion mammal walks 4 0 """ from databricks.koalas.series import _col + if not isinstance(key, (str, tuple)): raise ValueError("'key' should be string or tuple that contains strings") if not all(isinstance(index, str) for index in key): - raise ValueError("'key' should have index names as only strings " - "or a tuple that contain index names as only strings") + raise ValueError( + "'key' should have index names as only strings " + "or a tuple that contain index names as only strings" + ) axis = validate_axis(axis) if axis != 0: @@ -2436,18 +2524,22 @@ class locomotion if isinstance(key, str): key = (key,) if len(key) > len(self._internal.index_scols): - raise KeyError("Key length ({}) exceeds index depth ({})" - .format(len(key), len(self._internal.index_scols))) + raise KeyError( + "Key length ({}) exceeds index depth ({})".format( + len(key), len(self._internal.index_scols) + ) + ) if level is None: level = 0 - scols = self._internal.scols[:level] + self._internal.scols[level+len(key):] - rows = [self._internal.scols[lvl] == index - for lvl, index in enumerate(key, level)] + scols = self._internal.scols[:level] + self._internal.scols[level + len(key) :] + rows = [self._internal.scols[lvl] == index for lvl, index in enumerate(key, level)] - sdf = self._sdf.select(scols + list(HIDDEN_COLUMNS)) \ - .drop(NATURAL_ORDER_COLUMN_NAME) \ + sdf = ( + self._sdf.select(scols + list(HIDDEN_COLUMNS)) + .drop(NATURAL_ORDER_COLUMN_NAME) .filter(reduce(lambda x, y: x & y, rows)) + ) if len(key) == len(self._internal.index_scols): result = _col(DataFrame(_InternalFrame(sdf=sdf, index_map=None)).T) @@ -2455,8 +2547,9 @@ class locomotion else: internal = self._internal.copy( sdf=sdf, - index_map=self._internal.index_map[:level] + - self._internal.index_map[level+len(key):]) + index_map=self._internal.index_map[:level] + + self._internal.index_map[level + len(key) :], + ) result = DataFrame(internal) return result @@ -2570,18 +2663,25 @@ def where(self, cond, other=np.nan): """ from databricks.koalas.series import Series - tmp_cond_col_name = '__tmp_cond_col_{}__'.format - tmp_other_col_name = '__tmp_other_col_{}__'.format + tmp_cond_col_name = "__tmp_cond_col_{}__".format + tmp_other_col_name = "__tmp_other_col_{}__".format kdf = self.copy() - tmp_cond_col_names = [tmp_cond_col_name(name_like_string(label)) - for label in self._internal.column_labels] + tmp_cond_col_names = [ + tmp_cond_col_name(name_like_string(label)) for label in self._internal.column_labels + ] if isinstance(cond, DataFrame): - cond = cond[[(cond._internal.scol_for(label) - if label in cond._internal.column_labels else 
F.lit(False)).alias(name) - for label, name - in zip(self._internal.column_labels, tmp_cond_col_names)]] + cond = cond[ + [ + ( + cond._internal.scol_for(label) + if label in cond._internal.column_labels + else F.lit(False) + ).alias(name) + for label, name in zip(self._internal.column_labels, tmp_cond_col_names) + ] + ] kdf[tmp_cond_col_names] = cond elif isinstance(cond, Series): cond = cond.to_frame() @@ -2590,19 +2690,26 @@ def where(self, cond, other=np.nan): else: raise ValueError("type of cond must be a DataFrame or Series") - tmp_other_col_names = [tmp_other_col_name(name_like_string(label)) - for label in self._internal.column_labels] + tmp_other_col_names = [ + tmp_other_col_name(name_like_string(label)) for label in self._internal.column_labels + ] if isinstance(other, DataFrame): - other = other[[(other._internal.scol_for(label) - if label in other._internal.column_labels else F.lit(np.nan)) - .alias(name) - for label, name - in zip(self._internal.column_labels, tmp_other_col_names)]] + other = other[ + [ + ( + other._internal.scol_for(label) + if label in other._internal.column_labels + else F.lit(np.nan) + ).alias(name) + for label, name in zip(self._internal.column_labels, tmp_other_col_names) + ] + ] kdf[tmp_other_col_names] = other elif isinstance(other, Series): other = other.to_frame() - other = other[[other._internal.column_scols[0].alias(name) - for name in tmp_other_col_names]] + other = other[ + [other._internal.column_scols[0].alias(name) for name in tmp_other_col_names] + ] kdf[tmp_other_col_names] = other else: for label in self._internal.column_labels: @@ -2622,15 +2729,14 @@ def where(self, cond, other=np.nan): column_scols = [] for label in self._internal.column_labels: column_scols.append( - F.when( - kdf[tmp_cond_col_name(name_like_string(label))]._scol, - kdf[label]._scol - ).otherwise( - kdf[tmp_other_col_name(name_like_string(label))]._scol - ).alias(kdf._internal.column_name_for(label))) + F.when(kdf[tmp_cond_col_name(name_like_string(label))]._scol, kdf[label]._scol) + .otherwise(kdf[tmp_other_col_name(name_like_string(label))]._scol) + .alias(kdf._internal.column_name_for(label)) + ) - return DataFrame(kdf._internal.with_new_columns(column_scols, - column_labels=self._internal.column_labels)) + return DataFrame( + kdf._internal.with_new_columns(column_scols, column_labels=self._internal.column_labels) + ) def mask(self, cond, other=np.nan): """ @@ -2705,6 +2811,7 @@ def mask(self, cond, other=np.nan): >>> reset_option("compute.ops_on_diff_frames") """ from databricks.koalas.series import Series + if not isinstance(cond, (DataFrame, Series)): raise ValueError("type of cond must be a DataFrame or Series") @@ -2722,6 +2829,7 @@ def index(self): Index """ from databricks.koalas.indexes import Index, MultiIndex + if len(self._internal.index_map) == 1: return Index(self) else: @@ -2759,11 +2867,10 @@ def style(self): >>> ks.range(1001).style # doctest: +ELLIPSIS """ - max_results = get_option('compute.max_rows') + max_results = get_option("compute.max_rows") pdf = self.head(max_results + 1).to_pandas() if len(pdf) > max_results: - warnings.warn( - "'style' property will only use top %s rows." % max_results, UserWarning) + warnings.warn("'style' property will only use top %s rows." 
% max_results, UserWarning) return pdf.head(max_results).style def set_index(self, keys, drop=True, append=False, inplace=False): @@ -2845,22 +2952,24 @@ def set_index(self, keys, drop=True, append=False, inplace=False): else: column_labels = self._internal.column_labels if append: - index_map = self._internal.index_map + [(self._internal.column_name_for(label), label) - for label in keys] + index_map = self._internal.index_map + [ + (self._internal.column_name_for(label), label) for label in keys + ] else: index_map = [(self._internal.column_name_for(label), label) for label in keys] - internal = self._internal.copy(index_map=index_map, - column_labels=column_labels, - column_scols=[self._internal.scol_for(label) - for label in column_labels]) + internal = self._internal.copy( + index_map=index_map, + column_labels=column_labels, + column_scols=[self._internal.scol_for(label) for label in column_labels], + ) if inplace: self._internal = internal else: return DataFrame(internal) - def reset_index(self, level=None, drop=False, inplace=False, col_level=0, col_fill=''): + def reset_index(self, level=None, drop=False, inplace=False, col_level=0, col_fill=""): """Reset the index, or a level of it. For DataFrame with multi-level index, return new DataFrame with labeling information in @@ -3008,16 +3117,18 @@ class max type def rename(index): if multi_index: - return ('level_{}'.format(index),) + return ("level_{}".format(index),) else: - if ('index',) not in self._internal.column_labels: - return ('index',) + if ("index",) not in self._internal.column_labels: + return ("index",) else: - return ('level_{}'.format(index),) + return ("level_{}".format(index),) if level is None: - new_index_map = [(column, name if name is not None else rename(i)) - for i, (column, name) in enumerate(self._internal.index_map)] + new_index_map = [ + (column, name if name is not None else rename(i)) + for i, (column, name) in enumerate(self._internal.index_map) + ] index_map = [] else: if isinstance(level, (int, str)): @@ -3027,8 +3138,11 @@ def rename(index): if all(isinstance(l, int) for l in level): for lev in level: if lev >= len(self._internal.index_map): - raise IndexError('Too many levels: Index has only {} level, not {}' - .format(len(self._internal.index_map), lev + 1)) + raise IndexError( + "Too many levels: Index has only {} level, not {}".format( + len(self._internal.index_map), lev + 1 + ) + ) idx = level elif all(isinstance(lev, str) for lev in level): idx = [] @@ -3038,12 +3152,15 @@ def rename(index): idx.append(i) except ValueError: if multi_index: - raise KeyError('Level unknown not found') + raise KeyError("Level unknown not found") else: - raise KeyError('Level unknown must be same as name ({})' - .format(self._internal.index_columns[0])) + raise KeyError( + "Level unknown must be same as name ({})".format( + self._internal.index_columns[0] + ) + ) else: - raise ValueError('Level should be all int or all string.') + raise ValueError("Level should be all int or all string.") idx.sort() new_index_map = [] @@ -3052,21 +3169,24 @@ def rename(index): info = self._internal.index_map[i] index_column, index_name = info new_index_map.append( - (index_column, - index_name if index_name is not None else rename(i))) + (index_column, index_name if index_name is not None else rename(i)) + ) index_map.remove(info) - new_data_scols = [self._internal.scol_for(column).alias(name_like_string(name)) - for column, name in new_index_map] + new_data_scols = [ + self._internal.scol_for(column).alias(name_like_string(name)) 
+ for column, name in new_index_map + ] if len(index_map) > 0: index_scols = [scol_for(self._sdf, column) for column, _ in index_map] sdf = self._sdf.select( - index_scols + new_data_scols + self._internal.column_scols + - list(HIDDEN_COLUMNS)) + index_scols + new_data_scols + self._internal.column_scols + list(HIDDEN_COLUMNS) + ) else: sdf = self._sdf.select( - new_data_scols + self._internal.column_scols + list(HIDDEN_COLUMNS)) + new_data_scols + self._internal.column_scols + list(HIDDEN_COLUMNS) + ) # Now, new internal Spark columns are named as same as index name. new_index_map = [(column, name) for column, name in new_index_map] @@ -3080,15 +3200,21 @@ def rename(index): if self._internal.column_labels_level > 1: column_depth = len(self._internal.column_labels[0]) if col_level >= column_depth: - raise IndexError('Too many levels: Index has only {} levels, not {}' - .format(column_depth, col_level + 1)) + raise IndexError( + "Too many levels: Index has only {} levels, not {}".format( + column_depth, col_level + 1 + ) + ) if any(col_level + len(name) > column_depth for _, name in new_index_map): - raise ValueError('Item must have length equal to number of levels.') - column_labels = ([tuple(([col_fill] * col_level) - + list(name) - + ([col_fill] * (column_depth - (len(name) + col_level)))) - for _, name in new_index_map] - + self._internal.column_labels) + raise ValueError("Item must have length equal to number of levels.") + column_labels = [ + tuple( + ([col_fill] * col_level) + + list(name) + + ([col_fill] * (column_depth - (len(name) + col_level))) + ) + for _, name in new_index_map + ] + self._internal.column_labels else: column_labels = [name for _, name in new_index_map] + self._internal.column_labels @@ -3096,8 +3222,11 @@ def rename(index): sdf=sdf, index_map=index_map, column_labels=column_labels, - column_scols=([scol_for(sdf, name_like_string(name)) for _, name in new_index_map] - + [scol_for(sdf, col) for col in self._internal.data_columns])) + column_scols=( + [scol_for(sdf, name_like_string(name)) for _, name in new_index_map] + + [scol_for(sdf, col) for col in self._internal.data_columns] + ), + ) if inplace: self._internal = internal @@ -3292,8 +3421,13 @@ def diff(self, periods: int = 1, axis: Union[int, str] = 0): return self._apply_series_op(lambda kser: kser.diff(periods)) # TODO: axis should support 1 or 'columns' either at this moment - def nunique(self, axis: Union[int, str] = 0, dropna: bool = True, approx: bool = False, - rsd: float = 0.05) -> pd.Series: + def nunique( + self, + axis: Union[int, str] = 0, + dropna: bool = True, + approx: bool = False, + rsd: float = 0.05, + ) -> pd.Series: """ Return number of unique elements in the object. 
@@ -3342,8 +3476,9 @@ def nunique(self, axis: Union[int, str] = 0, dropna: bool = True, approx: bool = axis = validate_axis(axis) if axis != 0: raise NotImplementedError('axis should be either 0 or "index" currently.') - res = self._sdf.select([self[label]._nunique(dropna, approx, rsd) - for label in self._internal.column_labels]).toPandas() + res = self._sdf.select( + [self[label]._nunique(dropna, approx, rsd) for label in self._internal.column_labels] + ).toPandas() if self._internal.column_labels_level == 1: res.columns = [label[0] for label in self._internal.column_labels] else: @@ -3412,11 +3547,12 @@ def round(self, decimals=0): third 0.9 0.0 0.49 """ if isinstance(decimals, ks.Series): - decimals = {k if isinstance(k, tuple) else (k,): v - for k, v in decimals._to_internal_pandas().items()} + decimals = { + k if isinstance(k, tuple) else (k,): v + for k, v in decimals._to_internal_pandas().items() + } elif isinstance(decimals, dict): - decimals = {k if isinstance(k, tuple) else (k,): v - for k, v in decimals.items()} + decimals = {k if isinstance(k, tuple) else (k,): v for k, v in decimals.items()} elif isinstance(decimals, int): decimals = {k: decimals for k in self._internal.column_labels} else: @@ -3431,7 +3567,7 @@ def op(kser): return self._apply_series_op(op) - def duplicated(self, subset=None, keep='first'): + def duplicated(self, subset=None, keep="first"): """ Return boolean Series denoting duplicate rows, optionally only considering certain columns. @@ -3486,6 +3622,7 @@ def duplicated(self, subset=None, keep='first'): Name: 0, dtype: bool """ from databricks.koalas.series import _col + if len(self._internal.index_names) > 1: raise ValueError("Now we don't support multi-index Now.") @@ -3500,43 +3637,52 @@ def duplicated(self, subset=None, keep='first'): subset = [sub if isinstance(sub, tuple) else (sub,) for sub in subset] diff = set(subset).difference(set(self._internal.column_labels)) if len(diff) > 0: - raise KeyError(', '.join([str(d) if len(d) > 1 else d[0] for d in diff])) + raise KeyError(", ".join([str(d) if len(d) > 1 else d[0] for d in diff])) group_cols = [self._internal.column_name_for(label) for label in subset] index_column = self._internal.index_columns[0] if self._internal.index_names[0] is not None: name = self._internal.index_names[0] else: - name = ('0',) + name = ("0",) column = name_like_string(name) sdf = self._sdf if column == index_column: index_column = SPARK_DEFAULT_INDEX_NAME - sdf = sdf.select([self._internal.index_scols[0].alias(index_column)] - + self._internal.data_scols) + sdf = sdf.select( + [self._internal.index_scols[0].alias(index_column)] + self._internal.data_scols + ) - if keep == 'first' or keep == 'last': - if keep == 'first': + if keep == "first" or keep == "last": + if keep == "first": ord_func = spark.functions.asc else: ord_func = spark.functions.desc - window = Window.partitionBy(group_cols) \ - .orderBy(ord_func(NATURAL_ORDER_COLUMN_NAME)) \ + window = ( + Window.partitionBy(group_cols) + .orderBy(ord_func(NATURAL_ORDER_COLUMN_NAME)) .rowsBetween(Window.unboundedPreceding, Window.currentRow) + ) sdf = sdf.withColumn(column, F.row_number().over(window) > 1) elif not keep: - window = Window.partitionBy(group_cols) \ - .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing) - sdf = sdf.withColumn(column, F.count('*').over(window) > 1) + window = Window.partitionBy(group_cols).rowsBetween( + Window.unboundedPreceding, Window.unboundedFollowing + ) + sdf = sdf.withColumn(column, F.count("*").over(window) > 1) else: 
raise ValueError("'keep' only support 'first', 'last' and False") sdf = sdf.select(scol_for(sdf, index_column), scol_for(sdf, column)) - return _col(DataFrame(_InternalFrame(sdf=sdf, - index_map=[(index_column, - self._internal.index_names[0])], - column_labels=[name], - column_scols=[scol_for(sdf, column)]))) + return _col( + DataFrame( + _InternalFrame( + sdf=sdf, + index_map=[(index_column, self._internal.index_names[0])], + column_labels=[name], + column_scols=[scol_for(sdf, column)], + ) + ) + ) def to_koalas(self, index_col: Optional[Union[str, List[str]]] = None): """ @@ -3598,6 +3744,7 @@ def to_koalas(self, index_col: Optional[Union[str, List[str]]] = None): else: assert isinstance(self, spark.DataFrame), type(self) from databricks.koalas.namespace import _get_index_map + index_map = _get_index_map(self, index_col) internal = _InternalFrame(sdf=self, index_map=index_map) return DataFrame(internal) @@ -3641,9 +3788,14 @@ def cache(self): """ return _CachedDataFrame(self._internal) - def to_table(self, name: str, format: Optional[str] = None, mode: str = 'overwrite', - partition_cols: Union[str, List[str], None] = None, - **options): + def to_table( + self, + name: str, + format: Optional[str] = None, + mode: str = "overwrite", + partition_cols: Union[str, List[str], None] = None, + **options + ): """ Write the DataFrame into a Spark table. @@ -3694,11 +3846,17 @@ def to_table(self, name: str, format: Optional[str] = None, mode: str = 'overwri >>> df.to_table('%s.my_table' % db, partition_cols='date') """ - self.to_spark().write.saveAsTable(name=name, format=format, mode=mode, - partitionBy=partition_cols, **options) + self.to_spark().write.saveAsTable( + name=name, format=format, mode=mode, partitionBy=partition_cols, **options + ) - def to_delta(self, path: str, mode: str = 'overwrite', - partition_cols: Union[str, List[str], None] = None, **options): + def to_delta( + self, + path: str, + mode: str = "overwrite", + partition_cols: Union[str, List[str], None] = None, + **options + ): """ Write the DataFrame out as a Delta Lake table. @@ -3754,11 +3912,16 @@ def to_delta(self, path: str, mode: str = 'overwrite', ... mode='overwrite', replaceWhere='date >= "2012-01-01"') """ self.to_spark_io( - path=path, mode=mode, format="delta", partition_cols=partition_cols, **options) - - def to_parquet(self, path: str, mode: str = 'overwrite', - partition_cols: Union[str, List[str], None] = None, - compression: Optional[str] = None): + path=path, mode=mode, format="delta", partition_cols=partition_cols, **options + ) + + def to_parquet( + self, + path: str, + mode: str = "overwrite", + partition_cols: Union[str, List[str], None] = None, + compression: Optional[str] = None, + ): """ Write the DataFrame out as a Parquet file or directory. @@ -3808,11 +3971,17 @@ def to_parquet(self, path: str, mode: str = 'overwrite', ... partition_cols=['date', 'country']) """ self.to_spark().write.parquet( - path=path, mode=mode, partitionBy=partition_cols, compression=compression) - - def to_spark_io(self, path: Optional[str] = None, format: Optional[str] = None, - mode: str = 'overwrite', partition_cols: Union[str, List[str], None] = None, - **options): + path=path, mode=mode, partitionBy=partition_cols, compression=compression + ) + + def to_spark_io( + self, + path: Optional[str] = None, + format: Optional[str] = None, + mode: str = "overwrite", + partition_cols: Union[str, List[str], None] = None, + **options + ): """Write the DataFrame out to a Spark data source. 
Parameters @@ -3861,7 +4030,8 @@ def to_spark_io(self, path: Optional[str] = None, format: Optional[str] = None, >>> df.to_spark_io(path='%s/to_spark_io/foo.json' % path, format='json') """ self.to_spark().write.save( - path=path, format=format, mode=mode, partitionBy=partition_cols, **options) + path=path, format=format, mode=mode, partitionBy=partition_cols, **options + ) def to_spark(self, index_col: Optional[Union[str, List[str]]] = None): """ @@ -3944,8 +4114,9 @@ def to_spark(self, index_col: Optional[Union[str, List[str]]] = None): data_column_names = [] data_columns = [] - data_columns_column_labels = \ - zip(self._internal.data_columns, self._internal.column_labels) + data_columns_column_labels = zip( + self._internal.data_columns, self._internal.column_labels + ) # TODO: this code is similar with _InternalFrame.spark_df. Might have to deduplicate. for i, (column, label) in enumerate(data_columns_column_labels): scol = self._internal.scol_for(label) @@ -3960,15 +4131,16 @@ def to_spark(self, index_col: Optional[Union[str, List[str]]] = None): if len(index_col) != len(old_index_scols): raise ValueError( "length of index columns is %s; however, the length of the given " - "'index_col' is %s." % (len(old_index_scols), len(index_col))) + "'index_col' is %s." % (len(old_index_scols), len(index_col)) + ) if any(col in data_column_names for col in index_col): - raise ValueError( - "'index_col' cannot be overlapped with other columns.") + raise ValueError("'index_col' cannot be overlapped with other columns.") sdf = self._internal.spark_internal_df new_index_scols = [ - index_scol.alias(col) for index_scol, col in zip(old_index_scols, index_col)] + index_scol.alias(col) for index_scol, col in zip(old_index_scols, index_col) + ] return sdf.select(new_index_scols + data_columns) def to_pandas(self): @@ -4056,26 +4228,30 @@ def assign(self, **kwargs): def _assign(self, kwargs): assert isinstance(kwargs, dict) from databricks.koalas.series import Series + for k, v in kwargs.items(): - if not (isinstance(v, (Series, spark.Column)) or - callable(v) or pd.api.types.is_scalar(v)): - raise TypeError("Column assignment doesn't support type " - "{0}".format(type(v).__name__)) + if not ( + isinstance(v, (Series, spark.Column)) or callable(v) or pd.api.types.is_scalar(v) + ): + raise TypeError( + "Column assignment doesn't support type " "{0}".format(type(v).__name__) + ) if callable(v): kwargs[k] = v(self) - pairs = {(k if isinstance(k, tuple) else (k,)): - (v._scol if isinstance(v, Series) - else v if isinstance(v, spark.Column) - else F.lit(v)) - for k, v in kwargs.items()} + pairs = { + (k if isinstance(k, tuple) else (k,)): ( + v._scol if isinstance(v, Series) else v if isinstance(v, spark.Column) else F.lit(v) + ) + for k, v in kwargs.items() + } scols = [] for label in self._internal.column_labels: for i in range(len(label)): - if label[:len(label)-i] in pairs: + if label[: len(label) - i] in pairs: name = self._internal.column_name_for(label) - scol = pairs[label[:len(label)-i]].alias(name) + scol = pairs[label[: len(label) - i]].alias(name) break else: scol = self._internal.scol_for(label) @@ -4083,22 +4259,27 @@ def _assign(self, kwargs): column_labels = self._internal.column_labels.copy() for label, scol in pairs.items(): - if label not in set(i[:len(label)] for i in self._internal.column_labels): + if label not in set(i[: len(label)] for i in self._internal.column_labels): scols.append(scol.alias(name_like_string(label))) column_labels.append(label) level = 
self._internal.column_labels_level - column_labels = [tuple(list(label) + ([''] * (level - len(label)))) - for label in column_labels] + column_labels = [ + tuple(list(label) + ([""] * (level - len(label)))) for label in column_labels + ] internal = self._internal.with_new_columns(scols, column_labels=column_labels) return DataFrame(internal) @staticmethod - def from_records(data: Union[np.array, List[tuple], dict, pd.DataFrame], - index: Union[str, list, np.array] = None, exclude: list = None, - columns: list = None, coerce_float: bool = False, nrows: int = None) \ - -> 'DataFrame': + def from_records( + data: Union[np.array, List[tuple], dict, pd.DataFrame], + index: Union[str, list, np.array] = None, + exclude: list = None, + columns: list = None, + coerce_float: bool = False, + nrows: int = None, + ) -> "DataFrame": """ Convert structured or record ndarray to DataFrame. @@ -4148,8 +4329,9 @@ def from_records(data: Union[np.array, List[tuple], dict, pd.DataFrame], 1 0.0 1.0 0.0 2 0.0 0.0 1.0 """ - return DataFrame(pd.DataFrame.from_records(data, index, exclude, columns, coerce_float, - nrows)) + return DataFrame( + pd.DataFrame.from_records(data, index, exclude, columns, coerce_float, nrows) + ) def to_records(self, index=True, column_dtypes=None, index_dtypes=None): """ @@ -4227,9 +4409,10 @@ def to_records(self, index=True, column_dtypes=None, index_dtypes=None): kdf = self return validate_arguments_and_invoke_function( - kdf._to_internal_pandas(), self.to_records, pd.DataFrame.to_records, args) + kdf._to_internal_pandas(), self.to_records, pd.DataFrame.to_records, args + ) - def copy(self) -> 'DataFrame': + def copy(self) -> "DataFrame": """ Make a copy of this object's indices and data. @@ -4253,7 +4436,7 @@ def copy(self) -> 'DataFrame': """ return DataFrame(self._internal.copy()) - def dropna(self, axis=0, how='any', thresh=None, subset=None, inplace=False): + def dropna(self, axis=0, how="any", thresh=None, subset=None, inplace=False): """ Remove missing values. 
@@ -4346,28 +4529,28 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None, inplace=False): labels = [subset] else: labels = [sub if isinstance(sub, tuple) else (sub,) for sub in subset] - invalids = [label for label in labels - if label not in self._internal.column_labels] + invalids = [label for label in labels if label not in self._internal.column_labels] if len(invalids) > 0: raise KeyError(invalids) else: labels = self._internal.column_labels - cnt = reduce(lambda x, y: x + y, - [F.when(self[label].notna()._scol, 1).otherwise(0) - for label in labels], - F.lit(0)) + cnt = reduce( + lambda x, y: x + y, + [F.when(self[label].notna()._scol, 1).otherwise(0) for label in labels], + F.lit(0), + ) if thresh is not None: pred = cnt >= F.lit(int(thresh)) - elif how == 'any': + elif how == "any": pred = cnt == F.lit(len(labels)) - elif how == 'all': + elif how == "all": pred = cnt > F.lit(0) else: if how is not None: - raise ValueError('invalid how option: {h}'.format(h=how)) + raise ValueError("invalid how option: {h}".format(h=how)) else: - raise TypeError('must specify how or thresh') + raise TypeError("must specify how or thresh") internal = self._internal.with_filter(pred) if inplace: @@ -4466,7 +4649,7 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, limit=None): if not isinstance(value, (float, int, str, bool, dict, pd.Series)): raise TypeError("Unsupported type %s" % type(value)) if limit is not None: - raise ValueError('limit parameter for value is not support now') + raise ValueError("limit parameter for value is not support now") if isinstance(value, pd.Series): value = value.to_dict() if isinstance(value, dict): @@ -4478,17 +4661,21 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, limit=None): def op(kser): label = kser._internal.column_labels[0] for k, v in value.items(): - if k == label[:len(k)]: - return kser.fillna(value=value[k], method=method, axis=axis, - inplace=False, limit=limit) + if k == label[: len(k)]: + return kser.fillna( + value=value[k], method=method, axis=axis, inplace=False, limit=limit + ) else: return kser + else: - op = lambda kser: kser.fillna(value=value, method=method, axis=axis, - inplace=False, limit=limit) + op = lambda kser: kser.fillna( + value=value, method=method, axis=axis, inplace=False, limit=limit + ) elif method is not None: - op = lambda kser: kser.fillna(value=value, method=method, axis=axis, - inplace=False, limit=limit) + op = lambda kser: kser.fillna( + value=value, method=method, axis=axis, inplace=False, limit=limit + ) else: raise ValueError("Must specify a fillna 'value' or 'method' parameter.") @@ -4551,7 +4738,7 @@ def bfill(self, axis=None, inplace=False, limit=None): 2 NaN 3.0 1.0 5 3 NaN 3.0 1.0 4 """ - return self.fillna(method='bfill', axis=axis, inplace=inplace, limit=limit) + return self.fillna(method="bfill", axis=axis, inplace=inplace, limit=limit) # TODO: add 'downcast' when value parameter exists def ffill(self, axis=None, inplace=False, limit=None): @@ -4606,10 +4793,18 @@ def ffill(self, axis=None, inplace=False, limit=None): 2 3.0 4.0 NaN 5 3 3.0 3.0 1.0 4 """ - return self.fillna(method='ffill', axis=axis, inplace=inplace, limit=limit) - - def replace(self, to_replace=None, value=None, subset=None, inplace=False, - limit=None, regex=False, method='pad'): + return self.fillna(method="ffill", axis=axis, inplace=inplace, limit=limit) + + def replace( + self, + to_replace=None, + value=None, + subset=None, + inplace=False, + limit=None, + regex=False, + method="pad", + 
): """ Returns a new DataFrame replacing a value with another value. @@ -4704,7 +4899,7 @@ def replace(self, to_replace=None, value=None, subset=None, inplace=False, parameter. Calling `replace` on its index such as `df.replace({0: 10, 1: 100})` will throw an error. Instead specify column-name like `df.replace({'A': {0: 10, 1: 100}})`. """ - if method != 'pad': + if method != "pad": raise NotImplementedError("replace currently works only for method='pad") if limit is not None: raise NotImplementedError("replace currently works only when limit=None") @@ -4719,7 +4914,7 @@ def replace(self, to_replace=None, value=None, subset=None, inplace=False, if isinstance(value, list) and isinstance(to_replace, list): if len(value) != len(to_replace): - raise ValueError('Length of to_replace and value must be same') + raise ValueError("Length of to_replace and value must be same") # TODO: Do we still need to support this argument? if subset is None: @@ -4733,8 +4928,11 @@ def replace(self, to_replace=None, value=None, subset=None, inplace=False, subset = [self._internal.column_name_for(label) for label in subset] sdf = self._sdf - if isinstance(to_replace, dict) and value is None and \ - (not any(isinstance(i, dict) for i in to_replace.values())): + if ( + isinstance(to_replace, dict) + and value is None + and (not any(isinstance(i, dict) for i in to_replace.values())) + ): sdf = sdf.replace(to_replace, value, subset) elif isinstance(to_replace, dict): for name, replacement in to_replace.items(): @@ -4744,9 +4942,12 @@ def replace(self, to_replace=None, value=None, subset=None, inplace=False, if isinstance(replacement, dict): sdf = sdf.replace(replacement, subset=df_column) else: - sdf = sdf.withColumn(df_column, - F.when(scol_for(sdf, df_column) == replacement, value) - .otherwise(scol_for(sdf, df_column))) + sdf = sdf.withColumn( + df_column, + F.when(scol_for(sdf, df_column) == replacement, value).otherwise( + scol_for(sdf, df_column) + ), + ) else: sdf = sdf.replace(to_replace, value, subset) @@ -4756,8 +4957,7 @@ def replace(self, to_replace=None, value=None, subset=None, inplace=False, else: return DataFrame(internal) - def clip(self, lower: Union[float, int] = None, upper: Union[float, int] = None) \ - -> 'DataFrame': + def clip(self, lower: Union[float, int] = None, upper: Union[float, int] = None) -> "DataFrame": """ Trim values at input threshold(s). @@ -4791,14 +4991,22 @@ def clip(self, lower: Union[float, int] = None, upper: Union[float, int] = None) will output the original DataFrame, simply ignoring the incompatible types. 
""" if is_list_like(lower) or is_list_like(upper): - raise ValueError("List-like value are not supported for 'lower' and 'upper' at the " + - "moment") + raise ValueError( + "List-like value are not supported for 'lower' and 'upper' at the " + "moment" + ) if lower is None and upper is None: return self - numeric_types = (DecimalType, DoubleType, FloatType, ByteType, IntegerType, LongType, - ShortType) + numeric_types = ( + DecimalType, + DoubleType, + FloatType, + ByteType, + IntegerType, + LongType, + ShortType, + ) def op(kser): if isinstance(kser.spark_type, numeric_types): @@ -4865,14 +5073,13 @@ def head(self, n=5): 1 bee 2 falcon """ - if get_option('compute.ordered_head'): + if get_option("compute.ordered_head"): sdf = self._sdf.orderBy(NATURAL_ORDER_COLUMN_NAME) else: sdf = self._sdf return DataFrame(self._internal.with_new_sdf(sdf.limit(n))) - def pivot_table(self, values=None, index=None, columns=None, - aggfunc='mean', fill_value=None): + def pivot_table(self, values=None, index=None, columns=None, aggfunc="mean", fill_value=None): """ Create a spreadsheet-style pivot table as a DataFrame. The levels in the pivot table will be stored in MultiIndex objects (hierarchical @@ -4976,18 +5183,24 @@ def pivot_table(self, values=None, index=None, columns=None, raise ValueError("columns should be string or tuple.") if not isinstance(values, (str, tuple)) and not isinstance(values, list): - raise ValueError('values should be string or list of one column.') - - if not isinstance(aggfunc, str) and \ - (not isinstance(aggfunc, dict) or - not all(isinstance(key, (str, tuple)) and isinstance(value, str) - for key, value in aggfunc.items())): - raise ValueError("aggfunc must be a dict mapping from column name (string or tuple) " - "to aggregate functions (string).") + raise ValueError("values should be string or list of one column.") + + if not isinstance(aggfunc, str) and ( + not isinstance(aggfunc, dict) + or not all( + isinstance(key, (str, tuple)) and isinstance(value, str) + for key, value in aggfunc.items() + ) + ): + raise ValueError( + "aggfunc must be a dict mapping from column name (string or tuple) " + "to aggregate functions (string)." + ) if isinstance(aggfunc, dict) and index is None: - raise NotImplementedError("pivot_table doesn't support aggfunc" - " as dict and without index.") + raise NotImplementedError( + "pivot_table doesn't support aggfunc" " as dict and without index." 
+ ) if isinstance(values, list) and index is None: raise NotImplementedError("values can't be a list without index.") @@ -4998,41 +5211,58 @@ def pivot_table(self, values=None, index=None, columns=None, if isinstance(values, list): values = [col if isinstance(col, tuple) else (col,) for col in values] - if not all(isinstance(self._internal.spark_type_for(col), NumericType) - for col in values): - raise TypeError('values should be a numeric type.') + if not all( + isinstance(self._internal.spark_type_for(col), NumericType) for col in values + ): + raise TypeError("values should be a numeric type.") else: values = values if isinstance(values, tuple) else (values,) if not isinstance(self._internal.spark_type_for(values), NumericType): - raise TypeError('values should be a numeric type.') + raise TypeError("values should be a numeric type.") if isinstance(aggfunc, str): if isinstance(values, list): - agg_cols = [F.expr('{1}(`{0}`) as `{0}`' - .format(self._internal.column_name_for(value), aggfunc)) - for value in values] + agg_cols = [ + F.expr( + "{1}(`{0}`) as `{0}`".format(self._internal.column_name_for(value), aggfunc) + ) + for value in values + ] else: - agg_cols = [F.expr('{1}(`{0}`) as `{0}`' - .format(self._internal.column_name_for(values), aggfunc))] + agg_cols = [ + F.expr( + "{1}(`{0}`) as `{0}`".format( + self._internal.column_name_for(values), aggfunc + ) + ) + ] elif isinstance(aggfunc, dict): - aggfunc = {key if isinstance(key, tuple) else (key,): value - for key, value in aggfunc.items()} - agg_cols = [F.expr('{1}(`{0}`) as `{0}`' - .format(self._internal.column_name_for(key), value)) - for key, value in aggfunc.items()] + aggfunc = { + key if isinstance(key, tuple) else (key,): value for key, value in aggfunc.items() + } + agg_cols = [ + F.expr("{1}(`{0}`) as `{0}`".format(self._internal.column_name_for(key), value)) + for key, value in aggfunc.items() + ] agg_columns = [key for key, _ in aggfunc.items()] if set(agg_columns) != set(values): raise ValueError("Columns in aggfunc must be the same as values.") if index is None: - sdf = self._sdf.groupBy() \ - .pivot(pivot_col=self._internal.column_name_for(columns)).agg(*agg_cols) + sdf = ( + self._sdf.groupBy() + .pivot(pivot_col=self._internal.column_name_for(columns)) + .agg(*agg_cols) + ) elif isinstance(index, list): index = [label if isinstance(label, tuple) else (label,) for label in index] - sdf = self._sdf.groupBy([self._internal.scol_for(label) for label in index]) \ - .pivot(pivot_col=self._internal.column_name_for(columns)).agg(*agg_cols) + sdf = ( + self._sdf.groupBy([self._internal.scol_for(label) for label in index]) + .pivot(pivot_col=self._internal.column_name_for(columns)) + .agg(*agg_cols) + ) else: raise ValueError("index should be a None or a list of columns.") @@ -5053,44 +5283,50 @@ def pivot_table(self, values=None, index=None, columns=None, # then ['2_b', '2_e', '3_b', '3_e']. # We sort the columns of Spark DataFrame by values. 
- data_columns.sort(key=lambda x: x.split('_', 1)[1]) + data_columns.sort(key=lambda x: x.split("_", 1)[1]) sdf = sdf.select(index_columns + data_columns) - column_name_to_index = dict(zip(self._internal.data_columns, - self._internal.column_labels)) - column_labels = [tuple(list(column_name_to_index[name.split('_')[1]]) - + [name.split('_')[0]]) - for name in data_columns] + column_name_to_index = dict( + zip(self._internal.data_columns, self._internal.column_labels) + ) + column_labels = [ + tuple(list(column_name_to_index[name.split("_")[1]]) + [name.split("_")[0]]) + for name in data_columns + ] index_map = list(zip(index_columns, index)) - column_label_names = (([None] * column_labels_level(values)) - + [str(columns) if len(columns) > 1 else columns[0]]) - internal = _InternalFrame(sdf=sdf, - index_map=index_map, - column_labels=column_labels, - column_scols=[scol_for(sdf, col) - for col in data_columns], - column_label_names=column_label_names) + column_label_names = ([None] * column_labels_level(values)) + [ + str(columns) if len(columns) > 1 else columns[0] + ] + internal = _InternalFrame( + sdf=sdf, + index_map=index_map, + column_labels=column_labels, + column_scols=[scol_for(sdf, col) for col in data_columns], + column_label_names=column_label_names, + ) kdf = DataFrame(internal) else: column_labels = [tuple(list(values[0]) + [column]) for column in data_columns] index_map = list(zip(index_columns, index)) - column_label_names = (([None] * len(values[0])) - + [str(columns) if len(columns) > 1 else columns[0]]) - internal = _InternalFrame(sdf=sdf, - index_map=index_map, - column_labels=column_labels, - column_scols=[scol_for(sdf, col) - for col in data_columns], - column_label_names=column_label_names) + column_label_names = ([None] * len(values[0])) + [ + str(columns) if len(columns) > 1 else columns[0] + ] + internal = _InternalFrame( + sdf=sdf, + index_map=index_map, + column_labels=column_labels, + column_scols=[scol_for(sdf, col) for col in data_columns], + column_label_names=column_label_names, + ) kdf = DataFrame(internal) return kdf else: index_columns = [self._internal.column_name_for(label) for label in index] index_map = list(zip(index_columns, index)) column_label_names = [str(columns) if len(columns) > 1 else columns[0]] - internal = _InternalFrame(sdf=sdf, - index_map=index_map, - column_label_names=column_label_names) + internal = _InternalFrame( + sdf=sdf, index_map=index_map, column_label_names=column_label_names + ) return DataFrame(internal) else: if isinstance(values, list): @@ -5103,9 +5339,9 @@ def pivot_table(self, values=None, index=None, columns=None, sdf = sdf.withColumn(colname, F.lit(index_value)) index_map.append((colname, None)) column_label_names = [str(columns) if len(columns) > 1 else columns[0]] - internal = _InternalFrame(sdf=sdf, - index_map=index_map, - column_label_names=column_label_names) + internal = _InternalFrame( + sdf=sdf, index_map=index_map, column_label_names=column_label_names + ) return DataFrame(internal) def pivot(self, index=None, columns=None, values=None): @@ -5204,13 +5440,12 @@ def pivot(self, index=None, columns=None, values=None): index = [index] else: df = self.copy() - df['__DUMMY__'] = F.monotonically_increasing_id() - df.set_index('__DUMMY__', append=True, inplace=True) + df["__DUMMY__"] = F.monotonically_increasing_id() + df.set_index("__DUMMY__", append=True, inplace=True) df.reset_index(level=range(len(df._internal.index_map) - 1), inplace=True) - index = df._internal.column_labels[:len(df._internal.index_map)] + 
index = df._internal.column_labels[: len(df._internal.index_map)] - df = df.pivot_table( - index=index, columns=columns, values=values, aggfunc='first') + df = df.pivot_table(index=index, columns=columns, values=values, aggfunc="first") if should_use_existing_index: return df @@ -5218,7 +5453,8 @@ def pivot(self, index=None, columns=None, values=None): index_columns = df._internal.index_columns # Note that the existing indexing column won't exist in the pivoted DataFrame. internal = df._internal.copy( - index_map=[(index_column, None) for index_column in index_columns]) + index_map=[(index_column, None) for index_column in index_columns] + ) return DataFrame(internal) @property @@ -5240,29 +5476,39 @@ def columns(self, columns): if len(old_names) != len(column_labels): raise ValueError( "Length mismatch: Expected axis has %d elements, new values have %d elements" - % (len(old_names), len(column_labels))) + % (len(old_names), len(column_labels)) + ) column_label_names = columns.names data_columns = [name_like_string(label) for label in column_labels] - column_scols = [self._internal.scol_for(label).alias(name) - for label, name in zip(self._internal.column_labels, data_columns)] - self._internal = self._internal.with_new_columns(column_scols, - column_labels=column_labels) + column_scols = [ + self._internal.scol_for(label).alias(name) + for label, name in zip(self._internal.column_labels, data_columns) + ] + self._internal = self._internal.with_new_columns( + column_scols, column_labels=column_labels + ) sdf = self._sdf.select( - self._internal.index_scols + - [self._internal.scol_for(label).alias(name) - for label, name in zip(self._internal.column_labels, data_columns)] + - list(HIDDEN_COLUMNS)) + self._internal.index_scols + + [ + self._internal.scol_for(label).alias(name) + for label, name in zip(self._internal.column_labels, data_columns) + ] + + list(HIDDEN_COLUMNS) + ) column_scols = [scol_for(sdf, col) for col in data_columns] - self._internal = self._internal.copy(sdf=sdf, - column_labels=column_labels, - column_scols=column_scols, - column_label_names=column_label_names) + self._internal = self._internal.copy( + sdf=sdf, + column_labels=column_labels, + column_scols=column_scols, + column_label_names=column_label_names, + ) else: old_names = self._internal.column_labels if len(old_names) != len(columns): raise ValueError( "Length mismatch: Expected axis has %d elements, new values have %d elements" - % (len(old_names), len(columns))) + % (len(old_names), len(columns)) + ) column_labels = [col if isinstance(col, tuple) else (col,) for col in columns] if isinstance(columns, pd.Index): column_label_names = columns.names @@ -5270,15 +5516,20 @@ def columns(self, columns): column_label_names = None data_columns = [name_like_string(label) for label in column_labels] sdf = self._sdf.select( - self._internal.index_scols + - [self._internal.scol_for(label).alias(name) - for label, name in zip(self._internal.column_labels, data_columns)] + - list(HIDDEN_COLUMNS)) + self._internal.index_scols + + [ + self._internal.scol_for(label).alias(name) + for label, name in zip(self._internal.column_labels, data_columns) + ] + + list(HIDDEN_COLUMNS) + ) column_scols = [scol_for(sdf, col) for col in data_columns] - self._internal = self._internal.copy(sdf=sdf, - column_labels=column_labels, - column_scols=column_scols, - column_label_names=column_label_names) + self._internal = self._internal.copy( + sdf=sdf, + column_labels=column_labels, + column_scols=column_scols, + 
column_label_names=column_label_names, + ) @property def dtypes(self): @@ -5310,9 +5561,12 @@ def dtypes(self): f datetime64[ns] dtype: object """ - return pd.Series([self[label].dtype for label in self._internal.column_labels], - index=pd.Index([label if len(label) > 1 else label[0] - for label in self._internal.column_labels])) + return pd.Series( + [self[label].dtype for label in self._internal.column_labels], + index=pd.Index( + [label if len(label) > 1 else label[0] for label in self._internal.column_labels] + ), + ) def select_dtypes(self, include=None, exclude=None): """ @@ -5420,13 +5674,15 @@ def select_dtypes(self, include=None, exclude=None): exclude = (exclude,) if exclude is not None else () if not any((include, exclude)): - raise ValueError('at least one of include or exclude must be ' - 'nonempty') + raise ValueError("at least one of include or exclude must be " "nonempty") # can't both include AND exclude! if set(include).intersection(set(exclude)): - raise ValueError('include and exclude overlap on {inc_ex}'.format( - inc_ex=set(include).intersection(set(exclude)))) + raise ValueError( + "include and exclude overlap on {inc_ex}".format( + inc_ex=set(include).intersection(set(exclude)) + ) + ) # Handle Spark types include_spark_type = [] @@ -5462,12 +5718,14 @@ def select_dtypes(self, include=None, exclude=None): for label in self._internal.column_labels: if len(include) > 0: should_include = ( - infer_dtype_from_object(self[label].dtype.name) in include_numpy_type or - self._internal.spark_type_for(label) in include_spark_type) + infer_dtype_from_object(self[label].dtype.name) in include_numpy_type + or self._internal.spark_type_for(label) in include_spark_type + ) else: should_include = not ( - infer_dtype_from_object(self[label].dtype.name) in exclude_numpy_type or - self._internal.spark_type_for(label) in exclude_spark_type) + infer_dtype_from_object(self[label].dtype.name) in exclude_numpy_type + or self._internal.spark_type_for(label) in exclude_spark_type + ) if should_include: column_labels.append(label) @@ -5533,10 +5791,15 @@ def count(self, axis=None): Name: 0, dtype: int64 """ return self._reduce_for_stat_function( - _Frame._count_expr, name="count", axis=axis, numeric_only=False) - - def drop(self, labels=None, axis=1, - columns: Union[str, Tuple[str, ...], List[str], List[Tuple[str, ...]]] = None): + _Frame._count_expr, name="count", axis=axis, numeric_only=False + ) + + def drop( + self, + labels=None, + axis=1, + columns: Union[str, Tuple[str, ...], List[str], List[Tuple[str, ...]]] = None, + ): """ Drop specified labels from columns. 
@@ -5620,40 +5883,51 @@ def drop(self, labels=None, axis=1, elif isinstance(columns, tuple): columns = [columns] else: - columns = [col if isinstance(col, tuple) else (col,) # type: ignore - for col in columns] - drop_column_labels = set(label for label in self._internal.column_labels - for col in columns - if label[:len(col)] == col) + columns = [ # type: ignore + col if isinstance(col, tuple) else (col,) for col in columns # type: ignore + ] + drop_column_labels = set( + label + for label in self._internal.column_labels + for col in columns + if label[: len(col)] == col + ) if len(drop_column_labels) == 0: raise KeyError(columns) - cols, labels = zip(*((column, label) - for column, label - in zip(self._internal.data_columns, self._internal.column_labels) - if label not in drop_column_labels)) + cols, labels = zip( + *( + (column, label) + for column, label in zip( + self._internal.data_columns, self._internal.column_labels + ) + if label not in drop_column_labels + ) + ) column_scols = [self._internal.scol_for(label) for label in labels] internal = self._internal.with_new_columns(column_scols, column_labels=list(labels)) return DataFrame(internal) else: raise ValueError("Need to specify at least one of 'labels' or 'columns'") - def _sort(self, by: List[Column], ascending: Union[bool, List[bool]], - inplace: bool, na_position: str): + def _sort( + self, by: List[Column], ascending: Union[bool, List[bool]], inplace: bool, na_position: str + ): if isinstance(ascending, bool): ascending = [ascending] * len(by) if len(ascending) != len(by): - raise ValueError('Length of ascending ({}) != length of by ({})' - .format(len(ascending), len(by))) - if na_position not in ('first', 'last'): + raise ValueError( + "Length of ascending ({}) != length of by ({})".format(len(ascending), len(by)) + ) + if na_position not in ("first", "last"): raise ValueError("invalid na_position: '{}'".format(na_position)) # Mapper: Get a spark column function for (ascending, na_position) combination # Note that 'asc_nulls_first' and friends were added as of Spark 2.4, see SPARK-23847. mapper = { - (True, 'first'): lambda x: Column(getattr(x._jc, "asc_nulls_first")()), - (True, 'last'): lambda x: Column(getattr(x._jc, "asc_nulls_last")()), - (False, 'first'): lambda x: Column(getattr(x._jc, "desc_nulls_first")()), - (False, 'last'): lambda x: Column(getattr(x._jc, "desc_nulls_last")()), + (True, "first"): lambda x: Column(getattr(x._jc, "asc_nulls_first")()), + (True, "last"): lambda x: Column(getattr(x._jc, "asc_nulls_last")()), + (False, "first"): lambda x: Column(getattr(x._jc, "desc_nulls_first")()), + (False, "last"): lambda x: Column(getattr(x._jc, "desc_nulls_last")()), } by = [mapper[(asc, na_position)](scol) for scol, asc in zip(by, ascending)] sdf = self._sdf.sort(*(by + [NATURAL_ORDER_COLUMN_NAME])) @@ -5664,9 +5938,13 @@ def _sort(self, by: List[Column], ascending: Union[bool, List[bool]], else: return kdf - def sort_values(self, by: Union[str, List[str], Tuple[str, ...], List[Tuple[str, ...]]], - ascending: Union[bool, List[bool]] = True, - inplace: bool = False, na_position: str = 'last') -> Optional['DataFrame']: + def sort_values( + self, + by: Union[str, List[str], Tuple[str, ...], List[Tuple[str, ...]]], + ascending: Union[bool, List[bool]] = True, + inplace: bool = False, + na_position: str = "last", + ) -> Optional["DataFrame"]: """ Sort by the values along either axis. 
@@ -5751,16 +6029,21 @@ def sort_values(self, by: Union[str, List[str], Tuple[str, ...], List[Tuple[str, if not isinstance(ser, ks.Series): raise ValueError( "The column %s is not unique. For a multi-index, the label must be a tuple " - "with elements corresponding to each level." % name_like_string(colname)) + "with elements corresponding to each level." % name_like_string(colname) + ) new_by.append(ser._scol) - return self._sort(by=new_by, ascending=ascending, - inplace=inplace, na_position=na_position) + return self._sort(by=new_by, ascending=ascending, inplace=inplace, na_position=na_position) - def sort_index(self, axis: int = 0, - level: Optional[Union[int, List[int]]] = None, ascending: bool = True, - inplace: bool = False, kind: str = None, na_position: str = 'last') \ - -> Optional['DataFrame']: + def sort_index( + self, + axis: int = 0, + level: Optional[Union[int, List[int]]] = None, + ascending: bool = True, + inplace: bool = False, + kind: str = None, + na_position: str = "last", + ) -> Optional["DataFrame"]: """ Sort object by labels (along an axis) @@ -5843,7 +6126,8 @@ def sort_index(self, axis: int = 0, raise NotImplementedError("No other axis than 0 are supported at the moment") if kind is not None: raise NotImplementedError( - "Specifying the sorting algorithm is not supported at the moment.") + "Specifying the sorting algorithm is not supported at the moment." + ) if level is None or (is_list_like(level) and len(level) == 0): # type: ignore by = self._internal.index_scols @@ -5852,11 +6136,10 @@ def sort_index(self, axis: int = 0, else: by = [self._internal.index_scols[level]] - return self._sort(by=by, ascending=ascending, - inplace=inplace, na_position=na_position) + return self._sort(by=by, ascending=ascending, inplace=inplace, na_position=na_position) # TODO: add keep = First - def nlargest(self, n: int, columns: 'Any') -> 'DataFrame': + def nlargest(self, n: int, columns: "Any") -> "DataFrame": """ Return the first `n` rows ordered by `columns` in descending order. @@ -5932,7 +6215,7 @@ def nlargest(self, n: int, columns: 'Any') -> 'DataFrame': return kdf.head(n=n) # TODO: add keep = First - def nsmallest(self, n: int, columns: 'Any') -> 'DataFrame': + def nsmallest(self, n: int, columns: "Any") -> "DataFrame": """ Return the first `n` rows ordered by `columns` in ascending order. 
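As the `nlargest`/`nsmallest` hunks suggest, both are thin wrappers over `sort_values` followed by `head(n)`. A rough, self-contained sketch of that equivalence (illustrative only, not the literal implementation):

```python
import databricks.koalas as ks

kdf = ks.DataFrame({"x": [3, 1, 2], "y": [30, 10, 20]})

# nlargest(n, columns) ~ descending sort followed by head(n)
top2 = kdf.sort_values(by="x", ascending=False).head(2)

# nsmallest(n, columns) ~ ascending sort followed by head(n)
bottom2 = kdf.sort_values(by="x", ascending=True).head(2)
```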
@@ -6046,23 +6329,29 @@ def isin(self, values): if isinstance(values, dict) and not set(values.keys()).issubset(self.columns): raise AttributeError( "'DataFrame' object has no attribute %s" - % (set(values.keys()).difference(self.columns))) + % (set(values.keys()).difference(self.columns)) + ) column_scols = [] if isinstance(values, dict): for i, col in enumerate(self.columns): if col in values: - column_scols.append(self._internal.scol_for(self._internal.column_labels[i]) - .isin(values[col]).alias(self._internal.data_columns[i])) + column_scols.append( + self._internal.scol_for(self._internal.column_labels[i]) + .isin(values[col]) + .alias(self._internal.data_columns[i]) + ) else: column_scols.append(F.lit(False).alias(self._internal.data_columns[i])) elif is_list_like(values): column_scols += [ - self._internal.scol_for(label).isin(list(values)) - .alias(self._internal.column_name_for(label)) - for label in self._internal.column_labels] + self._internal.scol_for(label) + .isin(list(values)) + .alias(self._internal.column_name_for(label)) + for label in self._internal.column_labels + ] else: - raise TypeError('Values should be iterable, Series, DataFrame or dict.') + raise TypeError("Values should be iterable, Series, DataFrame or dict.") return DataFrame(self._internal.with_new_columns(column_scols)) @@ -6084,12 +6373,17 @@ def shape(self): """ return len(self), len(self.columns) - def merge(self, right: 'DataFrame', how: str = 'inner', - on: Union[str, List[str], Tuple[str, ...], List[Tuple[str, ...]]] = None, - left_on: Union[str, List[str], Tuple[str, ...], List[Tuple[str, ...]]] = None, - right_on: Union[str, List[str], Tuple[str, ...], List[Tuple[str, ...]]] = None, - left_index: bool = False, right_index: bool = False, - suffixes: Tuple[str, str] = ('_x', '_y')) -> 'DataFrame': + def merge( + self, + right: "DataFrame", + how: str = "inner", + on: Union[str, List[str], Tuple[str, ...], List[Tuple[str, ...]]] = None, + left_on: Union[str, List[str], Tuple[str, ...], List[Tuple[str, ...]]] = None, + right_on: Union[str, List[str], Tuple[str, ...], List[Tuple[str, ...]]] = None, + left_index: bool = False, + right_index: bool = False, + suffixes: Tuple[str, str] = ("_x", "_y"), + ) -> "DataFrame": """ Merge DataFrame objects with a database-style join. @@ -6199,19 +6493,28 @@ def merge(self, right: 'DataFrame', how: str = 'inner', As described in #263, joining string columns currently returns None for missing values instead of NaN. """ - _to_list = lambda os: (os if os is None - else [os] if isinstance(os, tuple) - else [(os,)] if isinstance(os, str) - else [o if isinstance(o, tuple) else (o,) # type: ignore - for o in os]) + _to_list = lambda os: ( + os + if os is None + else [os] + if isinstance(os, tuple) + else [(os,)] + if isinstance(os, str) + else [ + o if isinstance(o, tuple) else (o,) # type: ignore + for o in os + ] + ) if isinstance(right, ks.Series): right = right.to_frame() if on: if left_on or right_on: - raise ValueError('Can only pass argument "on" OR "left_on" and "right_on", ' - 'not a combination of both.') + raise ValueError( + 'Can only pass argument "on" OR "left_on" and "right_on", ' + "not a combination of both." 
+ ) left_keys = _to_list(on) right_keys = _to_list(on) else: @@ -6226,32 +6529,38 @@ def merge(self, right: 'DataFrame', how: str = 'inner', right_keys = _to_list(right_on) if left_keys and not right_keys: - raise ValueError('Must pass right_on or right_index=True') + raise ValueError("Must pass right_on or right_index=True") if right_keys and not left_keys: - raise ValueError('Must pass left_on or left_index=True') + raise ValueError("Must pass left_on or left_index=True") if not left_keys and not right_keys: common = list(self.columns.intersection(right.columns)) if len(common) == 0: raise ValueError( - 'No common columns to perform merge on. Merge options: ' - 'left_on=None, right_on=None, left_index=False, right_index=False') + "No common columns to perform merge on. Merge options: " + "left_on=None, right_on=None, left_index=False, right_index=False" + ) left_keys = _to_list(common) right_keys = _to_list(common) if len(left_keys) != len(right_keys): # type: ignore - raise ValueError('len(left_keys) must equal len(right_keys)') + raise ValueError("len(left_keys) must equal len(right_keys)") - if how == 'full': - warnings.warn("Warning: While Koalas will accept 'full', you should use 'outer' " + - "instead to be compatible with the pandas merge API", UserWarning) - if how == 'outer': + if how == "full": + warnings.warn( + "Warning: While Koalas will accept 'full', you should use 'outer' " + + "instead to be compatible with the pandas merge API", + UserWarning, + ) + if how == "outer": # 'outer' in pandas equals 'full' in Spark - how = 'full' - if how not in ('inner', 'left', 'right', 'full'): - raise ValueError("The 'how' parameter has to be amongst the following values: ", - "['inner', 'left', 'right', 'outer']") + how = "full" + if how not in ("inner", "left", "right", "full"): + raise ValueError( + "The 'how' parameter has to be amongst the following values: ", + "['inner', 'left', 'right', 'outer']", + ) - left_table = self._sdf.alias('left_table') - right_table = right._sdf.alias('right_table') + left_table = self._sdf.alias("left_table") + right_table = right._sdf.alias("right_table") left_scol_for = lambda label: scol_for(left_table, self._internal.column_name_for(label)) right_scol_for = lambda label: scol_for(right_table, right._internal.column_name_for(label)) @@ -6259,9 +6568,10 @@ def merge(self, right: 'DataFrame', how: str = 'inner', left_key_columns = [left_scol_for(label) for label in left_keys] # type: ignore right_key_columns = [right_scol_for(label) for label in right_keys] # type: ignore - join_condition = reduce(lambda x, y: x & y, - [lkey == rkey for lkey, rkey - in zip(left_key_columns, right_key_columns)]) + join_condition = reduce( + lambda x, y: x & y, + [lkey == rkey for lkey, rkey in zip(left_key_columns, right_key_columns)], + ) joined_table = left_table.join(right_table, join_condition, how=how) @@ -6270,8 +6580,7 @@ def merge(self, right: 'DataFrame', how: str = 'inner', right_suffix = suffixes[1] # Append suffixes to columns with the same name to avoid conflicts later - duplicate_columns = (set(self._internal.column_labels) - & set(right._internal.column_labels)) + duplicate_columns = set(self._internal.column_labels) & set(right._internal.column_labels) exprs = [] data_columns = [] @@ -6282,9 +6591,9 @@ def merge(self, right: 'DataFrame', how: str = 'inner', if label in duplicate_columns: if label in left_keys and label in right_keys: # type: ignore right_scol = right_scol_for(label) - if how == 'right': + if how == "right": scol = right_scol - elif how 
== 'full': + elif how == "full": scol = F.when(scol.isNotNull(), scol).otherwise(right_scol).alias(col) else: pass @@ -6315,17 +6624,17 @@ def merge(self, right: 'DataFrame', how: str = 'inner', # Retain indices if they are used for joining if left_index: if right_index: - if how in ('inner', 'left'): + if how in ("inner", "left"): exprs.extend(left_index_scols) index_map = self._internal.index_map - elif how == 'right': + elif how == "right": exprs.extend(right_index_scols) index_map = right._internal.index_map else: index_map = [] - for (col, name), left_scol, right_scol in zip(self._internal.index_map, - left_index_scols, - right_index_scols): + for (col, name), left_scol, right_scol in zip( + self._internal.index_map, left_index_scols, right_index_scols + ): scol = F.when(left_scol.isNotNull(), left_scol).otherwise(right_scol) exprs.append(scol.alias(col)) index_map.append((col, name)) @@ -6340,16 +6649,22 @@ def merge(self, right: 'DataFrame', how: str = 'inner', selected_columns = joined_table.select(*exprs) - internal = _InternalFrame(sdf=selected_columns, - index_map=index_map if index_map else None, - column_labels=column_labels, - column_scols=[scol_for(selected_columns, col) - for col in data_columns]) + internal = _InternalFrame( + sdf=selected_columns, + index_map=index_map if index_map else None, + column_labels=column_labels, + column_scols=[scol_for(selected_columns, col) for col in data_columns], + ) return DataFrame(internal) - def join(self, right: 'DataFrame', - on: Optional[Union[str, List[str], Tuple[str, ...], List[Tuple[str, ...]]]] = None, - how: str = 'left', lsuffix: str = '', rsuffix: str = '') -> 'DataFrame': + def join( + self, + right: "DataFrame", + on: Optional[Union[str, List[str], Tuple[str, ...], List[Tuple[str, ...]]]] = None, + how: str = "left", + lsuffix: str = "", + rsuffix: str = "", + ) -> "DataFrame": """ Join columns of another DataFrame. @@ -6448,19 +6763,26 @@ def join(self, right: 'DataFrame', common = list(self.columns.intersection(right.columns)) if len(common) > 0 and not lsuffix and not rsuffix: raise ValueError( - "columns overlap but no suffix specified: " - "{rename}".format(rename=common)) + "columns overlap but no suffix specified: " "{rename}".format(rename=common) + ) if on: self = self.set_index(on) - join_kdf = self.merge(right, left_index=True, right_index=True, how=how, - suffixes=(lsuffix, rsuffix)).reset_index() + join_kdf = self.merge( + right, left_index=True, right_index=True, how=how, suffixes=(lsuffix, rsuffix) + ).reset_index() else: - join_kdf = self.merge(right, left_index=True, right_index=True, how=how, - suffixes=(lsuffix, rsuffix)) + join_kdf = self.merge( + right, left_index=True, right_index=True, how=how, suffixes=(lsuffix, rsuffix) + ) return join_kdf - def append(self, other: 'DataFrame', ignore_index: bool = False, - verify_integrity: bool = False, sort: bool = False) -> 'DataFrame': + def append( + self, + other: "DataFrame", + ignore_index: bool = False, + verify_integrity: bool = False, + sort: bool = False, + ) -> "DataFrame": """ Append rows of other to the end of caller, returning a new object. 
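The `join` hunk above delegates to `merge` on the indexes: when `on` is given, the caller's join key is first moved into the index, and the index is reset afterwards. A hedged sketch of that path with toy column names (not the literal code):

```python
import databricks.koalas as ks

left = ks.DataFrame({"k": ["a", "b", "c"], "v1": [1, 2, 3]})
right = ks.DataFrame({"k": ["a", "b", "d"], "v2": [10, 20, 40]}).set_index("k")

# join(right, on="k", how="left", lsuffix="_l", rsuffix="_r") is roughly:
joined = (
    left.set_index("k")
    .merge(right, left_index=True, right_index=True, how="left", suffixes=("_l", "_r"))
    .reset_index()
)
```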
@@ -6513,17 +6835,20 @@ def append(self, other: 'DataFrame', ignore_index: bool = False, raise ValueError("Both DataFrames have to have the same number of index levels") if verify_integrity and len(index_scols) > 0: - if (self._sdf.select(index_scols) - .intersect(other._sdf.select(other._internal.index_scols)) - .count()) > 0: + if ( + self._sdf.select(index_scols) + .intersect(other._sdf.select(other._internal.index_scols)) + .count() + ) > 0: raise ValueError("Indices have overlapping values") # Lazy import to avoid circular dependency issues from databricks.koalas.namespace import concat + return concat([self, other], ignore_index=ignore_index) # TODO: add 'filter_func' and 'errors' parameter - def update(self, other: 'DataFrame', join: str = 'left', overwrite: bool = True): + def update(self, other: "DataFrame", join: str = "left", overwrite: bool = True): """ Modify in place using non-NA values from another DataFrame. Aligns on indices. There is no return value. @@ -6592,34 +6917,42 @@ def update(self, other: 'DataFrame', join: str = 'left', overwrite: bool = True) 1 2 500.0 2 3 6.0 """ - if join != 'left': + if join != "left": raise NotImplementedError("Only left join is supported") if isinstance(other, ks.Series): other = DataFrame(other) - update_columns = list(set(self._internal.column_labels) - .intersection(set(other._internal.column_labels))) - update_sdf = self.join(other[update_columns], rsuffix='_new')._sdf + update_columns = list( + set(self._internal.column_labels).intersection(set(other._internal.column_labels)) + ) + update_sdf = self.join(other[update_columns], rsuffix="_new")._sdf for column_labels in update_columns: column_name = self._internal.column_name_for(column_labels) old_col = scol_for(update_sdf, column_name) - new_col = scol_for(update_sdf, other._internal.column_name_for(column_labels) + '_new') + new_col = scol_for(update_sdf, other._internal.column_name_for(column_labels) + "_new") if overwrite: - update_sdf = update_sdf.withColumn(column_name, F.when(new_col.isNull(), old_col) - .otherwise(new_col)) + update_sdf = update_sdf.withColumn( + column_name, F.when(new_col.isNull(), old_col).otherwise(new_col) + ) else: - update_sdf = update_sdf.withColumn(column_name, F.when(old_col.isNull(), new_col) - .otherwise(old_col)) - sdf = update_sdf.select([scol_for(update_sdf, col) - for col in self._internal.columns] + - list(HIDDEN_COLUMNS)) + update_sdf = update_sdf.withColumn( + column_name, F.when(old_col.isNull(), new_col).otherwise(old_col) + ) + sdf = update_sdf.select( + [scol_for(update_sdf, col) for col in self._internal.columns] + list(HIDDEN_COLUMNS) + ) internal = self._internal.with_new_sdf(sdf) self._internal = internal - def sample(self, n: Optional[int] = None, frac: Optional[float] = None, replace: bool = False, - random_state: Optional[int] = None) -> 'DataFrame': + def sample( + self, + n: Optional[int] = None, + frac: Optional[float] = None, + replace: bool = False, + random_state: Optional[int] = None, + ) -> "DataFrame": """ Return a random sample of items from an axis of object. @@ -6689,8 +7022,10 @@ def sample(self, n: Optional[int] = None, frac: Optional[float] = None, replace: # Note: we don't run any of the doctests because the result can change depending on the # system's core count. if n is not None: - raise NotImplementedError("Function sample currently does not support specifying " - "exact number of items to return. 
Use frac instead.") + raise NotImplementedError( + "Function sample currently does not support specifying " + "exact number of items to return. Use frac instead." + ) if frac is None: raise ValueError("frac must be specified.") @@ -6698,7 +7033,7 @@ def sample(self, n: Optional[int] = None, frac: Optional[float] = None, replace: sdf = self._sdf.sample(withReplacement=replace, fraction=frac, seed=random_state) return DataFrame(self._internal.with_new_sdf(sdf)) - def astype(self, dtype) -> 'DataFrame': + def astype(self, dtype) -> "DataFrame": """ Cast a Koalas object to a specified dtype ``dtype``. @@ -6756,8 +7091,10 @@ def astype(self, dtype) -> 'DataFrame': if is_dict_like(dtype): for col_name in dtype.keys(): if col_name not in self.columns: - raise KeyError('Only a column name can be used for the ' - 'key in a dtype mappings argument.') + raise KeyError( + "Only a column name can be used for the " + "key in a dtype mappings argument." + ) for col_name, col in self.items(): if col_name in dtype: applied.append(col.astype(dtype=dtype[col_name])) @@ -6810,7 +7147,8 @@ def add_prefix(self, prefix): """ assert isinstance(prefix, str) return self._apply_series_op( - lambda kser: kser.rename(tuple([prefix + i for i in kser._internal.column_labels[0]]))) + lambda kser: kser.rename(tuple([prefix + i for i in kser._internal.column_labels[0]])) + ) def add_suffix(self, suffix): """ @@ -6854,10 +7192,11 @@ def add_suffix(self, suffix): """ assert isinstance(suffix, str) return self._apply_series_op( - lambda kser: kser.rename(tuple([i + suffix for i in kser._internal.column_labels[0]]))) + lambda kser: kser.rename(tuple([i + suffix for i in kser._internal.column_labels[0]])) + ) # TODO: include, and exclude should be implemented. - def describe(self, percentiles: Optional[List[float]] = None) -> 'DataFrame': + def describe(self, percentiles: Optional[List[float]] = None) -> "DataFrame": """ Generate descriptive statistics that summarize the central tendency, dispersion and shape of a dataset's distribution, excluding @@ -7004,8 +7343,9 @@ def describe(self, percentiles: Optional[List[float]] = None) -> 'DataFrame': scol = self._internal.scol_for(label) spark_type = self._internal.spark_type_for(label) if isinstance(spark_type, DoubleType) or isinstance(spark_type, FloatType): - exprs.append(F.nanvl(scol, F.lit(None)) - .alias(self._internal.column_name_for(label))) + exprs.append( + F.nanvl(scol, F.lit(None)).alias(self._internal.column_name_for(label)) + ) column_labels.append(label) elif isinstance(spark_type, NumericType): exprs.append(scol) @@ -7026,14 +7366,17 @@ def describe(self, percentiles: Optional[List[float]] = None) -> 'DataFrame': stats = ["count", "mean", "stddev", "min", *formatted_perc, "max"] sdf = self._sdf.select(*exprs).summary(stats) - sdf = sdf.replace("stddev", "std", subset='summary') + sdf = sdf.replace("stddev", "std", subset="summary") - internal = _InternalFrame(sdf=sdf, - index_map=[('summary', None)], - column_labels=column_labels, - column_scols=[scol_for(sdf, self._internal.column_name_for(label)) - for label in column_labels]) - return DataFrame(internal).astype('float64') + internal = _InternalFrame( + sdf=sdf, + index_map=[("summary", None)], + column_labels=column_labels, + column_scols=[ + scol_for(sdf, self._internal.column_name_for(label)) for label in column_labels + ], + ) + return DataFrame(internal).astype("float64") # TODO: implements 'keep' parameters def drop_duplicates(self, subset=None, inplace=False): @@ -7093,17 +7436,24 @@ def 
drop_duplicates(self, subset=None, inplace=False): else: subset = [sub if isinstance(sub, tuple) else (sub,) for sub in subset] - sdf = self._sdf.drop(*HIDDEN_COLUMNS) \ - .drop_duplicates(subset=[self._internal.column_name_for(label) for label in subset]) + sdf = self._sdf.drop(*HIDDEN_COLUMNS).drop_duplicates( + subset=[self._internal.column_name_for(label) for label in subset] + ) internal = self._internal.with_new_sdf(sdf) if inplace: self._internal = internal else: return DataFrame(internal) - def reindex(self, labels: Optional[Any] = None, index: Optional[Any] = None, - columns: Optional[Any] = None, axis: Optional[Union[int, str]] = None, - copy: Optional[bool] = True, fill_value: Optional[Any] = None) -> 'DataFrame': + def reindex( + self, + labels: Optional[Any] = None, + index: Optional[Any] = None, + columns: Optional[Any] = None, + axis: Optional[Union[int, str]] = None, + copy: Optional[bool] = True, + fill_value: Optional[Any] = None, + ) -> "DataFrame": """ Conform DataFrame to new index with optional filling logic, placing NA/NaN in locations having no value in the previous index. A new object @@ -7258,12 +7608,16 @@ def reindex(self, labels: Optional[Any] = None, index: Optional[Any] = None, raise ValueError("No axis named %s for object type %s." % (axis, type(axis))) if index is not None and not is_list_like(index): - raise TypeError("Index must be called with a collection of some kind, " - "%s was passed" % type(index)) + raise TypeError( + "Index must be called with a collection of some kind, " + "%s was passed" % type(index) + ) if columns is not None and not is_list_like(columns): - raise TypeError("Columns must be called with a collection of some kind, " - "%s was passed" % type(columns)) + raise TypeError( + "Columns must be called with a collection of some kind, " + "%s was passed" % type(columns) + ) df = self.copy() @@ -7293,8 +7647,9 @@ def _reindex_index(self, index): kser = ks.Series(list(index)) labels = kser._internal._sdf.select(kser._scol.alias(index_column)) - joined_df = self._sdf.drop(NATURAL_ORDER_COLUMN_NAME) \ - .join(labels, on=index_column, how="right") + joined_df = self._sdf.drop(NATURAL_ORDER_COLUMN_NAME).join( + labels, on=index_column, how="right" + ) internal = self._internal.with_new_sdf(joined_df) return internal @@ -7305,13 +7660,14 @@ def _reindex_columns(self, columns): label_columns = list(columns) for col in label_columns: if not isinstance(col, tuple): - raise TypeError('Expected tuple, got {}'.format(type(col))) + raise TypeError("Expected tuple, got {}".format(type(col))) else: label_columns = [(col,) for col in columns] for col in label_columns: if len(col) != level: - raise ValueError("shape (1,{}) doesn't match the shape (1,{})" - .format(len(col), level)) + raise ValueError( + "shape (1,{}) doesn't match the shape (1,{})".format(len(col), level) + ) scols, labels = [], [] for label in label_columns: if label in self._internal.column_labels: @@ -7322,8 +7678,7 @@ def _reindex_columns(self, columns): return self._internal.with_new_columns(scols, column_labels=labels) - def melt(self, id_vars=None, value_vars=None, var_name=None, - value_name='value'): + def melt(self, id_vars=None, value_vars=None, var_name=None, value_name="value"): """ Unpivot a DataFrame from wide format to long format, optionally leaving identifier variables set. 
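The `_reindex_index` hunk above implements row reindexing as a right join: the requested labels are materialized as a one-column frame and the existing data is right-joined onto it, so labels missing from the original frame come back as null rows. A minimal PySpark sketch of that idea (the `__index_level_0__` name stands in for Koalas' internal index column):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

data = spark.createDataFrame([(1, "a"), (2, "b")], ["__index_level_0__", "val"])
labels = spark.createDataFrame([(1,), (2,), (4,)], ["__index_level_0__"])

# Right join onto the new labels: index 4 shows up with val = null.
reindexed = data.join(labels, on="__index_level_0__", how="right")
reindexed.show()
```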
@@ -7422,26 +7777,29 @@ def melt(self, id_vars=None, value_vars=None, var_name=None, id_vars = [(id_vars,)] elif isinstance(id_vars, tuple): if self._internal.column_labels_level == 1: - id_vars = [idv if isinstance(idv, tuple) else (idv,) - for idv in id_vars] + id_vars = [idv if isinstance(idv, tuple) else (idv,) for idv in id_vars] else: - raise ValueError('id_vars must be a list of tuples' - ' when columns are a MultiIndex') + raise ValueError( + "id_vars must be a list of tuples" " when columns are a MultiIndex" + ) else: - id_vars = [idv if isinstance(idv, tuple) else (idv,) - for idv in id_vars] + id_vars = [idv if isinstance(idv, tuple) else (idv,) for idv in id_vars] non_existence_col = [idv for idv in id_vars if idv not in column_labels] if len(non_existence_col) != 0: raveled_column_labels = np.ravel(column_labels) - missing = [nec for nec in np.ravel(non_existence_col) - if nec not in raveled_column_labels] + missing = [ + nec for nec in np.ravel(non_existence_col) if nec not in raveled_column_labels + ] if len(missing) != 0: - raise KeyError("The following 'id_vars' are not present" - " in the DataFrame: {}".format(missing)) + raise KeyError( + "The following 'id_vars' are not present" + " in the DataFrame: {}".format(missing) + ) else: - raise KeyError("None of {} are in the {}" - .format(non_existence_col, column_labels)) + raise KeyError( + "None of {} are in the {}".format(non_existence_col, column_labels) + ) if value_vars is None: value_vars = [] @@ -7450,26 +7808,31 @@ def melt(self, id_vars=None, value_vars=None, var_name=None, value_vars = [(value_vars,)] elif isinstance(value_vars, tuple): if self._internal.column_labels_level == 1: - value_vars = [valv if isinstance(valv, tuple) else (valv,) - for valv in value_vars] + value_vars = [ + valv if isinstance(valv, tuple) else (valv,) for valv in value_vars + ] else: - raise ValueError('value_vars must be a list of tuples' - ' when columns are a MultiIndex') + raise ValueError( + "value_vars must be a list of tuples" " when columns are a MultiIndex" + ) else: - value_vars = [valv if isinstance(valv, tuple) else (valv,) - for valv in value_vars] + value_vars = [valv if isinstance(valv, tuple) else (valv,) for valv in value_vars] non_existence_col = [valv for valv in value_vars if valv not in column_labels] if len(non_existence_col) != 0: raveled_column_labels = np.ravel(column_labels) - missing = [nec for nec in np.ravel(non_existence_col) - if nec not in raveled_column_labels] + missing = [ + nec for nec in np.ravel(non_existence_col) if nec not in raveled_column_labels + ] if len(missing) != 0: - raise KeyError("The following 'value_vars' are not present" - " in the DataFrame: {}".format(missing)) + raise KeyError( + "The following 'value_vars' are not present" + " in the DataFrame: {}".format(missing) + ) else: - raise KeyError("None of {} are in the {}" - .format(non_existence_col, column_labels)) + raise KeyError( + "None of {} are in the {}".format(non_existence_col, column_labels) + ) if len(value_vars) == 0: value_vars = column_labels @@ -7482,24 +7845,34 @@ def melt(self, id_vars=None, value_vars=None, var_name=None, if self._internal.column_label_names is not None: var_name = self._internal.column_label_names elif self._internal.column_labels_level == 1: - var_name = ['variable'] + var_name = ["variable"] else: - var_name = ['variable_{}'.format(i) - for i in range(self._internal.column_labels_level)] + var_name = [ + "variable_{}".format(i) for i in range(self._internal.column_labels_level) + ] elif 
isinstance(var_name, str): var_name = [var_name] - pairs = F.explode(F.array(*[ - F.struct(*( - [F.lit(c).alias(name) for c, name in zip(label, var_name)] + - [self._internal.scol_for(label).alias(value_name)]) - ) for label in column_labels if label in value_vars])) - - columns = ([self._internal.scol_for(label).alias(name_like_string(label)) - for label in id_vars] + - [F.col("pairs.%s" % name) - for name in var_name[:self._internal.column_labels_level]] + - [F.col("pairs.%s" % value_name)]) + pairs = F.explode( + F.array( + *[ + F.struct( + *( + [F.lit(c).alias(name) for c, name in zip(label, var_name)] + + [self._internal.scol_for(label).alias(value_name)] + ) + ) + for label in column_labels + if label in value_vars + ] + ) + ) + + columns = ( + [self._internal.scol_for(label).alias(name_like_string(label)) for label in id_vars] + + [F.col("pairs.%s" % name) for name in var_name[: self._internal.column_labels_level]] + + [F.col("pairs.%s" % value_name)] + ) exploded_df = sdf.withColumn("pairs", pairs).select(columns) return DataFrame(exploded_df) @@ -7556,7 +7929,7 @@ def all(self, axis: Union[int, str] = 0) -> bool: column_labels = self._internal.column_labels for label in column_labels: col = self[label]._scol - all_col = F.min(F.coalesce(col.cast('boolean'), F.lit(True))) + all_col = F.min(F.coalesce(col.cast("boolean"), F.lit(True))) applied.append(F.when(all_col.isNull(), True).otherwise(all_col)) # TODO: there is a similar logic to transpose in, for instance, @@ -7565,25 +7938,32 @@ def all(self, axis: Union[int, str] = 0) -> bool: value_column = "value" cols = [] for label, applied_col in zip(column_labels, applied): - cols.append(F.struct( - [F.lit(col).alias(SPARK_INDEX_NAME_FORMAT(i)) for i, col in enumerate(label)] + - [applied_col.alias(value_column)])) + cols.append( + F.struct( + [F.lit(col).alias(SPARK_INDEX_NAME_FORMAT(i)) for i, col in enumerate(label)] + + [applied_col.alias(value_column)] + ) + ) - sdf = sdf.select( - F.array(*cols).alias("arrays") - ).select(F.explode(F.col("arrays"))) + sdf = sdf.select(F.array(*cols).alias("arrays")).select(F.explode(F.col("arrays"))) sdf = sdf.selectExpr("col.*") - index_column_name = lambda i: (None if self._internal.column_label_names is None - else (self._internal.column_label_names[i],)) + index_column_name = lambda i: ( + None + if self._internal.column_label_names is None + else (self._internal.column_label_names[i],) + ) internal = self._internal.copy( sdf=sdf, - index_map=[(SPARK_INDEX_NAME_FORMAT(i), index_column_name(i)) - for i in range(self._internal.column_labels_level)], + index_map=[ + (SPARK_INDEX_NAME_FORMAT(i), index_column_name(i)) + for i in range(self._internal.column_labels_level) + ], column_labels=None, column_scols=[scol_for(sdf, value_column)], - column_label_names=None) + column_label_names=None, + ) return DataFrame(internal)[value_column].rename("all") @@ -7639,7 +8019,7 @@ def any(self, axis: Union[int, str] = 0) -> bool: column_labels = self._internal.column_labels for label in column_labels: col = self[label]._scol - all_col = F.max(F.coalesce(col.cast('boolean'), F.lit(False))) + all_col = F.max(F.coalesce(col.cast("boolean"), F.lit(False))) applied.append(F.when(all_col.isNull(), False).otherwise(all_col)) # TODO: there is a similar logic to transpose in, for instance, @@ -7648,30 +8028,37 @@ def any(self, axis: Union[int, str] = 0) -> bool: value_column = "value" cols = [] for label, applied_col in zip(column_labels, applied): - cols.append(F.struct( - 
[F.lit(col).alias(SPARK_INDEX_NAME_FORMAT(i)) for i, col in enumerate(label)] + - [applied_col.alias(value_column)])) + cols.append( + F.struct( + [F.lit(col).alias(SPARK_INDEX_NAME_FORMAT(i)) for i, col in enumerate(label)] + + [applied_col.alias(value_column)] + ) + ) - sdf = sdf.select( - F.array(*cols).alias("arrays") - ).select(F.explode(F.col("arrays"))) + sdf = sdf.select(F.array(*cols).alias("arrays")).select(F.explode(F.col("arrays"))) sdf = sdf.selectExpr("col.*") - index_column_name = lambda i: (None if self._internal.column_label_names is None - else (self._internal.column_label_names[i],)) + index_column_name = lambda i: ( + None + if self._internal.column_label_names is None + else (self._internal.column_label_names[i],) + ) internal = self._internal.copy( sdf=sdf, - index_map=[(SPARK_INDEX_NAME_FORMAT(i), index_column_name(i)) - for i in range(self._internal.column_labels_level)], + index_map=[ + (SPARK_INDEX_NAME_FORMAT(i), index_column_name(i)) + for i in range(self._internal.column_labels_level) + ], column_labels=None, column_scols=[scol_for(sdf, value_column)], - column_label_names=None) + column_label_names=None, + ) return DataFrame(internal)[value_column].rename("any") # TODO: add axis, numeric_only, pct, na_option parameter - def rank(self, method='average', ascending=True): + def rank(self, method="average", ascending=True): """ Compute numerical data ranks (1 through n) along axis. Equal values are assigned a rank that is the average of the ranks of those values. @@ -7804,8 +8191,8 @@ def filter(self, items=None, like=None, regex=None, axis=None): if sum(x is not None for x in (items, like, regex)) > 1: raise TypeError( - "Keyword arguments `items`, `like`, or `regex` " - "are mutually exclusive") + "Keyword arguments `items`, `like`, or `regex` " "are mutually exclusive" + ) axis = validate_axis(axis, none_axis=1) @@ -7848,20 +8235,25 @@ def filter(self, items=None, like=None, regex=None, axis=None): elif axis == 1: column_labels = self._internal.column_labels matcher = re.compile(regex) - output_labels = [label for label in column_labels - if any(matcher.search(i) is not None for i in label)] + output_labels = [ + label + for label in column_labels + if any(matcher.search(i) is not None for i in label) + ] return self[output_labels] else: raise TypeError("Must pass either `items`, `like`, or `regex`") - def rename(self, - mapper=None, - index=None, - columns=None, - axis='index', - inplace=False, - level=None, - errors='ignore'): + def rename( + self, + mapper=None, + index=None, + columns=None, + axis="index", + inplace=False, + level=None, + errors="ignore", + ): """ Alter axes labels. 
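The `all`/`any` hunks above rely on a struct/array/explode trick to turn one aggregated value per column into one row per column, i.e. a transpose done on the Spark side. A standalone sketch of the pattern with `all`-style aggregation (column names are illustrative):

```python
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([(True, False), (True, None)], "a boolean, b boolean")

# One struct per column: (column name, aggregated value) ...
cols = [
    F.struct(
        F.lit(name).alias("__index_level_0__"),
        F.min(F.coalesce(F.col(name).cast("boolean"), F.lit(True))).alias("value"),
    )
    for name in sdf.columns
]

# ... collected into an array, exploded into one row per column, then flattened.
result = (
    sdf.select(F.array(*cols).alias("arrays"))
    .select(F.explode(F.col("arrays")))
    .selectExpr("col.*")
)
result.show()  # one row per original column: a -> true, b -> false
```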
@@ -7950,8 +8342,8 @@ def rename(self, def gen_mapper_fn(mapper): if isinstance(mapper, dict): if len(mapper) == 0: - if errors == 'raise': - raise KeyError('Index include label which is not in the `mapper`.') + if errors == "raise": + raise KeyError("Index include label which is not in the `mapper`.") else: return DataFrame(self._internal) @@ -7964,17 +8356,21 @@ def mapper_fn(x): if x in mapper: return mapper[x] else: - if errors == 'raise': - raise KeyError('Index include value which is not in the `mapper`') + if errors == "raise": + raise KeyError("Index include value which is not in the `mapper`") return x + elif callable(mapper): spark_return_type = _infer_return_type(mapper).tpe def mapper_fn(x): return mapper(x) + else: - raise ValueError("`mapper` or `index` or `columns` should be " - "either dict-like or function type.") + raise ValueError( + "`mapper` or `index` or `columns` should be " + "either dict-like or function type." + ) return mapper_fn, spark_return_type index_mapper_fn = None @@ -7989,8 +8385,10 @@ def mapper_fn(x): elif axis == 1: columns_mapper_fn, columns_mapper_ret_stype = gen_mapper_fn(mapper) else: - raise ValueError("argument axis should be either the axis name " - "(‘index’, ‘columns’) or number (0, 1)") + raise ValueError( + "argument axis should be either the axis name " + "(‘index’, ‘columns’) or number (0, 1)" + ) else: if index: index_mapper_fn, index_mapper_ret_stype = gen_mapper_fn(index) @@ -8024,8 +8422,9 @@ def mapper_fn(x): def gen_new_index_column(level): index_col_name = index_columns[level] - index_mapper_udf = pandas_udf(lambda s: s.map(index_mapper_fn), - returnType=index_mapper_ret_stype) + index_mapper_udf = pandas_udf( + lambda s: s.map(index_mapper_fn), returnType=index_mapper_ret_stype + ) return index_mapper_udf(scol_for(internal.sdf, index_col_name)) sdf = internal.sdf @@ -8062,9 +8461,10 @@ def gen_new_column_labels_entry(column_labels_entry): new_data_columns = [col[0] for col in new_column_labels] else: new_data_columns = [str(col) for col in new_column_labels] - new_data_scols = [scol_for(internal.sdf, old_col_name).alias(new_col_name) - for old_col_name, new_col_name - in zip(internal.data_columns, new_data_columns)] + new_data_scols = [ + scol_for(internal.sdf, old_col_name).alias(new_col_name) + for old_col_name, new_col_name in zip(internal.data_columns, new_data_columns) + ] internal = internal.with_new_columns(new_data_scols, column_labels=new_column_labels) if inplace: self._internal = internal @@ -8301,9 +8701,7 @@ def idxmin(self, axis=0): return ks.from_pandas(pdf.idxmin()) - def info( - self, verbose=None, buf=None, max_cols=None, null_counts=None - ): + def info(self, verbose=None, buf=None, max_cols=None, null_counts=None): """ Print a concise summary of a DataFrame. @@ -8398,13 +8796,18 @@ def info( # To avoid pandas' existing config affects Koalas. # TODO: should we have corresponding Koalas configs? with pd.option_context( - 'display.max_info_columns', sys.maxsize, - 'display.max_info_rows', sys.maxsize): + "display.max_info_columns", sys.maxsize, "display.max_info_rows", sys.maxsize + ): try: self._data = self # hack to use pandas' info as is. 
return pd.DataFrame.info( - self, verbose=verbose, buf=buf, max_cols=max_cols, - memory_usage=False, null_counts=null_counts) + self, + verbose=verbose, + buf=buf, + max_cols=max_cols, + memory_usage=False, + null_counts=null_counts, + ) finally: del self._data @@ -8485,9 +8888,11 @@ def quantile(self, q=0.5, axis=0, numeric_only=True, accuracy=10000): percentile_cols = [] for column in self._internal.data_columns: - percentile_cols.append(F.expr( - "approx_percentile(`%s`, array(%s), %s)" % (column, args, accuracy)) - .alias(column)) + percentile_cols.append( + F.expr("approx_percentile(`%s`, array(%s), %s)" % (column, args, accuracy)).alias( + column + ) + ) sdf = sdf.select(percentile_cols) # Here, after select percntile cols, a sdf looks like below: # +---------+---------+ @@ -8505,9 +8910,7 @@ def quantile(self, q=0.5, axis=0, numeric_only=True, accuracy=10000): internal_index_column = SPARK_DEFAULT_INDEX_NAME cols = [] for i, col in enumerate(zip(*cols_dict.values())): - cols.append(F.struct( - F.lit("%s" % quantiles[i]).alias(internal_index_column), - *col)) + cols.append(F.struct(F.lit("%s" % quantiles[i]).alias(internal_index_column), *col)) sdf = sdf.select(F.array(*cols).alias("arrays")) # And then, explode it and manually set the index. @@ -8525,7 +8928,8 @@ def quantile(self, q=0.5, axis=0, numeric_only=True, accuracy=10000): column_scols=[scol_for(sdf, col) for col in self._internal.data_columns], index_map=[(internal_index_column, None)], column_labels=self._internal.column_labels, - column_label_names=None) + column_label_names=None, + ) return DataFrame(internal) if not result_as_series else DataFrame(internal).T[key] @@ -8611,16 +9015,14 @@ def query(self, expr, inplace=False): if isinstance(self.columns, pd.MultiIndex): raise ValueError("Doesn't support for MultiIndex columns") if not isinstance(expr, str): - raise ValueError( - 'expr must be a string to be evaluated, {} given' - .format(type(expr))) + raise ValueError("expr must be a string to be evaluated, {} given".format(type(expr))) inplace = validate_bool_kwarg(inplace, "inplace") data_columns = [label[0] for label in self._internal.column_labels] - sdf = self._sdf.select(self._internal.index_scols - + [scol.alias(col) for scol, col - in zip(self._internal.column_scols, data_columns)]) \ - .filter(expr) + sdf = self._sdf.select( + self._internal.index_scols + + [scol.alias(col) for scol, col in zip(self._internal.column_scols, data_columns)] + ).filter(expr) internal = self._internal.with_new_sdf(sdf, data_columns=data_columns) if inplace: @@ -8679,8 +9081,9 @@ def __repr__(self): if match is not None: nrows = match.group("rows") ncols = match.group("columns") - footer = ("\n\n[Showing only the first {nrows} rows x {ncols} columns]" - .format(nrows=nrows, ncols=ncols)) + footer = "\n\n[Showing only the first {nrows} rows x {ncols} columns]".format( + nrows=nrows, ncols=ncols + ) return REPR_PATTERN.sub(footer, repr_string) return pdf.to_string() @@ -8702,15 +9105,16 @@ def _repr_html_(self): nrows = match.group("rows") ncols = match.group("columns") by = chr(215) - footer = ('\n
<p>Showing only the first {rows} rows {by} {cols} columns</p>\n' - .format(rows=nrows, - by=by, - cols=ncols)) + footer = ( + "\n<p>Showing only the first {rows} rows " + "{by} {cols} columns</p>
\n".format(rows=nrows, by=by, cols=ncols) + ) return REPR_HTML_PATTERN.sub(footer, repr_html) return pdf.to_html(notebook=True, bold_rows=bold_rows) def __getitem__(self, key): from databricks.koalas.series import Series + if key is None: raise KeyError("none key") if isinstance(key, (str, tuple, list)): @@ -8724,8 +9128,9 @@ def __getitem__(self, key): def __setitem__(self, key, value): from databricks.koalas.series import Series - if (isinstance(value, Series) and value._kdf is not self) or \ - (isinstance(value, DataFrame) and value is not self): + if (isinstance(value, Series) and value._kdf is not self) or ( + isinstance(value, DataFrame) and value is not self + ): # Different Series or DataFrames if isinstance(value, Series): value = value.to_frame() @@ -8735,8 +9140,11 @@ def __setitem__(self, key, value): level = self._internal.column_labels_level value.columns = pd.MultiIndex.from_tuples( - [tuple([name_like_string(label)] + ([''] * (level - 1))) - for label in value._internal.column_labels]) + [ + tuple([name_like_string(label)] + ([""] * (level - 1))) + for label in value._internal.column_labels + ] + ) if isinstance(key, str): key = [(key,)] @@ -8746,17 +9154,21 @@ def __setitem__(self, key, value): key = [k if isinstance(k, tuple) else (k,) for k in key] if any(len(label) > level for label in key): - raise KeyError('Key length ({}) exceeds index depth ({})' - .format(max(len(label) for label in key), level)) - key = [tuple(list(label) + ([''] * (level - len(label)))) for label in key] + raise KeyError( + "Key length ({}) exceeds index depth ({})".format( + max(len(label) for label in key), level + ) + ) + key = [tuple(list(label) + ([""] * (level - len(label)))) for label in key] def assign_columns(kdf, this_column_labels, that_column_labels): assert len(key) == len(that_column_labels) # Note that here intentionally uses `zip_longest` that combine # that_columns. - for k, this_label, that_label \ - in zip_longest(key, this_column_labels, that_column_labels): - yield (kdf[that_label], tuple(['that', *k])) + for k, this_label, that_label in zip_longest( + key, this_column_labels, that_column_labels + ): + yield (kdf[that_label], tuple(["that", *k])) if this_label is not None and this_label[1:] != k: yield (kdf[this_label], this_label) @@ -8786,13 +9198,14 @@ def __getattr__(self, key: str) -> Any: return self.loc[:, key] except KeyError: raise AttributeError( - "'%s' object has no attribute '%s'" % (self.__class__.__name__, key)) + "'%s' object has no attribute '%s'" % (self.__class__.__name__, key) + ) def __len__(self): return self._sdf.count() def __dir__(self): - fields = [f for f in self._sdf.schema.fieldNames() if ' ' not in f] + fields = [f for f in self._sdf.schema.fieldNames() if " " not in f] return super(DataFrame, self).__dir__() + fields def __iter__(self): @@ -8801,14 +9214,15 @@ def __iter__(self): # NDArray Compat def __array_ufunc__(self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any): # TODO: is it possible to deduplicate it with '_map_series_op'? 
- if (all(isinstance(inp, DataFrame) for inp in inputs) - and any(inp is not inputs[0] for inp in inputs)): + if all(isinstance(inp, DataFrame) for inp in inputs) and any( + inp is not inputs[0] for inp in inputs + ): # binary only assert len(inputs) == 2 this = inputs[0] that = inputs[1] if this._internal.column_labels_level != that._internal.column_labels_level: - raise ValueError('cannot join with no overlapping index names') + raise ValueError("cannot join with no overlapping index names") # Different DataFrames def apply_op(kdf, this_column_labels, that_column_labels): @@ -8833,11 +9247,13 @@ def apply_op(kdf, this_column_labels, that_column_labels): return DataFrame(internal) if sys.version_info >= (3, 7): + def __class_getitem__(cls, params): # This is a workaround to support variadic generic in DataFrame in Python 3.7. # See https://github.com/python/typing/issues/193 # we always wraps the given type hints by a tuple to mimic the variadic generic. return super(cls, DataFrame).__class_getitem__(Tuple[params]) + elif (3, 5) <= sys.version_info < (3, 7): # This is a workaround to support variadic generic in DataFrame in Python 3.5+ # The implementation is in its metaclass so this flag is needed to distinguish @@ -8864,6 +9280,7 @@ class _CachedDataFrame(DataFrame): Cached Koalas DataFrame, which corresponds to Pandas DataFrame logically, but internally it caches the corresponding Spark DataFrame. """ + def __init__(self, internal): self._cached = internal._sdf.cache() super(_CachedDataFrame, self).__init__(internal) diff --git a/databricks/koalas/generic.py b/databricks/koalas/generic.py index 6e7e519..1509038 100644 --- a/databricks/koalas/generic.py +++ b/databricks/koalas/generic.py @@ -337,7 +337,8 @@ def get_dtype_counts(self): "`get_dtype_counts` has been deprecated and will be " "removed in a future version. For DataFrames use " "`.dtypes.value_counts()", - FutureWarning) + FutureWarning, + ) if not isinstance(self.dtypes, Iterable): dtypes = [self.dtypes] else: @@ -431,8 +432,7 @@ def pipe(self, func, *args, **kwargs): if isinstance(func, tuple): func, target = func if target in kwargs: - raise ValueError('%s is both the pipe target and a keyword ' - 'argument' % target) + raise ValueError("%s is both the pipe target and a keyword " "argument" % target) kwargs[target] = self return func(*args, **kwargs) else: @@ -451,9 +451,19 @@ def to_numpy(self): """ return self.to_pandas().values - def to_csv(self, path=None, sep=',', na_rep='', columns=None, header=True, - quotechar='"', date_format=None, escapechar=None, num_files=None, - **options): + def to_csv( + self, + path=None, + sep=",", + na_rep="", + columns=None, + header=True, + quotechar='"', + date_format=None, + escapechar=None, + num_files=None, + **options + ): r""" Write object to a comma-separated values (csv) file. @@ -548,17 +558,30 @@ def to_csv(self, path=None, sep=',', na_rep='', columns=None, header=True, if path is None: # If path is none, just collect and use pandas's to_csv. kdf_or_ser = self - if (LooseVersion("0.24") > LooseVersion(pd.__version__)) and \ - isinstance(self, ks.Series): + if (LooseVersion("0.24") > LooseVersion(pd.__version__)) and isinstance( + self, ks.Series + ): # 0.23 seems not having 'columns' parameter in Series' to_csv. 
return kdf_or_ser.to_pandas().to_csv( - None, sep=sep, na_rep=na_rep, header=header, - date_format=date_format, index=False) + None, + sep=sep, + na_rep=na_rep, + header=header, + date_format=date_format, + index=False, + ) else: return kdf_or_ser.to_pandas().to_csv( - None, sep=sep, na_rep=na_rep, columns=columns, - header=header, quotechar=quotechar, - date_format=date_format, escapechar=escapechar, index=False) + None, + sep=sep, + na_rep=na_rep, + columns=columns, + header=header, + quotechar=quotechar, + date_format=date_format, + escapechar=escapechar, + index=False, + ) kdf = self if isinstance(self, ks.Series): @@ -574,11 +597,14 @@ def to_csv(self, path=None, sep=',', na_rep='', columns=None, header=True, column_labels = [label if isinstance(label, tuple) else (label,) for label in columns] if header is True and kdf._internal.column_labels_level > 1: - raise ValueError('to_csv only support one-level index column now') + raise ValueError("to_csv only support one-level index column now") elif isinstance(header, list): sdf = kdf._sdf.select( - [self._internal.scol_for(label).alias(new_name) - for (label, new_name) in zip(column_labels, header)]) + [ + self._internal.scol_for(label).alias(new_name) + for (label, new_name) in zip(column_labels, header) + ] + ) header = True else: sdf = kdf._sdf.select([kdf._internal.scol_for(label) for label in column_labels]) @@ -589,12 +615,17 @@ def to_csv(self, path=None, sep=',', na_rep='', columns=None, header=True, builder = sdf.write.mode("overwrite") OptionUtils._set_opts( builder, - path=path, sep=sep, nullValue=na_rep, header=header, - quote=quotechar, dateFormat=date_format, - charToEscapeQuoteEscaping=escapechar) + path=path, + sep=sep, + nullValue=na_rep, + header=header, + quote=quotechar, + dateFormat=date_format, + charToEscapeQuoteEscaping=escapechar, + ) builder.options(**options).format("csv").save(path) - def to_json(self, path=None, compression='uncompressed', num_files=None, **options): + def to_json(self, path=None, compression="uncompressed", num_files=None, **options): """ Convert the object to a JSON string. @@ -663,7 +694,7 @@ def to_json(self, path=None, compression='uncompressed', num_files=None, **optio pdf = pdf.to_frame() # To make the format consistent and readable by `read_json`, convert it to pandas' and # use 'records' orient for now. - return pdf.to_json(orient='records') + return pdf.to_json(orient="records") kdf = self if isinstance(self, ks.Series): @@ -677,10 +708,25 @@ def to_json(self, path=None, compression='uncompressed', num_files=None, **optio OptionUtils._set_opts(builder, compression=compression) builder.options(**options).format("json").save(path) - def to_excel(self, excel_writer, sheet_name="Sheet1", na_rep="", float_format=None, - columns=None, header=True, index=True, index_label=None, startrow=0, - startcol=0, engine=None, merge_cells=True, encoding=None, inf_rep="inf", - verbose=True, freeze_panes=None): + def to_excel( + self, + excel_writer, + sheet_name="Sheet1", + na_rep="", + float_format=None, + columns=None, + header=True, + index=True, + index_label=None, + startrow=0, + startcol=0, + engine=None, + merge_cells=True, + encoding=None, + inf_rep="inf", + verbose=True, + freeze_panes=None, + ): """ Write object to an Excel sheet. 
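On the distributed path, `to_csv` above forwards its keyword arguments to Spark's CSV writer (`sep`, `na_rep` -> `nullValue`, `quotechar` -> `quote`, and so on), while `num_files` controls the number of output part files via a repartition that sits just outside this hunk. A simplified sketch of that write path; the output path and option values are placeholders:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([(1, "x"), (2, None)], ["id", "name"])

(
    sdf.repartition(1)  # num_files=1 would repartition like this before writing
    .write.mode("overwrite")
    .option("sep", ",")
    .option("header", True)
    .option("nullValue", "")
    .format("csv")
    .save("/tmp/koalas_csv_example")  # placeholder output path
)
```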
@@ -787,10 +833,12 @@ def to_excel(self, excel_writer, sheet_name="Sheet1", na_rep="", float_format=No elif isinstance(self, ks.Series): f = pd.Series.to_excel else: - raise TypeError('Constructor expects DataFrame or Series; however, ' - 'got [%s]' % (self,)) + raise TypeError( + "Constructor expects DataFrame or Series; however, " "got [%s]" % (self,) + ) return validate_arguments_and_invoke_function( - kdf._to_internal_pandas(), self.to_excel, f, args) + kdf._to_internal_pandas(), self.to_excel, f, args + ) def mean(self, axis=None, numeric_only=True): """ @@ -834,7 +882,8 @@ def mean(self, axis=None, numeric_only=True): 2.0 """ return self._reduce_for_stat_function( - F.mean, name="mean", numeric_only=numeric_only, axis=axis) + F.mean, name="mean", numeric_only=numeric_only, axis=axis + ) def sum(self, axis=None, numeric_only=True): """ @@ -878,7 +927,8 @@ def sum(self, axis=None, numeric_only=True): 6.0 """ return self._reduce_for_stat_function( - F.sum, name="sum", numeric_only=numeric_only, axis=axis) + F.sum, name="sum", numeric_only=numeric_only, axis=axis + ) def skew(self, axis=None, numeric_only=True): """ @@ -915,7 +965,8 @@ def skew(self, axis=None, numeric_only=True): 0.0 """ return self._reduce_for_stat_function( - F.skewness, name="skew", numeric_only=numeric_only, axis=axis) + F.skewness, name="skew", numeric_only=numeric_only, axis=axis + ) def kurtosis(self, axis=None, numeric_only=True): """ @@ -953,7 +1004,8 @@ def kurtosis(self, axis=None, numeric_only=True): -1.5 """ return self._reduce_for_stat_function( - F.kurtosis, name="kurtosis", numeric_only=numeric_only, axis=axis) + F.kurtosis, name="kurtosis", numeric_only=numeric_only, axis=axis + ) kurt = kurtosis @@ -999,7 +1051,8 @@ def min(self, axis=None, numeric_only=False): 1.0 """ return self._reduce_for_stat_function( - F.min, name="min", numeric_only=numeric_only, axis=axis) + F.min, name="min", numeric_only=numeric_only, axis=axis + ) def max(self, axis=None, numeric_only=False): """ @@ -1043,7 +1096,8 @@ def max(self, axis=None, numeric_only=False): 3.0 """ return self._reduce_for_stat_function( - F.max, name="max", numeric_only=numeric_only, axis=axis) + F.max, name="max", numeric_only=numeric_only, axis=axis + ) def std(self, axis=None, numeric_only=True): """ @@ -1087,7 +1141,8 @@ def std(self, axis=None, numeric_only=True): 1.0 """ return self._reduce_for_stat_function( - F.stddev, name="std", numeric_only=numeric_only, axis=axis) + F.stddev, name="std", numeric_only=numeric_only, axis=axis + ) def var(self, axis=None, numeric_only=True): """ @@ -1131,7 +1186,8 @@ def var(self, axis=None, numeric_only=True): 1.0 """ return self._reduce_for_stat_function( - F.variance, name="var", numeric_only=numeric_only, axis=axis) + F.variance, name="var", numeric_only=numeric_only, axis=axis + ) @property def size(self) -> int: @@ -1191,7 +1247,8 @@ def abs(self): """ # TODO: The first example above should not have "Name: 0". return self._apply_series_op( - lambda kser: kser._with_new_scol(F.abs(kser._scol)).rename(kser.name)) + lambda kser: kser._with_new_scol(F.abs(kser._scol)).rename(kser.name) + ) # TODO: by argument only support the grouping name and as_index only for now. Documentation # should be updated when it's supported. 
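All of the reductions above (`mean`, `sum`, `skew`, `kurtosis`, `min`, `max`, `std`, `var`) funnel through `_reduce_for_stat_function` with a Spark aggregate (`F.mean`, `F.sum`, `F.stddev`, ...). Conceptually, each column-wise reduction is one Spark aggregate expression per column, roughly as in this sketch (not the helper itself):

```python
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([(1.0, 10.0), (2.0, 20.0), (3.0, None)], "a double, b double")

# mean()/std()-style reductions: one aggregate per numeric column.
sdf.select([F.mean(c).alias(c) for c in sdf.columns]).show()
sdf.select([F.stddev(c).alias(c) for c in sdf.columns]).show()
```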
@@ -1284,7 +1341,7 @@ def groupby(self, by, axis=0, as_index: bool = True): else: raise ValueError("Grouper for '{}' not 1-dimensional".format(type(by))) if not len(by): - raise ValueError('No group keys passed!') + raise ValueError("No group keys passed!") axis = validate_axis(axis) if axis != 0: raise NotImplementedError('axis should be either 0 or "index" currently.') @@ -1297,8 +1354,9 @@ def groupby(self, by, axis=0, as_index: bool = True): anchor = df_or_s._kdf col_by = [_resolve_col(anchor, col_or_s) for col_or_s in by] return SeriesGroupBy(col, col_by, as_index=as_index) - raise TypeError('Constructor expects DataFrame or Series; however, ' - 'got [%s]' % (df_or_s,)) + raise TypeError( + "Constructor expects DataFrame or Series; however, " "got [%s]" % (df_or_s,) + ) def bool(self): """ @@ -1339,8 +1397,7 @@ def bool(self): elif isinstance(self, ks.Series): df = self.to_dataframe() else: - raise TypeError('bool() expects DataFrame or Series; however, ' - 'got [%s]' % (self,)) + raise TypeError("bool() expects DataFrame or Series; however, " "got [%s]" % (self,)) return df.head(2)._to_internal_pandas().bool() def first_valid_index(self): @@ -1422,12 +1479,12 @@ def first_valid_index(self): """ sdf = self._internal.sdf column_scols = self._internal.column_scols - cond = reduce(lambda x, y: x & y, - map(lambda x: x.isNotNull(), column_scols)) + cond = reduce(lambda x, y: x & y, map(lambda x: x.isNotNull(), column_scols)) first_valid_row = sdf.drop(NATURAL_ORDER_COLUMN_NAME).filter(cond).first() - first_valid_idx = tuple(first_valid_row[idx_col] - for idx_col in self._internal.index_columns) + first_valid_idx = tuple( + first_valid_row[idx_col] for idx_col in self._internal.index_columns + ) if len(first_valid_idx) == 1: first_valid_idx = first_valid_idx[0] @@ -1514,9 +1571,11 @@ def median(self, accuracy=10000): if isinstance(kdf_or_kser, Series): kser = _col(kdf_or_kser.to_frame()) return kser._reduce_for_stat_function( - lambda _: F.expr("approx_percentile(`%s`, 0.5, %s)" - % (kser._internal.data_columns[0], accuracy)), - name="median") + lambda _: F.expr( + "approx_percentile(`%s`, 0.5, %s)" % (kser._internal.data_columns[0], accuracy) + ), + name="median", + ) assert isinstance(kdf_or_kser, DataFrame) # This code path cannot reuse `_reduce_for_stat_function` since there looks no proper way @@ -1527,14 +1586,21 @@ def median(self, accuracy=10000): sdf = sdf.select([median(col).alias(col) for col in kdf._internal.data_columns]) # Attach a dummy column for index to avoid default index. - sdf = sdf.withColumn('__DUMMY__', F.monotonically_increasing_id()) + sdf = sdf.withColumn("__DUMMY__", F.monotonically_increasing_id()) # This is expected to be small so it's fine to transpose. - return DataFrame(kdf._internal.copy( - sdf=sdf, - index_map=[('__DUMMY__', None)], - column_scols=[scol_for(sdf, col) for col in kdf._internal.data_columns])) \ - ._to_internal_pandas().transpose().iloc[:, 0] + return ( + DataFrame( + kdf._internal.copy( + sdf=sdf, + index_map=[("__DUMMY__", None)], + column_scols=[scol_for(sdf, col) for col in kdf._internal.data_columns], + ) + ) + ._to_internal_pandas() + .transpose() + .iloc[:, 0] + ) # TODO: 'center', 'win_type', 'on', 'axis' parameter should be implemented. def rolling(self, window, min_periods=None): @@ -1680,7 +1746,8 @@ def _resolve_col(kdf, col_like): if kdf is not col_like._kdf: raise ValueError( "Cannot combine the series because it comes from a different dataframe. 
" - "In order to allow this operation, enable 'compute.ops_on_diff_frames' option.") + "In order to allow this operation, enable 'compute.ops_on_diff_frames' option." + ) return col_like elif isinstance(col_like, tuple): return kdf[col_like] diff --git a/databricks/koalas/groupby.py b/databricks/koalas/groupby.py index da311b7..e9592a0 100644 --- a/databricks/koalas/groupby.py +++ b/databricks/koalas/groupby.py @@ -30,17 +30,30 @@ from pandas.core.dtypes.common import is_datetime64tz_dtype from pyspark.sql import Window, functions as F -from pyspark.sql.types import (FloatType, DoubleType, NumericType, StructField, StructType, - StringType) +from pyspark.sql.types import ( + FloatType, + DoubleType, + NumericType, + StructField, + StructType, + StringType, +) from pyspark.sql.functions import PandasUDFType, pandas_udf, Column from databricks import koalas as ks # For running doctests and reference resolution in PyCharm. from databricks.koalas.typedef import _infer_return_type from databricks.koalas.frame import DataFrame -from databricks.koalas.internal import (_InternalFrame, HIDDEN_COLUMNS, NATURAL_ORDER_COLUMN_NAME, - SPARK_INDEX_NAME_FORMAT, SPARK_DEFAULT_INDEX_NAME) -from databricks.koalas.missing.groupby import _MissingPandasLikeDataFrameGroupBy, \ - _MissingPandasLikeSeriesGroupBy +from databricks.koalas.internal import ( + _InternalFrame, + HIDDEN_COLUMNS, + NATURAL_ORDER_COLUMN_NAME, + SPARK_INDEX_NAME_FORMAT, + SPARK_DEFAULT_INDEX_NAME, +) +from databricks.koalas.missing.groupby import ( + _MissingPandasLikeDataFrameGroupBy, + _MissingPandasLikeSeriesGroupBy, +) from databricks.koalas.series import Series, _col from databricks.koalas.config import get_option from databricks.koalas.utils import column_labels_level, scol_for, name_like_string @@ -170,21 +183,30 @@ def aggregate(self, func_or_funcs=None, *args, **kwargs): func_or_funcs, columns, order = _normalize_keyword_aggregation(kwargs) if not isinstance(func_or_funcs, (str, list)): - if not isinstance(func_or_funcs, dict) or \ - not all(isinstance(key, (str, tuple)) and - (isinstance(value, str) or isinstance(value, list) and - all(isinstance(v, str) for v in value)) - for key, value in func_or_funcs.items()): - raise ValueError("aggs must be a dict mapping from column name (string or tuple) " - "to aggregate functions (string or list of strings).") + if not isinstance(func_or_funcs, dict) or not all( + isinstance(key, (str, tuple)) + and ( + isinstance(value, str) + or isinstance(value, list) + and all(isinstance(v, str) for v in value) + ) + for key, value in func_or_funcs.items() + ): + raise ValueError( + "aggs must be a dict mapping from column name (string or tuple) " + "to aggregate functions (string or list of strings)." 
+ ) else: agg_cols = [col.name for col in self._agg_columns] func_or_funcs = OrderedDict([(col, func_or_funcs) for col in agg_cols]) - index_map = [(SPARK_INDEX_NAME_FORMAT(i), s._internal.column_labels[0]) - for i, s in enumerate(self._groupkeys)] - kdf = DataFrame(GroupBy._spark_groupby( - self._kdf, func_or_funcs, self._groupkeys_scols, index_map)) + index_map = [ + (SPARK_INDEX_NAME_FORMAT(i), s._internal.column_labels[0]) + for i, s in enumerate(self._groupkeys) + ] + kdf = DataFrame( + GroupBy._spark_groupby(self._kdf, func_or_funcs, self._groupkeys_scols, index_map) + ) if not self._as_index: kdf = kdf.reset_index() @@ -197,12 +219,12 @@ def aggregate(self, func_or_funcs=None, *args, **kwargs): @staticmethod def _spark_groupby(kdf, func, groupkeys_scols=(), index_map=None): - assert (len(groupkeys_scols) > 0 and index_map is not None) or \ - (len(groupkeys_scols) == 0 and index_map is None) + assert (len(groupkeys_scols) > 0 and index_map is not None) or ( + len(groupkeys_scols) == 0 and index_map is None + ) sdf = kdf._sdf - groupkey_cols = [s.alias(SPARK_INDEX_NAME_FORMAT(i)) - for i, s in enumerate(groupkeys_scols)] + groupkey_cols = [s.alias(SPARK_INDEX_NAME_FORMAT(i)) for i, s in enumerate(groupkeys_scols)] multi_aggs = any(isinstance(v, list) for v in func.values()) reordered = [] data_columns = [] @@ -216,21 +238,28 @@ def _spark_groupby(kdf, func, groupkeys_scols=(), index_map=None): column_labels.append(tuple(list(label) + [aggfunc]) if multi_aggs else label) if aggfunc == "nunique": reordered.append( - F.expr('count(DISTINCT `{0}`) as `{1}`'.format(name, data_col))) + F.expr("count(DISTINCT `{0}`) as `{1}`".format(name, data_col)) + ) # Implement "quartiles" aggregate function for ``describe``. elif aggfunc == "quartiles": reordered.append( - F.expr('percentile_approx(`{0}`, array(0.25, 0.5, 0.75)) as `{1}`'.format( - name, data_col))) + F.expr( + "percentile_approx(`{0}`, array(0.25, 0.5, 0.75)) as `{1}`".format( + name, data_col + ) + ) + ) else: - reordered.append(F.expr('{1}(`{0}`) as `{2}`'.format(name, aggfunc, data_col))) + reordered.append(F.expr("{1}(`{0}`) as `{2}`".format(name, aggfunc, data_col))) sdf = sdf.groupby(*groupkey_cols).agg(*reordered) - return _InternalFrame(sdf=sdf, - column_labels=column_labels, - column_scols=[scol_for(sdf, col) for col in data_columns], - index_map=index_map) + return _InternalFrame( + sdf=sdf, + column_labels=column_labels, + column_scols=[scol_for(sdf, col) for col in data_columns], + index_map=index_map, + ) def count(self): """ @@ -275,8 +304,9 @@ def last(self): databricks.koalas.Series.groupby databricks.koalas.DataFrame.groupby """ - return self._reduce_for_stat_function(lambda col: F.last(col, ignorenulls=True), - only_numeric=False) + return self._reduce_for_stat_function( + lambda col: F.last(col, ignorenulls=True), only_numeric=False + ) def max(self): """ @@ -407,8 +437,8 @@ def all(self): 5 False """ return self._reduce_for_stat_function( - lambda col: F.min(F.coalesce(col.cast('boolean'), F.lit(True))), - only_numeric=False) + lambda col: F.min(F.coalesce(col.cast("boolean"), F.lit(True))), only_numeric=False + ) # TODO: skipna should be implemented. def any(self): @@ -449,8 +479,8 @@ def any(self): 5 False """ return self._reduce_for_stat_function( - lambda col: F.max(F.coalesce(col.cast('boolean'), F.lit(False))), - only_numeric=False) + lambda col: F.max(F.coalesce(col.cast("boolean"), F.lit(False))), only_numeric=False + ) # TODO: groupby multiply columns should be implemented. 
def size(self): @@ -492,20 +522,24 @@ def size(self): Name: count, dtype: int64 """ groupkeys = self._groupkeys - groupkey_cols = [s.alias(SPARK_INDEX_NAME_FORMAT(i)) - for i, s in enumerate(self._groupkeys_scols)] + groupkey_cols = [ + s.alias(SPARK_INDEX_NAME_FORMAT(i)) for i, s in enumerate(self._groupkeys_scols) + ] sdf = self._kdf._sdf sdf = sdf.groupby(*groupkey_cols).count() if (len(self._agg_columns) > 0) and (self._have_agg_columns): name = self._agg_columns[0]._internal.data_columns[0] - sdf = sdf.withColumnRenamed('count', name) + sdf = sdf.withColumnRenamed("count", name) else: - name = 'count' - internal = _InternalFrame(sdf=sdf, - index_map=[(SPARK_INDEX_NAME_FORMAT(i), - s._internal.column_labels[0]) - for i, s in enumerate(groupkeys)], - column_scols=[scol_for(sdf, name)]) + name = "count" + internal = _InternalFrame( + sdf=sdf, + index_map=[ + (SPARK_INDEX_NAME_FORMAT(i), s._internal.column_labels[0]) + for i, s in enumerate(groupkeys) + ], + column_scols=[scol_for(sdf, name)], + ) return _col(DataFrame(internal)) def diff(self, periods=1): @@ -564,7 +598,8 @@ def diff(self, periods=1): Name: a, dtype: float64 """ return self._apply_series_op( - lambda sg: sg._kser._diff(periods, part_cols=sg._groupkeys_scols)) + lambda sg: sg._kser._diff(periods, part_cols=sg._groupkeys_scols) + ) def cummax(self): """ @@ -611,7 +646,8 @@ def cummax(self): """ return self._apply_series_op( - lambda sg: sg._kser._cum(F.max, True, part_cols=sg._groupkeys_scols)) + lambda sg: sg._kser._cum(F.max, True, part_cols=sg._groupkeys_scols) + ) def cummin(self): """ @@ -657,7 +693,8 @@ def cummin(self): Name: B, dtype: float64 """ return self._apply_series_op( - lambda sg: sg._kser._cum(F.min, True, part_cols=sg._groupkeys_scols)) + lambda sg: sg._kser._cum(F.min, True, part_cols=sg._groupkeys_scols) + ) def cumprod(self): """ @@ -704,7 +741,8 @@ def cumprod(self): """ return self._apply_series_op( - lambda sg: sg._kser._cumprod(True, part_cols=sg._groupkeys_scols)) + lambda sg: sg._kser._cumprod(True, part_cols=sg._groupkeys_scols) + ) def cumsum(self): """ @@ -751,7 +789,8 @@ def cumsum(self): """ return self._apply_series_op( - lambda sg: sg._kser._cum(F.sum, True, part_cols=sg._groupkeys_scols)) + lambda sg: sg._kser._cum(F.sum, True, part_cols=sg._groupkeys_scols) + ) def apply(self, func): """ @@ -884,7 +923,8 @@ def apply(self, func): lambda pdf: pdf.groupby(input_groupnames).apply(func), self._groupkeys_scols, return_schema, - retain_index=should_infer_schema) + retain_index=should_infer_schema, + ) if should_infer_schema: # If schema is inferred, we can restore indexes too. 
@@ -939,7 +979,8 @@ def pandas_filter(pdf): return pdf.groupby(groupby_names).filter(func) sdf = GroupBy._spark_group_map_apply( - self._kdf, pandas_filter, self._groupkeys_scols, data_schema, retain_index=True) + self._kdf, pandas_filter, self._groupkeys_scols, data_schema, retain_index=True + ) return DataFrame(self._kdf._internal.with_new_sdf(sdf)) @staticmethod @@ -968,8 +1009,9 @@ def rename_output(pdf): pdf.columns = [None if label is None else label[0] for label in column_labels] if len(index_names) > 0: - pdf.index.names = [name if name is None or len(name) > 1 else name[0] - for name in index_names] + pdf.index.names = [ + name if name is None or len(name) > 1 else name[0] for name in index_names + ] pdf = func(pdf) @@ -988,16 +1030,21 @@ def rename_output(pdf): index_map = [] if isinstance(index, pd.MultiIndex): if index.names is None: - index_map = [(SPARK_INDEX_NAME_FORMAT(i), None) - for i in range(len(index.levels))] + index_map = [ + (SPARK_INDEX_NAME_FORMAT(i), None) for i in range(len(index.levels)) + ] else: index_map = [ (SPARK_INDEX_NAME_FORMAT(i) if name is None else name, name) - for i, name in enumerate(index.names)] + for i, name in enumerate(index.names) + ] else: - index_map = [( - index.name - if index.name is not None else SPARK_DEFAULT_INDEX_NAME, index.name)] + index_map = [ + ( + index.name if index.name is not None else SPARK_DEFAULT_INDEX_NAME, + index.name, + ) + ] new_index_columns = [index_column for index_column, _ in index_map] new_data_columns = [str(col) for col in columns] @@ -1024,7 +1071,7 @@ def rename_output(pdf): return sdf - def rank(self, method='average', ascending=True): + def rank(self, method="average", ascending=True): """ Provide the rank of values within each group. @@ -1087,7 +1134,8 @@ def rank(self, method='average', ascending=True): """ return self._apply_series_op( - lambda sg: sg._kser._rank(method, ascending, part_cols=sg._groupkeys_scols)) + lambda sg: sg._kser._rank(method, ascending, part_cols=sg._groupkeys_scols) + ) # TODO: add axis parameter def idxmax(self, skipna=True): @@ -1128,10 +1176,11 @@ def idxmax(self, skipna=True): 3 4 4 """ if len(self._kdf._internal.index_names) != 1: - raise ValueError('idxmax only support one-level index now') + raise ValueError("idxmax only support one-level index now") groupkeys = self._groupkeys - groupkey_cols = [s.alias(SPARK_INDEX_NAME_FORMAT(i)) - for i, s in enumerate(self._groupkeys_scols)] + groupkey_cols = [ + s.alias(SPARK_INDEX_NAME_FORMAT(i)) for i, s in enumerate(self._groupkeys_scols) + ] sdf = self._kdf._sdf index = self._kdf._internal.index_columns[0] @@ -1143,21 +1192,25 @@ def idxmax(self, skipna=True): order_column = Column(c._jc.desc_nulls_last()) else: order_column = Column(c._jc.desc_nulls_first()) - window = Window.partitionBy(groupkey_cols) \ - .orderBy(order_column, NATURAL_ORDER_COLUMN_NAME) - sdf = sdf.withColumn(name, - F.when(F.row_number().over(window) == 1, scol_for(sdf, index)) - .otherwise(None)) + window = Window.partitionBy(groupkey_cols).orderBy( + order_column, NATURAL_ORDER_COLUMN_NAME + ) + sdf = sdf.withColumn( + name, F.when(F.row_number().over(window) == 1, scol_for(sdf, index)).otherwise(None) + ) stat_exprs.append(F.max(scol_for(sdf, name)).alias(name)) sdf = sdf.groupby(*groupkey_cols).agg(*stat_exprs) - internal = _InternalFrame(sdf=sdf, - index_map=[(SPARK_INDEX_NAME_FORMAT(i), - s._internal.column_labels[0]) - for i, s in enumerate(groupkeys)], - column_labels=[kser._internal.column_labels[0] - for kser in self._agg_columns], - 
column_scols=[scol_for(sdf, kser._internal.data_columns[0]) - for kser in self._agg_columns]) + internal = _InternalFrame( + sdf=sdf, + index_map=[ + (SPARK_INDEX_NAME_FORMAT(i), s._internal.column_labels[0]) + for i, s in enumerate(groupkeys) + ], + column_labels=[kser._internal.column_labels[0] for kser in self._agg_columns], + column_scols=[ + scol_for(sdf, kser._internal.data_columns[0]) for kser in self._agg_columns + ], + ) return DataFrame(internal) # TODO: add axis parameter @@ -1199,10 +1252,11 @@ def idxmin(self, skipna=True): 3 4 4 """ if len(self._kdf._internal.index_names) != 1: - raise ValueError('idxmin only support one-level index now') + raise ValueError("idxmin only support one-level index now") groupkeys = self._groupkeys - groupkey_cols = [s.alias(SPARK_INDEX_NAME_FORMAT(i)) - for i, s in enumerate(self._groupkeys_scols)] + groupkey_cols = [ + s.alias(SPARK_INDEX_NAME_FORMAT(i)) for i, s in enumerate(self._groupkeys_scols) + ] sdf = self._kdf._sdf index = self._kdf._internal.index_columns[0] @@ -1214,21 +1268,25 @@ def idxmin(self, skipna=True): order_column = Column(c._jc.asc_nulls_last()) else: order_column = Column(c._jc.asc_nulls_first()) - window = Window.partitionBy(groupkey_cols) \ - .orderBy(order_column, NATURAL_ORDER_COLUMN_NAME) - sdf = sdf.withColumn(name, - F.when(F.row_number().over(window) == 1, scol_for(sdf, index)) - .otherwise(None)) + window = Window.partitionBy(groupkey_cols).orderBy( + order_column, NATURAL_ORDER_COLUMN_NAME + ) + sdf = sdf.withColumn( + name, F.when(F.row_number().over(window) == 1, scol_for(sdf, index)).otherwise(None) + ) stat_exprs.append(F.max(scol_for(sdf, name)).alias(name)) sdf = sdf.groupby(*groupkey_cols).agg(*stat_exprs) - internal = _InternalFrame(sdf=sdf, - index_map=[(SPARK_INDEX_NAME_FORMAT(i), - s._internal.column_labels[0]) - for i, s in enumerate(groupkeys)], - column_labels=[kser._internal.column_labels[0] - for kser in self._agg_columns], - column_scols=[scol_for(sdf, kser._internal.data_columns[0]) - for kser in self._agg_columns]) + internal = _InternalFrame( + sdf=sdf, + index_map=[ + (SPARK_INDEX_NAME_FORMAT(i), s._internal.column_labels[0]) + for i, s in enumerate(groupkeys) + ], + column_labels=[kser._internal.column_labels[0] for kser in self._agg_columns], + column_scols=[ + scol_for(sdf, kser._internal.data_columns[0]) for kser in self._agg_columns + ], + ) return DataFrame(internal) def fillna(self, value=None, method=None, axis=None, inplace=False, limit=None): @@ -1341,7 +1399,7 @@ def bfill(self, limit=None): 2 3.0 1.0 5 3 3.0 1.0 4 """ - return self._fillna(method='bfill', limit=limit) + return self._fillna(method="bfill", limit=limit) backfill = bfill @@ -1392,7 +1450,7 @@ def ffill(self, limit=None): 2 NaN NaN 5 3 3.0 1.0 4 """ - return self._fillna(method='ffill', limit=limit) + return self._fillna(method="ffill", limit=limit) pad = ffill @@ -1442,11 +1500,14 @@ def head(self, n=5): 10 10 Name: b, dtype: int64 """ - tmp_col = '__row_number__' + tmp_col = "__row_number__" sdf = self._kdf._sdf window = Window.partitionBy(self._groupkeys_scols).orderBy(NATURAL_ORDER_COLUMN_NAME) - sdf = sdf.withColumn( - tmp_col, F.row_number().over(window)).filter(F.col(tmp_col) <= n).drop(tmp_col) + sdf = ( + sdf.withColumn(tmp_col, F.row_number().over(window)) + .filter(F.col(tmp_col) <= n) + .drop(tmp_col) + ) internal = self._kdf._internal.with_new_sdf(sdf) return DataFrame(internal) @@ -1509,7 +1570,8 @@ def shift(self, periods=1, fill_value=None): 8 0 """ return self._apply_series_op( - lambda sg: 
sg._kser._shift(periods, fill_value, part_cols=sg._groupkeys_scols)) + lambda sg: sg._kser._shift(periods, fill_value, part_cols=sg._groupkeys_scols) + ) def transform(self, func): """ @@ -1634,19 +1696,24 @@ def pandas_transform(pdf): return kdf sdf = GroupBy._spark_group_map_apply( - self._kdf, pandas_transform, self._groupkeys_scols, - return_schema, retain_index=True) + self._kdf, pandas_transform, self._groupkeys_scols, return_schema, retain_index=True + ) # If schema is inferred, we can restore indexes too. internal = kdf._internal.with_new_sdf(sdf) else: return_type = _infer_return_type(func).tpe data_columns = self._kdf._internal.data_columns - return_schema = StructType([ - StructField(c, return_type) for c in data_columns if c not in input_groupnames]) + return_schema = StructType( + [StructField(c, return_type) for c in data_columns if c not in input_groupnames] + ) sdf = GroupBy._spark_group_map_apply( - self._kdf, pandas_transform, self._groupkeys_scols, - return_schema, retain_index=False) + self._kdf, + pandas_transform, + self._groupkeys_scols, + return_schema, + retain_index=False, + ) # Otherwise, it loses index. internal = _InternalFrame(sdf=sdf, index_map=None) @@ -1701,9 +1768,10 @@ def nunique(self, dropna=True): if dropna: stat_function = lambda col: F.countDistinct(col) else: - stat_function = lambda col: \ - (F.countDistinct(col) + - F.when(F.count(F.when(col.isNull(), 1).otherwise(None)) >= 1, 1).otherwise(0)) + stat_function = lambda col: ( + F.countDistinct(col) + + F.when(F.count(F.when(col.isNull(), 1).otherwise(None)) >= 1, 1).otherwise(0) + ) return self._reduce_for_stat_function(stat_function, only_numeric=False) def rolling(self, window, min_periods=None): @@ -1756,8 +1824,9 @@ def expanding(self, min_periods=1): return ExpandingGroupby(self, self._groupkeys, min_periods=min_periods) def _reduce_for_stat_function(self, sfun, only_numeric): - groupkey_cols = [s.alias(SPARK_INDEX_NAME_FORMAT(i)) - for i, s in enumerate(self._groupkeys_scols)] + groupkey_cols = [ + s.alias(SPARK_INDEX_NAME_FORMAT(i)) for i, s in enumerate(self._groupkeys_scols) + ] sdf = self._kdf._sdf @@ -1785,13 +1854,16 @@ def _reduce_for_stat_function(self, sfun, only_numeric): else: sdf = sdf.select(*groupkey_cols).distinct() - internal = _InternalFrame(sdf=sdf, - index_map=[(SPARK_INDEX_NAME_FORMAT(i), - s._internal.column_labels[0]) - for i, s in enumerate(self._groupkeys)], - column_labels=column_labels, - column_scols=[scol_for(sdf, col) for col in data_columns], - column_label_names=self._kdf._internal.column_label_names) + internal = _InternalFrame( + sdf=sdf, + index_map=[ + (SPARK_INDEX_NAME_FORMAT(i), s._internal.column_labels[0]) + for i, s in enumerate(self._groupkeys) + ], + column_labels=column_labels, + column_scols=[scol_for(sdf, col) for col in data_columns], + column_label_names=self._kdf._internal.column_label_names, + ) kdf = DataFrame(internal) if not self._as_index: kdf = kdf.reset_index() @@ -1799,9 +1871,13 @@ def _reduce_for_stat_function(self, sfun, only_numeric): class DataFrameGroupBy(GroupBy): - - def __init__(self, kdf: DataFrame, by: List[Series], as_index: bool = True, - agg_columns: List[Union[str, Tuple[str, ...]]] = None): + def __init__( + self, + kdf: DataFrame, + by: List[Series], + as_index: bool = True, + agg_columns: List[Union[str, Tuple[str, ...]]] = None, + ): self._kdf = kdf self._groupkeys = by self._groupkeys_scols = [s._scol for s in self._groupkeys] @@ -1809,8 +1885,11 @@ def __init__(self, kdf: DataFrame, by: List[Series], as_index: bool = 
True, self._have_agg_columns = True if agg_columns is None: - agg_columns = [label for label in self._kdf._internal.column_labels - if all(not self._kdf[label]._equals(key) for key in self._groupkeys)] + agg_columns = [ + label + for label in self._kdf._internal.column_labels + if all(not self._kdf[label]._equals(key) for key in self._groupkeys) + ] self._have_agg_columns = False self._agg_columns = [kdf[label] for label in agg_columns] self._agg_columns_scols = [s._scol for s in self._agg_columns] @@ -1837,8 +1916,9 @@ def __getitem__(self, item): name = str(i) if len(i) > 1 else i[0] if name in groupkey_names: raise ValueError("cannot insert {}, already exists".format(name)) - return DataFrameGroupBy(self._kdf, self._groupkeys, as_index=self._as_index, - agg_columns=item) + return DataFrameGroupBy( + self._kdf, self._groupkeys, as_index=self._as_index, agg_columns=item + ) def _apply_series_op(self, op): applied = [] @@ -1913,7 +1993,8 @@ def describe(self): for col in self._agg_columns: if isinstance(col.spark_type, StringType): raise NotImplementedError( - "DataFrameGroupBy.describe() doesn't support for string type for now") + "DataFrameGroupBy.describe() doesn't support for string type for now" + ) kdf = self.agg(["count", "mean", "std", "min", "quartiles", "max"]).reset_index() sdf = kdf._sdf @@ -1933,19 +2014,23 @@ def describe(self): data_columns = map(str, column_labels) # Reindex the DataFrame to reflect initial grouping and agg columns. - internal = _InternalFrame(sdf=sdf, - index_map=([(s._internal.data_columns[0], - s._internal.column_labels[0]) - for s in self._groupkeys]), - column_labels=column_labels, - column_scols=[scol_for(sdf, col) for col in data_columns]) + internal = _InternalFrame( + sdf=sdf, + index_map=( + [ + (s._internal.data_columns[0], s._internal.column_labels[0]) + for s in self._groupkeys + ] + ), + column_labels=column_labels, + column_scols=[scol_for(sdf, col) for col in data_columns], + ) # Cast columns to ``"float64"`` to match `pandas.DataFrame.groupby`. 
return DataFrame(internal).astype("float64") class SeriesGroupBy(GroupBy): - def __init__(self, kser: Series, by: List[Series], as_index: bool = True): self._kser = kser self._groupkeys = by @@ -1959,7 +2044,7 @@ def __init__(self, kser: Series, by: List[Series], as_index: bool = True): self._agg_columns_scols = [F.col(s._internal.data_columns[0]) for s in self._agg_columns] if not as_index: - raise TypeError('as_index=False only valid with DataFrame') + raise TypeError("as_index=False only valid with DataFrame") self._as_index = True self._have_agg_columns = True @@ -2056,18 +2141,24 @@ def nsmallest(self, n=5): Name: b, dtype: int64 """ if len(self._kdf._internal.index_names) > 1: - raise ValueError('nsmallest do not support multi-index now') + raise ValueError("nsmallest do not support multi-index now") sdf = self._kdf._sdf name = self._agg_columns[0]._internal.data_columns[0] window = Window.partitionBy(self._groupkeys_scols).orderBy( - scol_for(sdf, name), NATURAL_ORDER_COLUMN_NAME) - sdf = sdf.withColumn('rank', F.row_number().over(window)).filter(F.col('rank') <= n) - internal = _InternalFrame(sdf=sdf.drop(NATURAL_ORDER_COLUMN_NAME), - index_map=([(s._internal.data_columns[0], - s._internal.column_labels[0]) - for s in self._groupkeys] - + self._kdf._internal.index_map), - column_scols=[scol_for(sdf, name)]) + scol_for(sdf, name), NATURAL_ORDER_COLUMN_NAME + ) + sdf = sdf.withColumn("rank", F.row_number().over(window)).filter(F.col("rank") <= n) + internal = _InternalFrame( + sdf=sdf.drop(NATURAL_ORDER_COLUMN_NAME), + index_map=( + [ + (s._internal.data_columns[0], s._internal.column_labels[0]) + for s in self._groupkeys + ] + + self._kdf._internal.index_map + ), + column_scols=[scol_for(sdf, name)], + ) return _col(DataFrame(internal)) # TODO: add keep parameter @@ -2102,18 +2193,24 @@ def nlargest(self, n=5): Name: b, dtype: int64 """ if len(self._kdf._internal.index_names) > 1: - raise ValueError('nlargest do not support multi-index now') + raise ValueError("nlargest do not support multi-index now") sdf = self._kdf._sdf name = self._agg_columns[0]._internal.data_columns[0] - window = Window.partitionBy(self._groupkeys_scols) \ - .orderBy(F.col(name).desc(), NATURAL_ORDER_COLUMN_NAME) - sdf = sdf.withColumn('rank', F.row_number().over(window)).filter(F.col('rank') <= n) - internal = _InternalFrame(sdf=sdf.drop(NATURAL_ORDER_COLUMN_NAME), - index_map=([(s._internal.data_columns[0], - s._internal.column_labels[0]) - for s in self._groupkeys] - + self._kdf._internal.index_map), - column_scols=[scol_for(sdf, name)]) + window = Window.partitionBy(self._groupkeys_scols).orderBy( + F.col(name).desc(), NATURAL_ORDER_COLUMN_NAME + ) + sdf = sdf.withColumn("rank", F.row_number().over(window)).filter(F.col("rank") <= n) + internal = _InternalFrame( + sdf=sdf.drop(NATURAL_ORDER_COLUMN_NAME), + index_map=( + [ + (s._internal.data_columns[0], s._internal.column_labels[0]) + for s in self._groupkeys + ] + + self._kdf._internal.index_map + ), + column_scols=[scol_for(sdf, name)], + ) return _col(DataFrame(internal)) # TODO: add bins, normalize parameter @@ -2158,11 +2255,13 @@ def value_counts(self, sort=None, ascending=None, dropna=True): Name: B, dtype: int64 """ groupkeys = self._groupkeys + self._agg_columns - groupkey_cols = [s.alias(SPARK_INDEX_NAME_FORMAT(i)) - for i, s in enumerate(self._groupkeys_scols + self._agg_columns_scols)] + groupkey_cols = [ + s.alias(SPARK_INDEX_NAME_FORMAT(i)) + for i, s in enumerate(self._groupkeys_scols + self._agg_columns_scols) + ] sdf = self._kdf._sdf 
agg_column = self._agg_columns[0]._internal.data_columns[0] - sdf = sdf.groupby(*groupkey_cols).count().withColumnRenamed('count', agg_column) + sdf = sdf.groupby(*groupkey_cols).count().withColumnRenamed("count", agg_column) if sort: if ascending: @@ -2170,11 +2269,14 @@ def value_counts(self, sort=None, ascending=None, dropna=True): else: sdf = sdf.orderBy(F.col(agg_column).desc()) - internal = _InternalFrame(sdf=sdf, - index_map=[(SPARK_INDEX_NAME_FORMAT(i), - s._internal.column_labels[0]) - for i, s in enumerate(groupkeys)], - column_scols=[scol_for(sdf, agg_column)]) + internal = _InternalFrame( + sdf=sdf, + index_map=[ + (SPARK_INDEX_NAME_FORMAT(i), s._internal.column_labels[0]) + for i, s in enumerate(groupkeys) + ], + column_scols=[scol_for(sdf, agg_column)], + ) return _col(DataFrame(internal)) @@ -2202,10 +2304,7 @@ def _is_multi_agg_with_relabel(**kwargs): """ if not kwargs: return False - return all( - isinstance(v, tuple) and len(v) == 2 - for v in kwargs.values() - ) + return all(isinstance(v, tuple) and len(v) == 2 for v in kwargs.values()) def _normalize_keyword_aggregation(kwargs): diff --git a/databricks/koalas/indexes.py b/databricks/koalas/indexes.py index ddfd70c..5508242 100644 --- a/databricks/koalas/indexes.py +++ b/databricks/koalas/indexes.py @@ -23,8 +23,16 @@ import pandas as pd import numpy as np -from pandas.api.types import is_list_like, is_interval_dtype, is_bool_dtype, \ - is_categorical_dtype, is_integer_dtype, is_float_dtype, is_numeric_dtype, is_object_dtype +from pandas.api.types import ( + is_list_like, + is_interval_dtype, + is_bool_dtype, + is_categorical_dtype, + is_integer_dtype, + is_float_dtype, + is_numeric_dtype, + is_object_dtype, +) from pandas.io.formats.printing import pprint_thing import pyspark @@ -39,8 +47,15 @@ from databricks.koalas.frame import DataFrame from databricks.koalas.missing.indexes import _MissingPandasLikeIndex, _MissingPandasLikeMultiIndex from databricks.koalas.series import Series, _col -from databricks.koalas.utils import (compare_allow_null, compare_disallow_null, compare_null_first, - compare_null_last, default_session, name_like_string, scol_for) +from databricks.koalas.utils import ( + compare_allow_null, + compare_disallow_null, + compare_null_first, + compare_null_last, + default_session, + name_like_string, + scol_for, +) from databricks.koalas.internal import _InternalFrame, NATURAL_ORDER_COLUMN_NAME @@ -81,8 +96,13 @@ class Index(IndexOpsMixin): Index(['a', 'b', 'c'], dtype='object') """ - def __init__(self, data: Union[DataFrame, list], dtype=None, name=None, - scol: Optional[spark.Column] = None) -> None: + def __init__( + self, + data: Union[DataFrame, list], + dtype=None, + name=None, + scol: Optional[spark.Column] = None, + ) -> None: if isinstance(data, DataFrame): assert dtype is None assert name is None @@ -92,13 +112,15 @@ def __init__(self, data: Union[DataFrame, list], dtype=None, name=None, kdf = DataFrame(index=pd.Index(data=data, dtype=dtype, name=name)) if scol is None: scol = kdf._internal.index_scols[0] - internal = kdf._internal.copy(scol=scol, - column_labels=kdf._internal.index_names, - column_scols=kdf._internal.index_scols, - column_label_names=None) + internal = kdf._internal.copy( + scol=scol, + column_labels=kdf._internal.index_names, + column_scols=kdf._internal.index_scols, + column_label_names=None, + ) IndexOpsMixin.__init__(self, internal, kdf) - def _with_new_scol(self, scol: spark.Column) -> 'Index': + def _with_new_scol(self, scol: spark.Column) -> "Index": """ Copy Koalas 
Index with the new Spark Column. @@ -122,9 +144,8 @@ def _summary(self, name=None): String with a summarized representation of the index """ head, tail, total_count = self._kdf._sdf.select( - F.first(self._scol), - F.last(self._scol), - F.count(F.expr("*"))).first() + F.first(self._scol), F.last(self._scol), F.count(F.expr("*")) + ).first() if total_count > 0: index_summary = ", %s to %s" % (pprint_thing(head), pprint_thing(tail)) @@ -176,7 +197,7 @@ def shape(self) -> tuple: >>> midx.shape (3,) """ - return len(self._kdf), + return (len(self._kdf),) def identical(self, other): """ @@ -226,9 +247,10 @@ def identical(self, other): other_name = other.names if isinstance(other, MultiIndex) else other.name return (self is other) or ( - type(self) == type(other) and # to support non-index comparison by short-circuiting. - self_name == other_name and - self.equals(other)) + type(self) == type(other) + and self_name == other_name # to support non-index comparison by short-circuiting. + and self.equals(other) + ) def equals(self, other): """ @@ -281,9 +303,12 @@ def equals(self, other): # some exceptions when 'compute.ops_on_diff_frames' is enabled. # Working around for now via using frame. return (self is other) or ( - type(self) == type(other) and - (self.to_series().rename("self").to_frame().reset_index()['self'] == - other.to_series().rename("other").to_frame().reset_index()['other']).all()) + type(self) == type(other) + and ( + self.to_series().rename("self").to_frame().reset_index()["self"] + == other.to_series().rename("other").to_frame().reset_index()["other"] + ).all() + ) def transpose(self): """ @@ -336,7 +361,10 @@ def to_pandas(self) -> pd.Index: internal = self._kdf._internal.copy( sdf=sdf, index_map=[(sdf.schema[0].name, self._kdf._internal.index_names[0])], - column_labels=[], column_scols=[], column_label_names=None) + column_labels=[], + column_scols=[], + column_label_names=None, + ) return DataFrame(internal)._to_internal_pandas().index toPandas = to_pandas @@ -415,13 +443,15 @@ def name(self, name: Union[str, Tuple[str, ...]]) -> None: @property def names(self) -> List[Union[str, Tuple[str, ...]]]: """Return names of the Index.""" - return [name if name is None or len(name) > 1 else name[0] - for name in self._internal.index_names] + return [ + name if name is None or len(name) > 1 else name[0] + for name in self._internal.index_names + ] @names.setter def names(self, names: List[Union[str, Tuple[str, ...]]]) -> None: if not is_list_like(names): - raise ValueError('Names must be a list-like') + raise ValueError("Names must be a list-like") self.rename(names, inplace=True) @property @@ -441,8 +471,11 @@ def nlevels(self) -> int: """ return len(self._kdf._internal.index_columns) - def rename(self, name: Union[str, Tuple[str, ...], List[Union[str, Tuple[str, ...]]]], - inplace: bool = False): + def rename( + self, + name: Union[str, Tuple[str, ...], List[Union[str, Tuple[str, ...]]]], + inplace: bool = False, + ): """ Alter Index or MultiIndex name. Able to set new names without level. Defaults to returning new index. 
@@ -518,11 +551,14 @@ def _verify_for_rename(self, name): return [(name,)] elif is_list_like(name): if len(self._internal.index_map) != len(name): - raise ValueError('Length of new names must be {}, got {}' - .format(len(self._internal.index_map), len(name))) + raise ValueError( + "Length of new names must be {}, got {}".format( + len(self._internal.index_map), len(name) + ) + ) return [n if n is None or isinstance(n, tuple) else (n,) for n in name] else: - raise TypeError('name must be a hashable type') + raise TypeError("name must be a hashable type") # TODO: add downcast parameter for fillna function def fillna(self, value): @@ -614,10 +650,10 @@ def to_series(self, name: Union[str, Tuple[str, ...]] = None) -> Series: if name is not None: scol = scol.alias(name_like_string(name)) column_labels = [None] if len(kdf._internal.index_map) > 1 else kdf._internal.index_names - return Series(kdf._internal.copy(scol=scol, - column_labels=column_labels, - column_label_names=None), - anchor=kdf) + return Series( + kdf._internal.copy(scol=scol, column_labels=column_labels, column_label_names=None), + anchor=kdf, + ) def to_frame(self, index=True, name=None) -> DataFrame: """ @@ -670,11 +706,11 @@ def to_frame(self, index=True, name=None) -> DataFrame: """ if name is None: if self._internal.index_names[0] is None: - name = ('0',) + name = ("0",) else: name = self._internal.index_names[0] elif isinstance(name, str): - name = (name,) + name = (name,) scol = self._scol.alias(name_like_string(name)) sdf = self._internal.sdf.select(scol, NATURAL_ORDER_COLUMN_NAME) @@ -684,10 +720,12 @@ def to_frame(self, index=True, name=None) -> DataFrame: else: index_map = None # type: ignore - internal = _InternalFrame(sdf=sdf, - index_map=index_map, - column_labels=[name], - column_scols=[scol_for(sdf, name_like_string(name))]) + internal = _InternalFrame( + sdf=sdf, + index_map=index_map, + column_labels=[name], + column_scols=[scol_for(sdf, name_like_string(name))], + ) return DataFrame(internal) def is_boolean(self): @@ -883,14 +921,10 @@ def _validate_index_level(self, level): " %d is not a valid level number" % (level,) ) elif level > 0: - raise IndexError( - "Too many levels:" " Index has only 1 level, not %d" % (level + 1) - ) + raise IndexError("Too many levels:" " Index has only 1 level, not %d" % (level + 1)) elif level != self.name: raise KeyError( - "Requested level ({}) does not match index name ({})".format( - level, self.name - ) + "Requested level ({}) does not match index name ({})".format(level, self.name) ) def copy(self, name=None): @@ -980,20 +1014,18 @@ def symmetric_difference(self, other, result_name=None, sort=None): """ if type(self) != type(other): raise NotImplementedError( - "Doesn't support symmetric_difference between Index & MultiIndex for now") + "Doesn't support symmetric_difference between Index & MultiIndex for now" + ) sdf_self = self._kdf._sdf.select(self._internal.index_scols) sdf_other = other._kdf._sdf.select(other._internal.index_scols) - sdf_symdiff = sdf_self.union(sdf_other) \ - .subtract(sdf_self.intersect(sdf_other)) + sdf_symdiff = sdf_self.union(sdf_other).subtract(sdf_self.intersect(sdf_other)) if sort: sdf_symdiff = sdf_symdiff.sort(self._internal.index_scols) - internal = _InternalFrame( - sdf=sdf_symdiff, - index_map=self._internal.index_map) + internal = _InternalFrame(sdf=sdf_symdiff, index_map=self._internal.index_map) result = Index(DataFrame(internal)) if result_name: @@ -1066,17 +1098,15 @@ def sort_values(self, ascending=True): sdf = 
sdf.orderBy(self._internal.index_scols, ascending=ascending) internal = _InternalFrame( - sdf=sdf.select(self._internal.index_scols), - index_map=self._internal.index_map) + sdf=sdf.select(self._internal.index_scols), index_map=self._internal.index_map + ) return DataFrame(internal).index def sort(self, *args, **kwargs): """ Use sort_values instead. """ - raise TypeError( - "cannot sort an Index object in-place, use sort_values instead" - ) + raise TypeError("cannot sort an Index object in-place, use sort_values instead") def min(self): """ @@ -1190,7 +1220,8 @@ def append(self, other): """ if type(self) is not type(other): raise NotImplementedError( - "append() between Index & MultiIndex currently is not supported") + "append() between Index & MultiIndex currently is not supported" + ) sdf_self = self._internal.sdf.select(self._internal.index_scols) sdf_other = other._internal.sdf.select(other._internal.index_scols) @@ -1202,9 +1233,7 @@ def append(self, other): else: index_map = [(idx_col, None) for idx_col in self._internal.index_columns] - internal = _InternalFrame( - sdf=sdf_appended, - index_map=index_map) + internal = _InternalFrame(sdf=sdf_appended, index_map=index_map) return DataFrame(internal).index @@ -1390,7 +1419,7 @@ def __repr__(self): repr_string = repr(pindex[:max_display_count]) if pindex_length > max_display_count: - footer = '\nShowing only the first {}'.format(max_display_count) + footer = "\nShowing only the first {}".format(max_display_count) return repr_string + footer return repr_string @@ -1437,13 +1466,13 @@ def __init__(self, kdf: DataFrame): assert len(kdf._internal._index_map) > 1 scol = F.struct(kdf._internal.index_scols) data_columns = kdf._sdf.select(scol).columns - internal = kdf._internal.copy(scol=scol, - column_labels=[(col, None) for col in data_columns], - column_label_names=None) + internal = kdf._internal.copy( + scol=scol, column_labels=[(col, None) for col in data_columns], column_label_names=None + ) IndexOpsMixin.__init__(self, internal, kdf) def _with_new_scol(self, scol: spark.Column): - raise NotImplementedError('Not supported for type MultiIndex') + raise NotImplementedError("Not supported for type MultiIndex") def any(self, *args, **kwargs): raise TypeError("cannot perform any with this index type: MultiIndex") @@ -1481,8 +1510,9 @@ def from_tuples(tuples, sortorder=None, names=None): (2, 'blue')], names=['number', 'color']) """ - return DataFrame(index=pd.MultiIndex.from_tuples( - tuples=tuples, sortorder=sortorder, names=names)).index + return DataFrame( + index=pd.MultiIndex.from_tuples(tuples=tuples, sortorder=sortorder, names=names) + ).index @staticmethod def from_arrays(arrays, sortorder=None, names=None): @@ -1514,9 +1544,9 @@ def from_arrays(arrays, sortorder=None, names=None): (2, 'blue')], names=['number', 'color']) """ - return DataFrame(index=pd.MultiIndex.from_arrays( - arrays=arrays, sortorder=sortorder, names=names - )).index + return DataFrame( + index=pd.MultiIndex.from_arrays(arrays=arrays, sortorder=sortorder, names=names) + ).index @staticmethod def from_product(iterables, sortorder=None, names=None): @@ -1556,26 +1586,29 @@ def from_product(iterables, sortorder=None, names=None): (2, 'purple')], names=['number', 'color']) """ - return DataFrame(index=pd.MultiIndex.from_product( - iterables=iterables, sortorder=sortorder, names=names - )).index + return DataFrame( + index=pd.MultiIndex.from_product(iterables=iterables, sortorder=sortorder, names=names) + ).index @property def name(self) -> str: - raise 
PandasNotImplementedError(class_name='pd.MultiIndex', property_name='name') + raise PandasNotImplementedError(class_name="pd.MultiIndex", property_name="name") @name.setter def name(self, name: str) -> None: - raise PandasNotImplementedError(class_name='pd.MultiIndex', property_name='name') + raise PandasNotImplementedError(class_name="pd.MultiIndex", property_name="name") def _verify_for_rename(self, name): if is_list_like(name): if len(self._internal.index_map) != len(name): - raise ValueError('Length of new names must be {}, got {}' - .format(len(self._internal.index_map), len(name))) + raise ValueError( + "Length of new names must be {}, got {}".format( + len(self._internal.index_map), len(name) + ) + ) return [n if n is None or isinstance(n, tuple) else (n,) for n in name] else: - raise TypeError('Must pass list-like as `names`.') + raise TypeError("Must pass list-like as `names`.") def swaplevel(self, i=-2, j=-1): """ @@ -1616,15 +1649,17 @@ def swaplevel(self, i=-2, j=-1): """ for index in (i, j): if not isinstance(index, int) and index not in self.names: - raise KeyError('Level %s not found' % index) + raise KeyError("Level %s not found" % index) i = i if isinstance(i, int) else self.names.index(i) j = j if isinstance(j, int) else self.names.index(j) for index in (i, j): if index >= len(self.names) or index < -len(self.names): - raise IndexError("Too many levels: Index has only %s levels, " - "%s is not a valid level number" % (len(self.names), index)) + raise IndexError( + "Too many levels: Index has only %s levels, " + "%s is not a valid level number" % (len(self.names), index) + ) index_map = self._internal.index_map.copy() index_map[i], index_map[j], = index_map[j], index_map[i] @@ -1669,14 +1704,16 @@ def _is_monotonic(self): compare = compare_null_first else: compare = compare_null_last - cond = F.when(left.eqNullSafe(right), cond) \ - .otherwise(compare(left, right, spark.Column.__gt__)) + cond = F.when(left.eqNullSafe(right), cond).otherwise( + compare(left, right, spark.Column.__gt__) + ) cond = prev.isNull() | cond internal = _InternalFrame( sdf=self._internal.sdf.select(self._internal.index_scols + [cond]), - index_map=self._internal.index_map) + index_map=self._internal.index_map, + ) return _col(DataFrame(internal)) @@ -1697,14 +1734,16 @@ def _is_monotonic_decreasing(self): compare = compare_null_last else: compare = compare_null_first - cond = F.when(left.eqNullSafe(right), cond) \ - .otherwise(compare(left, right, spark.Column.__lt__)) + cond = F.when(left.eqNullSafe(right), cond).otherwise( + compare(left, right, spark.Column.__lt__) + ) cond = prev.isNull() | cond internal = _InternalFrame( sdf=self._internal.sdf.select(self._internal.index_scols + [cond]), - index_map=self._internal.index_map) + index_map=self._internal.index_map, + ) return _col(DataFrame(internal)) @@ -1768,8 +1807,10 @@ def to_frame(self, index=True, name=None) -> DataFrame: blue 2 blue """ if name is None: - name = [name if name is not None else (str(i),) - for i, name in enumerate(self._internal.index_names)] + name = [ + name if name is not None else (str(i),) + for i, name in enumerate(self._internal.index_names) + ] elif is_list_like(name): if len(name) != len(self._internal.index_map): raise ValueError("'name' should have same length as number of levels on index.") @@ -1777,21 +1818,27 @@ def to_frame(self, index=True, name=None) -> DataFrame: else: raise TypeError("'name' must be a list / sequence of column names.") - sdf = self._internal.sdf.select([scol.alias(name_like_string(label)) 
- for scol, label in zip(self._internal.index_scols, name)] - + [NATURAL_ORDER_COLUMN_NAME]) + sdf = self._internal.sdf.select( + [ + scol.alias(name_like_string(label)) + for scol, label in zip(self._internal.index_scols, name) + ] + + [NATURAL_ORDER_COLUMN_NAME] + ) if index: - index_map = [(name_like_string(label), n) - for label, n in zip(name, self._internal.index_names)] + index_map = [ + (name_like_string(label), n) for label, n in zip(name, self._internal.index_names) + ] else: index_map = None # type: ignore - internal = _InternalFrame(sdf=sdf, - index_map=index_map, - column_labels=name, - column_scols=[scol_for(sdf, name_like_string(label)) - for label in name]) + internal = _InternalFrame( + sdf=sdf, + index_map=index_map, + column_labels=name, + column_scols=[scol_for(sdf, name_like_string(label)) for label in name], + ) return DataFrame(internal) def to_pandas(self) -> pd.MultiIndex: @@ -1821,7 +1868,7 @@ def to_pandas(self) -> pd.MultiIndex: toPandas = to_pandas def unique(self, level=None): - raise PandasNotImplementedError(class_name='MultiIndex', method_name='unique') + raise PandasNotImplementedError(class_name="MultiIndex", method_name="unique") def nunique(self, dropna=True): raise NotImplementedError("isna is not defined for MultiIndex") @@ -1902,20 +1949,18 @@ def symmetric_difference(self, other, result_name=None, sort=None): """ if type(self) != type(other): raise NotImplementedError( - "Doesn't support symmetric_difference between Index & MultiIndex for now") + "Doesn't support symmetric_difference between Index & MultiIndex for now" + ) sdf_self = self._kdf._sdf.select(self._internal.index_scols) sdf_other = other._kdf._sdf.select(other._internal.index_scols) - sdf_symdiff = sdf_self.union(sdf_other) \ - .subtract(sdf_self.intersect(sdf_other)) + sdf_symdiff = sdf_self.union(sdf_other).subtract(sdf_self.intersect(sdf_other)) if sort: sdf_symdiff = sdf_symdiff.sort(self._internal.index_scols) - internal = _InternalFrame( - sdf=sdf_symdiff, - index_map=self._internal.index_map) + internal = _InternalFrame(sdf=sdf_symdiff, index_map=self._internal.index_map) result = MultiIndex(DataFrame(internal)) if result_name: @@ -1963,18 +2008,24 @@ def drop(self, codes, level=None): else: scol = index_scols[level] if isinstance(level, int) else sdf[level] sdf = sdf[~scol.isin(codes)] - return MultiIndex(DataFrame(_InternalFrame(sdf=sdf, - index_map=self._kdf._internal.index_map))) + return MultiIndex( + DataFrame(_InternalFrame(sdf=sdf, index_map=self._kdf._internal.index_map)) + ) def value_counts(self, normalize=False, sort=True, ascending=False, bins=None, dropna=True): - if LooseVersion(pyspark.__version__) < LooseVersion("2.4") and \ - default_session().conf.get("spark.sql.execution.arrow.enabled") == "true" and \ - isinstance(self, MultiIndex): - raise RuntimeError("if you're using pyspark < 2.4, set conf " - "'spark.sql.execution.arrow.enabled' to 'false' " - "for using this function with MultiIndex") + if ( + LooseVersion(pyspark.__version__) < LooseVersion("2.4") + and default_session().conf.get("spark.sql.execution.arrow.enabled") == "true" + and isinstance(self, MultiIndex) + ): + raise RuntimeError( + "if you're using pyspark < 2.4, set conf " + "'spark.sql.execution.arrow.enabled' to 'false' " + "for using this function with MultiIndex" + ) return super(MultiIndex, self).value_counts( - normalize=normalize, sort=sort, ascending=ascending, bins=bins, dropna=dropna) + normalize=normalize, sort=sort, ascending=ascending, bins=bins, dropna=dropna + ) 
value_counts.__doc__ = IndexOpsMixin.value_counts.__doc__ @@ -2026,7 +2077,7 @@ def __repr__(self): repr_string = repr(pindex[:max_display_count]) if pindex_length > max_display_count: - footer = '\nShowing only the first {}'.format(max_display_count) + footer = "\nShowing only the first {}".format(max_display_count) return repr_string + footer return repr_string diff --git a/databricks/koalas/indexing.py b/databricks/koalas/indexing.py index 89e1c15..0c6e130 100644 --- a/databricks/koalas/indexing.py +++ b/databricks/koalas/indexing.py @@ -31,22 +31,25 @@ class _IndexerLike(object): - def __init__(self, kdf_or_kser): from databricks.koalas.frame import DataFrame from databricks.koalas.series import Series - assert isinstance(kdf_or_kser, (DataFrame, Series)), \ - 'unexpected argument type: {}'.format(type(kdf_or_kser)) + + assert isinstance(kdf_or_kser, (DataFrame, Series)), "unexpected argument type: {}".format( + type(kdf_or_kser) + ) self._kdf_or_kser = kdf_or_kser @property def _is_df(self): from databricks.koalas.frame import DataFrame + return isinstance(self._kdf_or_kser, DataFrame) @property def _is_series(self): from databricks.koalas.series import Series + return isinstance(self._kdf_or_kser, Series) @property @@ -106,29 +109,38 @@ def __getitem__(self, key): if len(self._internal.index_map) == 1: if is_list_like(row_sel): - raise ValueError( - 'At based indexing on a single index can only have a single value') + raise ValueError("At based indexing on a single index can only have a single value") row_sel = (row_sel,) elif not isinstance(row_sel, tuple): - raise ValueError( - 'At based indexing on multi-index can only have tuple values') - if not (isinstance(col_sel, str) or - (isinstance(col_sel, tuple) and all(isinstance(col, str) for col in col_sel))): - raise ValueError('At based indexing on multi-index can only have tuple values') + raise ValueError("At based indexing on multi-index can only have tuple values") + if not ( + isinstance(col_sel, str) + or (isinstance(col_sel, tuple) and all(isinstance(col, str) for col in col_sel)) + ): + raise ValueError("At based indexing on multi-index can only have tuple values") if isinstance(col_sel, str): col_sel = (col_sel,) - cond = reduce(lambda x, y: x & y, - [scol == row for scol, row in zip(self._internal.index_scols, row_sel)]) - pdf = self._internal.sdf.drop(NATURAL_ORDER_COLUMN_NAME).filter(cond) \ - .select(self._internal.scol_for(col_sel)).toPandas() + cond = reduce( + lambda x, y: x & y, + [scol == row for scol, row in zip(self._internal.index_scols, row_sel)], + ) + pdf = ( + self._internal.sdf.drop(NATURAL_ORDER_COLUMN_NAME) + .filter(cond) + .select(self._internal.scol_for(col_sel)) + .toPandas() + ) if len(pdf) < 1: raise KeyError(name_like_string(row_sel)) values = pdf.iloc[:, 0].values - return values if (len(row_sel) < len(self._internal.index_map) - or len(values) > 1) else values[0] + return ( + values + if (len(row_sel) < len(self._internal.index_map) or len(values) > 1) + else values[0] + ) class iAtIndexer(_IndexerLike): @@ -171,26 +183,27 @@ class iAtIndexer(_IndexerLike): >>> kser.iat[1] 2 """ + def __getitem__(self, key): if self._is_df: if not isinstance(key, tuple) or len(key) != 2: raise TypeError( - "Use DataFrame.iat like .iat[row_integer_position, column_integer_position]") + "Use DataFrame.iat like .iat[row_integer_position, column_integer_position]" + ) row_sel, col_sel = key if not isinstance(row_sel, int) or not isinstance(col_sel, int): - raise ValueError('iAt based indexing can only have integer 
indexers') + raise ValueError("iAt based indexing can only have integer indexers") return self._kdf_or_kser.iloc[row_sel, col_sel] else: assert self._is_series, type(self._kdf_or_kser) if not isinstance(key, int) and len(key) != 1: raise TypeError("Use Series.iat like .iat[row_integer_position]") if not isinstance(key, int): - raise ValueError('iAt based indexing can only have integer indexers') + raise ValueError("iAt based indexing can only have integer indexers") return self._kdf_or_kser.iloc[key] class _LocIndexerLike(_IndexerLike): - def __getitem__(self, key): from databricks.koalas.frame import DataFrame from databricks.koalas.series import Series @@ -198,8 +211,8 @@ def __getitem__(self, key): if self._is_series: if isinstance(key, Series) and key._kdf is not self._kdf_or_kser._kdf: kdf = self._kdf_or_kser.to_frame() - kdf['__temp_col__'] = key - return type(self)(kdf[self._kdf_or_kser.name])[kdf['__temp_col__']] + kdf["__temp_col__"] = key + return type(self)(kdf[self._kdf_or_kser.name])[kdf["__temp_col__"]] cond, limit, remaining_index = self._select_rows(key) if cond is None and limit is None: @@ -220,17 +233,19 @@ def __getitem__(self, key): if isinstance(rows_sel, Series) and rows_sel._kdf is not self._kdf_or_kser: kdf = self._kdf_or_kser.copy() - kdf['__temp_col__'] = rows_sel - return type(self)(kdf)[kdf['__temp_col__'], - cols_sel][list(self._kdf_or_kser.columns)] + kdf["__temp_col__"] = rows_sel + return type(self)(kdf)[kdf["__temp_col__"], cols_sel][ + list(self._kdf_or_kser.columns) + ] cond, limit, remaining_index = self._select_rows(rows_sel) column_labels, column_scols, returns_series = self._select_cols(cols_sel) if cond is None and limit is None and returns_series: - return Series(self._internal.copy(scol=column_scols[0], - column_labels=[column_labels[0]]), - anchor=self._kdf_or_kser) + return Series( + self._internal.copy(scol=column_scols[0], column_labels=[column_labels[0]]), + anchor=self._kdf_or_kser, + ) if remaining_index is not None: index_scols = self._internal.index_scols[-remaining_index:] @@ -258,18 +273,20 @@ def __getitem__(self, key): sdf = sdf.select(index_scols + column_scols) except AnalysisException: - raise KeyError('[{}] don\'t exist in columns' - .format([col._jc.toString() for col in column_scols])) - - internal = _InternalFrame(sdf=sdf, - index_map=index_map, - column_labels=column_labels, - column_label_names=column_label_names) + raise KeyError( + "[{}] don't exist in columns".format([col._jc.toString() for col in column_scols]) + ) + + internal = _InternalFrame( + sdf=sdf, + index_map=index_map, + column_labels=column_labels, + column_label_names=column_label_names, + ) kdf = DataFrame(internal) if returns_series: - kdf_or_kser = Series(kdf._internal.copy(scol=kdf._internal.column_scols[0]), - anchor=kdf) + kdf_or_kser = Series(kdf._internal.copy(scol=kdf._internal.column_scols[0]), anchor=kdf) else: kdf_or_kser = kdf @@ -473,7 +490,8 @@ def _raiseNotImplemented(description): raise SparkPandasNotImplementedError( description=description, pandas_function=".loc[..., ...]", - spark_target_function="select, where") + spark_target_function="select, where", + ) def _select_rows(self, rows_sel): from databricks.koalas.series import Series @@ -500,9 +518,12 @@ def _select_rows(self, rows_sel): # to keep natural order. 
start_and_stop = ( sdf.select(index_column._scol, NATURAL_ORDER_COLUMN_NAME) - .where((index_column._scol == F.lit(start).cast(index_data_type)) - | (index_column._scol == F.lit(stop).cast(index_data_type))) - .collect()) + .where( + (index_column._scol == F.lit(start).cast(index_data_type)) + | (index_column._scol == F.lit(stop).cast(index_data_type)) + ) + .collect() + ) start = [row[1] for row in start_and_stop if row[0] == start] start = start[0] if len(start) > 0 else None @@ -518,13 +539,20 @@ def _select_rows(self, rows_sel): # if index order is not monotonic increasing or decreasing # and specified values don't exist in index, raise KeyError - if ((start is None and rows_sel.start is not None) - or (stop is None and rows_sel.stop is not None)): - inc, dec = sdf.select( - index_column._is_monotonic()._scol.alias('__increasing__'), - index_column._is_monotonic_decreasing()._scol.alias('__decreasing__')) \ - .select(F.min(F.coalesce('__increasing__', F.lit(True))), - F.min(F.coalesce('__decreasing__', F.lit(True)))).first() + if (start is None and rows_sel.start is not None) or ( + stop is None and rows_sel.stop is not None + ): + inc, dec = ( + sdf.select( + index_column._is_monotonic()._scol.alias("__increasing__"), + index_column._is_monotonic_decreasing()._scol.alias("__decreasing__"), + ) + .select( + F.min(F.coalesce("__increasing__", F.lit(True))), + F.min(F.coalesce("__decreasing__", F.lit(True))), + ) + .first() + ) if start is None and rows_sel.start is not None: start = rows_sel.start if inc is not False: @@ -554,23 +582,31 @@ def _select_rows(self, rows_sel): index_column = self._kdf_or_kser.index.to_series() index_data_type = index_column.spark_type if len(rows_sel) == 1: - return (index_column._scol == F.lit(rows_sel[0]).cast(index_data_type), - None, None) + return ( + index_column._scol == F.lit(rows_sel[0]).cast(index_data_type), + None, + None, + ) else: - return (index_column._scol.isin([F.lit(r).cast(index_data_type) - for r in rows_sel]), - None, None) + return ( + index_column._scol.isin([F.lit(r).cast(index_data_type) for r in rows_sel]), + None, + None, + ) else: LocIndexer._raiseNotImplemented("Cannot select with MultiIndex with Spark.") else: if not isinstance(rows_sel, tuple): rows_sel = (rows_sel,) if len(rows_sel) > len(self._internal.index_map): - raise SparkPandasIndexingError('Too many indexers') + raise SparkPandasIndexingError("Too many indexers") rows = [scol == value for scol, value in zip(self._internal.index_scols, rows_sel)] - return (reduce(lambda x, y: x & y, rows), - None, len(self._internal.index_map) - len(rows_sel)) + return ( + reduce(lambda x, y: x & y, rows), + None, + len(self._internal.index_map) - len(rows_sel), + ) def _get_from_multiindex_column(self, key, labels=None): """ Select columns from multi-index columns. @@ -586,7 +622,7 @@ def _get_from_multiindex_column(self, key, labels=None): if len(labels) == 0: raise KeyError(k) - if all(len(lbl) > 0 and lbl[0] == '' for _, lbl in labels): + if all(len(lbl) > 0 and lbl[0] == "" for _, lbl in labels): # If the head is '', drill down recursively. 
labels = [(label, tuple([str(key), *lbl[1:]])) for i, (label, lbl) in enumerate(labels)] return self._get_from_multiindex_column((str(key),), labels) @@ -614,7 +650,8 @@ def _select_cols(self, cols_sel): cols_sel = None else: raise LocIndexer._raiseNotImplemented( - "Can only select columns either by name or reference or all") + "Can only select columns either by name or reference or all" + ) elif isinstance(cols_sel, (Series, spark.Column)): returns_series = True cols_sel = [cols_sel] @@ -632,14 +669,15 @@ def _select_cols(self, cols_sel): elif all(isinstance(key, spark.Column) for key in cols_sel): column_labels = [(self._internal.sdf.select(col).columns[0],) for col in cols_sel] column_scols = cols_sel - elif (any(isinstance(key, str) for key in cols_sel) - and any(isinstance(key, tuple) for key in cols_sel)): - raise TypeError('Expected tuple, got str') + elif any(isinstance(key, str) for key in cols_sel) and any( + isinstance(key, tuple) for key in cols_sel + ): + raise TypeError("Expected tuple, got str") else: if all(isinstance(key, tuple) for key in cols_sel): level = self._internal.column_labels_level if any(len(key) != level for key in cols_sel): - raise ValueError('All the key level should be the same as column index level.') + raise ValueError("All the key level should be the same as column index level.") column_labels = [] column_scols = [] @@ -663,13 +701,15 @@ def __setitem__(self, key, value): raise SparkPandasNotImplementedError( description="Can only assign value to dataframes", pandas_function=".loc[..., ...] = ...", - spark_target_function="withColumn, select") + spark_target_function="withColumn, select", + ) if (not isinstance(key, tuple)) or (len(key) != 2): raise SparkPandasNotImplementedError( description="Only accepts pairs of candidates", pandas_function=".loc[..., ...] = ...", - spark_target_function="withColumn, select") + spark_target_function="withColumn, select", + ) rows_sel, cols_sel = key @@ -684,11 +724,15 @@ def __setitem__(self, key, value): # reserved for Koalas' internal columns. kdf["__indexing_temp_col__"] = value new_col = kdf["__indexing_temp_col__"]._scol - kdf[col_sel] = Series(kdf[col_sel]._internal.copy( - scol=F.when( - kdf._internal.index_scols[0].isin(rows_sel), new_col - ).otherwise(kdf[col_sel]._scol)), anchor=kdf) - kdf = kdf.drop(labels=['__indexing_temp_col__']) + kdf[col_sel] = Series( + kdf[col_sel]._internal.copy( + scol=F.when( + kdf._internal.index_scols[0].isin(rows_sel), new_col + ).otherwise(kdf[col_sel]._scol) + ), + anchor=kdf, + ) + kdf = kdf.drop(labels=["__indexing_temp_col__"]) self._kdf_or_kser._internal = kdf._internal.copy() else: @@ -696,7 +740,8 @@ def __setitem__(self, key, value): description="""Can only assign value to the whole dataframe, the row index has to be `slice(None)` or `:`""", pandas_function=".loc[..., ...] 
= ...", - spark_target_function="withColumn, select") + spark_target_function="withColumn, select", + ) if not isinstance(cols_sel, (str, list)): raise ValueError("""only column names or list of column names can be assigned""") @@ -835,13 +880,15 @@ def _raiseNotImplemented(description): raise SparkPandasNotImplementedError( description=description, pandas_function=".iloc[..., ...]", - spark_target_function="select, where") + spark_target_function="select, where", + ) @lazy_property def _internal(self): internal = super(iLocIndexer, self)._internal - sdf = _InternalFrame.attach_distributed_sequence_column(internal.sdf, - column_name=self._sequence_col) + sdf = _InternalFrame.attach_distributed_sequence_column( + internal.sdf, column_name=self._sequence_col + ) return internal.with_new_sdf(sdf.orderBy(NATURAL_ORDER_COLUMN_NAME)) @lazy_property @@ -852,7 +899,7 @@ def _select_rows(self, rows_sel): from databricks.koalas.indexes import Index if isinstance(rows_sel, tuple) and len(rows_sel) > 1: - raise SparkPandasIndexingError('Too many indexers') + raise SparkPandasIndexingError("Too many indexers") elif isinstance(rows_sel, Index): assert isinstance(rows_sel.spark_type, BooleanType), rows_sel.spark_type return rows_sel._scol, None, None @@ -863,16 +910,21 @@ def _select_rows(self, rows_sel): elif (rows_sel.start is not None) or (rows_sel.step is not None): iLocIndexer._raiseNotImplemented("Cannot use start or step with Spark.") elif not isinstance(rows_sel.stop, int): - raise TypeError("cannot do slice indexing with these indexers [{}] of {}" - .format(rows_sel.stop, type(rows_sel.stop))) + raise TypeError( + "cannot do slice indexing with these indexers [{}] of {}".format( + rows_sel.stop, type(rows_sel.stop) + ) + ) else: return None, rows_sel.stop, None elif isinstance(rows_sel, int): sdf = self._internal.sdf return (sdf[self._sequence_col] == rows_sel), None, 0 else: - iLocIndexer._raiseNotImplemented(".iloc requires numeric slice or conditional " - "boolean Index, got {}".format(type(rows_sel))) + iLocIndexer._raiseNotImplemented( + ".iloc requires numeric slice or conditional " + "boolean Index, got {}".format(type(rows_sel)) + ) def _select_cols(self, cols_sel): from databricks.koalas.series import Series @@ -892,15 +944,25 @@ def _select_cols(self, cols_sel): column_labels = self._internal.column_labels column_scols = self._internal.column_scols elif isinstance(cols_sel, slice): - if all(s is None or isinstance(s, int) - for s in (cols_sel.start, cols_sel.stop, cols_sel.step)): + if all( + s is None or isinstance(s, int) + for s in (cols_sel.start, cols_sel.stop, cols_sel.step) + ): column_labels = self._internal.column_labels[cols_sel] column_scols = self._internal.column_scols[cols_sel] else: - not_none = cols_sel.start if cols_sel.start is not None \ - else cols_sel.stop if cols_sel.stop is not None else cols_sel.step - raise TypeError('cannot do slice indexing with these indexers {} of {}' - .format(not_none, type(not_none))) + not_none = ( + cols_sel.start + if cols_sel.start is not None + else cols_sel.stop + if cols_sel.stop is not None + else cols_sel.step + ) + raise TypeError( + "cannot do slice indexing with these indexers {} of {}".format( + not_none, type(not_none) + ) + ) elif is_list_like(cols_sel): if all(isinstance(s, bool) for s in cols_sel): cols_sel = [i for i, s in enumerate(cols_sel) if s] @@ -908,9 +970,11 @@ def _select_cols(self, cols_sel): column_labels = [self._internal.column_labels[s] for s in cols_sel] column_scols = [self._internal.column_scols[s] for 
s in cols_sel] else: - raise TypeError('cannot perform reduce with flexible type') + raise TypeError("cannot perform reduce with flexible type") else: - raise ValueError("Location based indexing can only have [integer, integer slice, " - "listlike of integers, boolean array] types, got {}".format(cols_sel)) + raise ValueError( + "Location based indexing can only have [integer, integer slice, " + "listlike of integers, boolean array] types, got {}".format(cols_sel) + ) return column_labels, column_scols, returns_series diff --git a/databricks/koalas/internal.py b/databricks/koalas/internal.py index f69ae31..5843cc6 100644 --- a/databricks/koalas/internal.py +++ b/databricks/koalas/internal.py @@ -29,19 +29,26 @@ from pyspark.sql import functions as F, Window from pyspark.sql.functions import PandasUDFType, pandas_udf from pyspark.sql.types import BooleanType, DataType, StructField, StructType, LongType + try: from pyspark.sql.types import to_arrow_type except ImportError: from pyspark.sql.pandas.types import to_arrow_type from databricks import koalas as ks # For running doctests and reference resolution in PyCharm. + if TYPE_CHECKING: # This is required in old Python 3.5 to prevent circular reference. from databricks.koalas.series import Series from databricks.koalas.config import get_option from databricks.koalas.typedef import infer_pd_series_spark_type, spark_type_to_pandas_dtype -from databricks.koalas.utils import (column_labels_level, default_session, lazy_property, - name_like_string, scol_for) +from databricks.koalas.utils import ( + column_labels_level, + default_session, + lazy_property, + name_like_string, + scol_for, +) # A function to turn given numbers to Spark columns that represent Koalas index. @@ -50,7 +57,7 @@ # A pattern to check if the name of a Spark column is a Koalas index name or not. SPARK_INDEX_NAME_PATTERN = re.compile(r"__index_level_[0-9]+__") -NATURAL_ORDER_COLUMN_NAME = '__natural_order__' +NATURAL_ORDER_COLUMN_NAME = "__natural_order__" HIDDEN_COLUMNS = set([NATURAL_ORDER_COLUMN_NAME]) @@ -385,12 +392,15 @@ class _InternalFrame(object): 4 8 """ - def __init__(self, sdf: spark.DataFrame, - index_map: Optional[List[IndexMap]], - column_labels: Optional[List[Tuple[str, ...]]] = None, - column_scols: Optional[List[spark.Column]] = None, - column_label_names: Optional[List[str]] = None, - scol: Optional[spark.Column] = None) -> None: + def __init__( + self, + sdf: spark.DataFrame, + index_map: Optional[List[IndexMap]], + column_labels: Optional[List[Tuple[str, ...]]] = None, + column_scols: Optional[List[spark.Column]] = None, + column_label_names: Optional[List[str]] = None, + scol: Optional[spark.Column] = None, + ) -> None: """ Create a new internal immutable DataFrame to manage Spark DataFrame, column fields and index fields and names. @@ -456,9 +466,10 @@ def __init__(self, sdf: spark.DataFrame, assert not sdf.isStreaming, "Koalas does not support Structured Streaming." if index_map is None: - assert not any(SPARK_INDEX_NAME_PATTERN.match(name) for name in sdf.columns), \ - "Index columns should not appear in columns of the Spark DataFrame. Avoid " \ + assert not any(SPARK_INDEX_NAME_PATTERN.match(name) for name in sdf.columns), ( + "Index columns should not appear in columns of the Spark DataFrame. Avoid " "index column names [%s]." % SPARK_INDEX_NAME_PATTERN + ) # Create default index. 
sdf = _InternalFrame.attach_default_index(sdf) @@ -467,11 +478,17 @@ def __init__(self, sdf: spark.DataFrame, if NATURAL_ORDER_COLUMN_NAME not in sdf.columns: sdf = sdf.withColumn(NATURAL_ORDER_COLUMN_NAME, F.monotonically_increasing_id()) - assert all(isinstance(index_field, str) - and (index_name is None or (isinstance(index_name, tuple) - and all(isinstance(name, str) - for name in index_name))) - for index_field, index_name in index_map), index_map + assert all( + isinstance(index_field, str) + and ( + index_name is None + or ( + isinstance(index_name, tuple) + and all(isinstance(name, str) for name in index_name) + ) + ) + for index_field, index_name in index_map + ), index_map assert scol is None or isinstance(scol, spark.Column) assert column_scols is None or all(isinstance(scol, spark.Column) for scol in column_scols) @@ -482,27 +499,34 @@ def __init__(self, sdf: spark.DataFrame, self._column_scols = [scol] elif column_scols is None: index_columns = set(index_column for index_column, _ in self._index_map) - self._column_scols = [scol_for(sdf, col) for col in sdf.columns - if col not in index_columns and col not in HIDDEN_COLUMNS] + self._column_scols = [ + scol_for(sdf, col) + for col in sdf.columns + if col not in index_columns and col not in HIDDEN_COLUMNS + ] else: self._column_scols = column_scols if scol is not None: assert column_labels is not None and len(column_labels) == 1, column_labels - assert all(label is None or (isinstance(label, tuple) and len(label) > 0) - for label in column_labels), column_labels + assert all( + label is None or (isinstance(label, tuple) and len(label) > 0) + for label in column_labels + ), column_labels self._column_labels = column_labels elif column_labels is None: self._column_labels = [(sdf.select(scol).columns[0],) for scol in self._column_scols] else: - assert len(column_labels) == len(self._column_scols), \ - (len(column_labels), len(self._column_scols)) + assert len(column_labels) == len(self._column_scols), ( + len(column_labels), + len(self._column_scols), + ) assert all(isinstance(i, tuple) for i in column_labels), column_labels assert len(set(len(i) for i in column_labels)) <= 1, column_labels self._column_labels = column_labels if column_label_names is not None and not is_list_like(column_label_names): - raise ValueError('Column_index_names should be list-like or None for a MultiIndex') + raise ValueError("Column_index_names should be list-like or None for a MultiIndex") if isinstance(column_label_names, list): if all(name is None for name in column_label_names): @@ -538,27 +562,28 @@ def attach_default_index(sdf, default_index_type=None): AssertionError: '__index_level_0__' already exists... 
""" index_column = SPARK_DEFAULT_INDEX_NAME - assert index_column not in sdf.columns, ( - "'%s' already exists in the Spark column names '%s'" % ( - index_column, sdf.columns)) + assert ( + index_column not in sdf.columns + ), "'%s' already exists in the Spark column names '%s'" % (index_column, sdf.columns) if default_index_type is None: default_index_type = get_option("compute.default_index_type") scols = [scol_for(sdf, column) for column in sdf.columns] if default_index_type == "sequence": - sequential_index = F.row_number().over( - Window.orderBy(F.monotonically_increasing_id())) - 1 + sequential_index = ( + F.row_number().over(Window.orderBy(F.monotonically_increasing_id())) - 1 + ) return sdf.select(sequential_index.alias(index_column), *scols) elif default_index_type == "distributed-sequence": - return _InternalFrame.attach_distributed_sequence_column( - sdf, column_name=index_column) + return _InternalFrame.attach_distributed_sequence_column(sdf, column_name=index_column) elif default_index_type == "distributed": - return sdf.select( - F.monotonically_increasing_id().alias(index_column), *scols) + return sdf.select(F.monotonically_increasing_id().alias(index_column), *scols) else: - raise ValueError("'compute.default_index_type' should be one of 'sequence'," - " 'distributed-sequence' and 'distributed'") + raise ValueError( + "'compute.default_index_type' should be one of 'sequence'," + " 'distributed-sequence' and 'distributed'" + ) @staticmethod def attach_distributed_sequence_column(sdf, column_name): @@ -588,8 +613,10 @@ def attach_distributed_sequence_column(sdf, column_name): # ... # } sdf = sdf.withColumn("__spark_partition_id", F.spark_partition_id()) - counts = map(lambda x: (x["key"], x["count"]), - sdf.groupby(sdf['__spark_partition_id'].alias("key")).count().collect()) + counts = map( + lambda x: (x["key"], x["count"]), + sdf.groupby(sdf["__spark_partition_id"].alias("key")).count().collect(), + ) # 2. Calculates cumulative sum in an order of partition id. # Note that it does not matter if partition id guarantees its order or not. @@ -608,16 +635,15 @@ def offset(id): current_partition_offset = sums[id.iloc[0]] return pd.Series(current_partition_offset).repeat(len(id)) - sdf = sdf.withColumn('__offset__', offset('__spark_partition_id')) + sdf = sdf.withColumn("__offset__", offset("__spark_partition_id")) # 4. Calculate row_number in each partition. - w = Window.partitionBy('__spark_partition_id').orderBy(F.monotonically_increasing_id()) + w = Window.partitionBy("__spark_partition_id").orderBy(F.monotonically_increasing_id()) row_number = F.row_number().over(w) - sdf = sdf.withColumn('__row_number__', row_number) + sdf = sdf.withColumn("__row_number__", row_number) # 5. Calculate the index. - return sdf.select( - F.expr('__offset__ + __row_number__ - 1').alias(column_name), *scols) + return sdf.select(F.expr("__offset__ + __row_number__ - 1").alias(column_name), *scols) @lazy_property def _column_labels_to_name(self) -> Dict[Tuple[str, ...], str]: @@ -645,8 +671,9 @@ def scol_for(self, column_labels_or_index_column: Union[str, Tuple[str, ...]]): raise KeyError(name_like_string(column_labels_or_index_column)) return scol_for(self._sdf, self.column_name_for(column_labels_or_index_column)) - def spark_type_for(self, - column_labels_or_index_column: Union[str, Tuple[str, ...]]) -> DataType: + def spark_type_for( + self, column_labels_or_index_column: Union[str, Tuple[str, ...]] + ) -> DataType: """ Return DataType for the given column name or index. 
""" return self._sdf.select(self.scol_for(column_labels_or_index_column)).schema[0].dataType @@ -679,15 +706,19 @@ def index_scols(self) -> List[spark.Column]: def columns(self) -> List[str]: """ Return all the field names including index field names. """ index_columns = set(self.index_columns) - return self.index_columns + [column for column in self.data_columns - if column not in index_columns] + return self.index_columns + [ + column for column in self.data_columns if column not in index_columns + ] @lazy_property def scols(self) -> List[spark.Column]: """ Return Spark Columns for the managed columns including index columns. """ index_columns = set(self.index_columns) - return self.index_scols + [self.scol_for(label) for label in self.column_labels - if self.column_name_for(label) not in index_columns] + return self.index_scols + [ + self.scol_for(label) + for label in self.column_labels + if self.column_name_for(label) not in index_columns + ] @property def index_map(self) -> List[IndexMap]: @@ -755,8 +786,9 @@ def pandas_df(self): sdf = self.spark_internal_df pdf = sdf.toPandas() if len(pdf) == 0 and len(sdf.schema) > 0: - pdf = pdf.astype({field.name: spark_type_to_pandas_dtype(field.dataType) - for field in sdf.schema}) + pdf = pdf.astype( + {field.name: spark_type_to_pandas_dtype(field.dataType) for field in sdf.schema} + ) index_columns = self.index_columns if len(index_columns) > 0: @@ -765,10 +797,16 @@ def pandas_df(self): drop = index_field not in self.data_columns pdf = pdf.set_index(index_field, drop=drop, append=append) append = True - pdf = pdf[[col if col in index_columns - else str(i) if label is None else name_like_string(label) - for i, (col, label) - in enumerate(zip(self.data_columns, self.column_labels))]] + pdf = pdf[ + [ + col + if col in index_columns + else str(i) + if label is None + else name_like_string(label) + for i, (col, label) in enumerate(zip(self.data_columns, self.column_labels)) + ] + ] if self.column_labels_level > 1: pdf.columns = pd.MultiIndex.from_tuples(self._column_labels) @@ -779,12 +817,14 @@ def pandas_df(self): index_names = self.index_names if len(index_names) > 0: - pdf.index.names = [name if name is None or len(name) > 1 else name[0] - for name in index_names] + pdf.index.names = [ + name if name is None or len(name) > 1 else name[0] for name in index_names + ] return pdf - def with_new_sdf(self, sdf: spark.DataFrame, - data_columns: Optional[List[str]] = None) -> '_InternalFrame': + def with_new_sdf( + self, sdf: spark.DataFrame, data_columns: Optional[List[str]] = None + ) -> "_InternalFrame": """ Copy the immutable _InternalFrame with the updates by the specified Spark DataFrame. 
:param sdf: the new Spark DataFrame @@ -795,14 +835,19 @@ def with_new_sdf(self, sdf: spark.DataFrame, if data_columns is None: data_columns = self.data_columns else: - assert len(data_columns) == len(self.column_labels), \ - (len(data_columns), len(self.column_labels)) + assert len(data_columns) == len(self.column_labels), ( + len(data_columns), + len(self.column_labels), + ) sdf = sdf.drop(NATURAL_ORDER_COLUMN_NAME) return self.copy(sdf=sdf, column_scols=[scol_for(sdf, col) for col in data_columns]) - def with_new_columns(self, scols_or_ksers: List[Union[spark.Column, 'Series']], - column_labels: Optional[List[Tuple[str, ...]]] = None, - keep_order: bool = True) -> '_InternalFrame': + def with_new_columns( + self, + scols_or_ksers: List[Union[spark.Column, "Series"]], + column_labels: Optional[List[Tuple[str, ...]]] = None, + keep_order: bool = True, + ) -> "_InternalFrame": """ Copy the immutable _InternalFrame with the updates by the specified Spark Columns or Series. @@ -818,8 +863,10 @@ def with_new_columns(self, scols_or_ksers: List[Union[spark.Column, 'Series']], if all(isinstance(scol_or_kser, Series) for scol_or_kser in scols_or_ksers): column_labels = [kser._internal.column_labels[0] for kser in scols_or_ksers] else: - assert len(scols_or_ksers) == len(self.column_labels), \ - (len(scols_or_ksers), len(self.column_labels)) + assert len(scols_or_ksers) == len(self.column_labels), ( + len(scols_or_ksers), + len(self.column_labels), + ) column_labels = [] for scol_or_kser, label in zip(scols_or_ksers, self.column_labels): if isinstance(scol_or_kser, Series): @@ -827,8 +874,10 @@ def with_new_columns(self, scols_or_ksers: List[Union[spark.Column, 'Series']], else: column_labels.append(label) else: - assert len(scols_or_ksers) == len(column_labels), \ - (len(scols_or_ksers), len(column_labels)) + assert len(scols_or_ksers) == len(column_labels), ( + len(scols_or_ksers), + len(column_labels), + ) column_scols = [] for scol_or_kser, label in zip(scols_or_ksers, column_labels): @@ -848,15 +897,17 @@ def with_new_columns(self, scols_or_ksers: List[Union[spark.Column, 'Series']], sdf=sdf, column_labels=column_labels, column_scols=[scol_for(sdf, col) for col in self._sdf.select(column_scols).columns], - scol=None) + scol=None, + ) - def with_filter(self, pred: Union[spark.Column, 'Series']): + def with_filter(self, pred: Union[spark.Column, "Series"]): """ Copy the immutable _InternalFrame with the updates by the predicate. :param pred: the predicate to filter. :return: the copied _InternalFrame. 
""" from databricks.koalas.series import Series + if isinstance(pred, Series): assert isinstance(pred.spark_type, BooleanType), pred.spark_type pred = pred._scol @@ -866,12 +917,15 @@ def with_filter(self, pred: Union[spark.Column, 'Series']): return self.copy(sdf=self._sdf.drop(NATURAL_ORDER_COLUMN_NAME).filter(pred)) - def copy(self, sdf: Union[spark.DataFrame, _NoValueType] = _NoValue, - index_map: Union[List[IndexMap], _NoValueType] = _NoValue, - column_labels: Union[List[Tuple[str, ...]], _NoValueType] = _NoValue, - column_scols: Union[List[spark.Column], _NoValueType] = _NoValue, - column_label_names: Optional[Union[List[str], _NoValueType]] = _NoValue, - scol: Union[spark.Column, _NoValueType] = _NoValue) -> '_InternalFrame': + def copy( + self, + sdf: Union[spark.DataFrame, _NoValueType] = _NoValue, + index_map: Union[List[IndexMap], _NoValueType] = _NoValue, + column_labels: Union[List[Tuple[str, ...]], _NoValueType] = _NoValue, + column_scols: Union[List[spark.Column], _NoValueType] = _NoValue, + column_label_names: Optional[Union[List[str], _NoValueType]] = _NoValue, + scol: Union[spark.Column, _NoValueType] = _NoValue, + ) -> "_InternalFrame": """ Copy the immutable DataFrame. :param sdf: the new Spark DataFrame. If None, then the original one is used. @@ -894,12 +948,17 @@ def copy(self, sdf: Union[spark.DataFrame, _NoValueType] = _NoValue, column_label_names = self._column_label_names if scol is _NoValue: scol = self._scol - return _InternalFrame(sdf, index_map=index_map, column_labels=column_labels, - column_scols=column_scols, column_label_names=column_label_names, - scol=scol) + return _InternalFrame( + sdf, + index_map=index_map, + column_labels=column_labels, + column_scols=column_scols, + column_label_names=column_label_names, + scol=scol, + ) @staticmethod - def from_pandas(pdf: pd.DataFrame) -> '_InternalFrame': + def from_pandas(pdf: pd.DataFrame) -> "_InternalFrame": """ Create an immutable DataFrame from pandas DataFrame. 
:param pdf: :class:`pd.DataFrame` @@ -918,34 +977,48 @@ def from_pandas(pdf: pd.DataFrame) -> '_InternalFrame': index_map = [] # type: List[IndexMap] if isinstance(index, pd.MultiIndex): if index.names is None: - index_map = [(SPARK_INDEX_NAME_FORMAT(i), None) - for i in range(len(index.levels))] + index_map = [(SPARK_INDEX_NAME_FORMAT(i), None) for i in range(len(index.levels))] else: index_map = [ - (SPARK_INDEX_NAME_FORMAT(i) if name is None else name_like_string(name), - name if name is None or isinstance(name, tuple) else (name,)) - for i, name in enumerate(index.names)] + ( + SPARK_INDEX_NAME_FORMAT(i) if name is None else name_like_string(name), + name if name is None or isinstance(name, tuple) else (name,), + ) + for i, name in enumerate(index.names) + ] else: name = index.name - index_map = [(name_like_string(name) - if name is not None else SPARK_DEFAULT_INDEX_NAME, - name if name is None or isinstance(name, tuple) else (name,))] + index_map = [ + ( + name_like_string(name) if name is not None else SPARK_DEFAULT_INDEX_NAME, + name if name is None or isinstance(name, tuple) else (name,), + ) + ] index_columns = [index_column for index_column, _ in index_map] reset_index = pdf.reset_index() reset_index.columns = index_columns + data_columns - schema = StructType([StructField(name_like_string(name), infer_pd_series_spark_type(col), - nullable=bool(col.isnull().any())) - for name, col in reset_index.iteritems()]) + schema = StructType( + [ + StructField( + name_like_string(name), + infer_pd_series_spark_type(col), + nullable=bool(col.isnull().any()), + ) + for name, col in reset_index.iteritems() + ] + ) for name, col in reset_index.iteritems(): dt = col.dtype if is_datetime64_dtype(dt) or is_datetime64tz_dtype(dt): continue reset_index[name] = col.replace({np.nan: None}) sdf = default_session().createDataFrame(reset_index, schema=schema) - return _InternalFrame(sdf=sdf, - index_map=index_map, - column_labels=column_labels, - column_scols=[scol_for(sdf, col) for col in data_columns], - column_label_names=column_label_names) + return _InternalFrame( + sdf=sdf, + index_map=index_map, + column_labels=column_labels, + column_scols=[scol_for(sdf, col) for col in data_columns], + column_label_names=column_label_names, + ) diff --git a/databricks/koalas/missing/__init__.py b/databricks/koalas/missing/__init__.py index 1c788f1..3082d6b 100644 --- a/databricks/koalas/missing/__init__.py +++ b/databricks/koalas/missing/__init__.py @@ -18,28 +18,30 @@ def _unsupported_function(class_name, method_name, deprecated=False, reason=""): - def unsupported_function(*args, **kwargs): - raise PandasNotImplementedError(class_name=class_name, method_name=method_name, - reason=reason) + raise PandasNotImplementedError( + class_name=class_name, method_name=method_name, reason=reason + ) def deprecated_function(*args, **kwargs): - raise PandasNotImplementedError(class_name=class_name, method_name=method_name, - deprecated=deprecated, reason=reason) + raise PandasNotImplementedError( + class_name=class_name, method_name=method_name, deprecated=deprecated, reason=reason + ) return deprecated_function if deprecated else unsupported_function def _unsupported_property(class_name, property_name, deprecated=False, reason=""): - @property def unsupported_property(self): - raise PandasNotImplementedError(class_name=class_name, property_name=property_name, - reason=reason) + raise PandasNotImplementedError( + class_name=class_name, property_name=property_name, reason=reason + ) @property def deprecated_property(self): 
- raise PandasNotImplementedError(class_name=class_name, property_name=property_name, - deprecated=deprecated, reason=reason) + raise PandasNotImplementedError( + class_name=class_name, property_name=property_name, deprecated=deprecated, reason=reason + ) return deprecated_property if deprecated else unsupported_property diff --git a/databricks/koalas/missing/common.py b/databricks/koalas/missing/common.py index 8dd1e45..855b9a6 100644 --- a/databricks/koalas/missing/common.py +++ b/databricks/koalas/missing/common.py @@ -16,41 +16,47 @@ memory_usage = lambda f: f( - 'memory_usage', + "memory_usage", reason="Unlike pandas, most DataFrames are not materialized in memory in Spark " - "(and Koalas), and as a result memory_usage() does not do what you intend it " - "to do. Use Spark's web UI to monitor disk and memory usage of your application.") + "(and Koalas), and as a result memory_usage() does not do what you intend it " + "to do. Use Spark's web UI to monitor disk and memory usage of your application.", +) values = lambda f: f( - 'values', - reason="If you want to collect your data as an NumPy array, use 'to_numpy()' instead.") + "values", reason="If you want to collect your data as an NumPy array, use 'to_numpy()' instead." +) array = lambda f: f( - 'array', - reason="If you want to collect your data as an NumPy array, use 'to_numpy()' instead.") + "array", reason="If you want to collect your data as an NumPy array, use 'to_numpy()' instead." +) to_pickle = lambda f: f( - 'to_pickle', + "to_pickle", reason="For storage, we encourage you to use Delta or Parquet, instead of Python pickle " - "format.") + "format.", +) to_xarray = lambda f: f( - 'to_xarray', - reason="If you want to collect your data as an NumPy array, use 'to_numpy()' instead.") + "to_xarray", + reason="If you want to collect your data as an NumPy array, use 'to_numpy()' instead.", +) to_list = lambda f: f( - 'to_list', - reason="If you want to collect your data as an NumPy array, use 'to_numpy()' instead.") + "to_list", + reason="If you want to collect your data as an NumPy array, use 'to_numpy()' instead.", +) tolist = lambda f: f( - 'tolist', - reason="If you want to collect your data as an NumPy array, use 'to_numpy()' instead.") + "tolist", reason="If you want to collect your data as an NumPy array, use 'to_numpy()' instead." +) __iter__ = lambda f: f( - '__iter__', - reason="If you want to collect your data as an NumPy array, use 'to_numpy()' instead.") + "__iter__", + reason="If you want to collect your data as an NumPy array, use 'to_numpy()' instead.", +) duplicated = lambda f: f( - 'duplicated', + "duplicated", reason="'duplicated' API returns np.ndarray and the data size is too large." 
- "You can just use DataFrame.deduplicated instead") + "You can just use DataFrame.deduplicated instead", +) diff --git a/databricks/koalas/missing/frame.py b/databricks/koalas/missing/frame.py index 9e79aa1..8dad12a 100644 --- a/databricks/koalas/missing/frame.py +++ b/databricks/koalas/missing/frame.py @@ -18,78 +18,80 @@ def unsupported_function(method_name, deprecated=False, reason=""): - return _unsupported_function(class_name='pd.DataFrame', method_name=method_name, - deprecated=deprecated, reason=reason) + return _unsupported_function( + class_name="pd.DataFrame", method_name=method_name, deprecated=deprecated, reason=reason + ) def unsupported_property(property_name, deprecated=False, reason=""): - return _unsupported_property(class_name='pd.DataFrame', property_name=property_name, - deprecated=deprecated, reason=reason) + return _unsupported_property( + class_name="pd.DataFrame", property_name=property_name, deprecated=deprecated, reason=reason + ) class _MissingPandasLikeDataFrame(object): # Functions - align = unsupported_function('align') - asfreq = unsupported_function('asfreq') - asof = unsupported_function('asof') - at_time = unsupported_function('at_time') - between_time = unsupported_function('between_time') - boxplot = unsupported_function('boxplot') - combine = unsupported_function('combine') - combine_first = unsupported_function('combine_first') - corrwith = unsupported_function('corrwith') - cov = unsupported_function('cov') - dot = unsupported_function('dot') - droplevel = unsupported_function('droplevel') - eval = unsupported_function('eval') - ewm = unsupported_function('ewm') - first = unsupported_function('first') - infer_objects = unsupported_function('infer_objects') - insert = unsupported_function('insert') - interpolate = unsupported_function('interpolate') - itertuples = unsupported_function('itertuples') - last = unsupported_function('last') - last_valid_index = unsupported_function('last_valid_index') - lookup = unsupported_function('lookup') - mad = unsupported_function('mad') - mode = unsupported_function('mode') - prod = unsupported_function('prod') - product = unsupported_function('product') - reindex_like = unsupported_function('reindex_like') - rename_axis = unsupported_function('rename_axis') - reorder_levels = unsupported_function('reorder_levels') - resample = unsupported_function('resample') - sem = unsupported_function('sem') - set_axis = unsupported_function('set_axis') - slice_shift = unsupported_function('slice_shift') - squeeze = unsupported_function('squeeze') - stack = unsupported_function('stack') - swapaxes = unsupported_function('swapaxes') - swaplevel = unsupported_function('swaplevel') - tail = unsupported_function('tail') - take = unsupported_function('take') - to_feather = unsupported_function('to_feather') - to_gbq = unsupported_function('to_gbq') - to_hdf = unsupported_function('to_hdf') - to_period = unsupported_function('to_period') - to_sql = unsupported_function('to_sql') - to_stata = unsupported_function('to_stata') - to_timestamp = unsupported_function('to_timestamp') - to_markdown = unsupported_function('to_markdown') - truncate = unsupported_function('truncate') - tshift = unsupported_function('tshift') - tz_convert = unsupported_function('tz_convert') - tz_localize = unsupported_function('tz_localize') - unstack = unsupported_function('unstack') + align = unsupported_function("align") + asfreq = unsupported_function("asfreq") + asof = unsupported_function("asof") + at_time = unsupported_function("at_time") + between_time 
= unsupported_function("between_time") + boxplot = unsupported_function("boxplot") + combine = unsupported_function("combine") + combine_first = unsupported_function("combine_first") + corrwith = unsupported_function("corrwith") + cov = unsupported_function("cov") + dot = unsupported_function("dot") + droplevel = unsupported_function("droplevel") + eval = unsupported_function("eval") + ewm = unsupported_function("ewm") + first = unsupported_function("first") + infer_objects = unsupported_function("infer_objects") + insert = unsupported_function("insert") + interpolate = unsupported_function("interpolate") + itertuples = unsupported_function("itertuples") + last = unsupported_function("last") + last_valid_index = unsupported_function("last_valid_index") + lookup = unsupported_function("lookup") + mad = unsupported_function("mad") + mode = unsupported_function("mode") + prod = unsupported_function("prod") + product = unsupported_function("product") + reindex_like = unsupported_function("reindex_like") + rename_axis = unsupported_function("rename_axis") + reorder_levels = unsupported_function("reorder_levels") + resample = unsupported_function("resample") + sem = unsupported_function("sem") + set_axis = unsupported_function("set_axis") + slice_shift = unsupported_function("slice_shift") + squeeze = unsupported_function("squeeze") + stack = unsupported_function("stack") + swapaxes = unsupported_function("swapaxes") + swaplevel = unsupported_function("swaplevel") + tail = unsupported_function("tail") + take = unsupported_function("take") + to_feather = unsupported_function("to_feather") + to_gbq = unsupported_function("to_gbq") + to_hdf = unsupported_function("to_hdf") + to_period = unsupported_function("to_period") + to_sql = unsupported_function("to_sql") + to_stata = unsupported_function("to_stata") + to_timestamp = unsupported_function("to_timestamp") + to_markdown = unsupported_function("to_markdown") + truncate = unsupported_function("truncate") + tshift = unsupported_function("tshift") + tz_convert = unsupported_function("tz_convert") + tz_localize = unsupported_function("tz_localize") + unstack = unsupported_function("unstack") # Deprecated functions - convert_objects = unsupported_function('convert_objects', deprecated=True) - select = unsupported_function('select', deprecated=True) - to_panel = unsupported_function('to_panel', deprecated=True) - get_values = unsupported_function('get_values', deprecated=True) - compound = unsupported_function('compound', deprecated=True) - reindex_axis = unsupported_function('reindex_axis', deprecated=True) + convert_objects = unsupported_function("convert_objects", deprecated=True) + select = unsupported_function("select", deprecated=True) + to_panel = unsupported_function("to_panel", deprecated=True) + get_values = unsupported_function("get_values", deprecated=True) + compound = unsupported_function("compound", deprecated=True) + reindex_axis = unsupported_function("reindex_axis", deprecated=True) # Properties we won't support. 
values = common.values(unsupported_property) diff --git a/databricks/koalas/missing/groupby.py b/databricks/koalas/missing/groupby.py index cc273aa..f67509e 100644 --- a/databricks/koalas/missing/groupby.py +++ b/databricks/koalas/missing/groupby.py @@ -18,87 +18,95 @@ def unsupported_function(method_name, deprecated=False, reason=""): - return _unsupported_function(class_name='pd.groupby.GroupBy', method_name=method_name, - deprecated=deprecated, reason=reason) + return _unsupported_function( + class_name="pd.groupby.GroupBy", + method_name=method_name, + deprecated=deprecated, + reason=reason, + ) def unsupported_property(property_name, deprecated=False, reason=""): - return _unsupported_property(class_name='pd.groupby.GroupBy', property_name=property_name, - deprecated=deprecated, reason=reason) + return _unsupported_property( + class_name="pd.groupby.GroupBy", + property_name=property_name, + deprecated=deprecated, + reason=reason, + ) class _MissingPandasLikeDataFrameGroupBy(object): # Properties - corr = unsupported_property('corr') - corrwith = unsupported_property('corrwith') - cov = unsupported_property('cov') - dtypes = unsupported_property('dtypes') - groups = unsupported_property('groups') - hist = unsupported_property('hist') - indices = unsupported_property('indices') - mad = unsupported_property('mad') - ngroups = unsupported_property('ngroups') - plot = unsupported_property('plot') - quantile = unsupported_property('quantile') - skew = unsupported_property('skew') - tshift = unsupported_property('tshift') + corr = unsupported_property("corr") + corrwith = unsupported_property("corrwith") + cov = unsupported_property("cov") + dtypes = unsupported_property("dtypes") + groups = unsupported_property("groups") + hist = unsupported_property("hist") + indices = unsupported_property("indices") + mad = unsupported_property("mad") + ngroups = unsupported_property("ngroups") + plot = unsupported_property("plot") + quantile = unsupported_property("quantile") + skew = unsupported_property("skew") + tshift = unsupported_property("tshift") # Deprecated properties - take = unsupported_property('take', deprecated=True) + take = unsupported_property("take", deprecated=True) # Functions - boxplot = unsupported_function('boxplot') - cumcount = unsupported_function('cumcount') - get_group = unsupported_function('get_group') - median = unsupported_function('median') - ngroup = unsupported_function('ngroup') - nth = unsupported_function('nth') - ohlc = unsupported_function('ohlc') - pct_change = unsupported_function('pct_change') - pipe = unsupported_function('pipe') - prod = unsupported_function('prod') - resample = unsupported_function('resample') - sem = unsupported_function('sem') - tail = unsupported_function('tail') + boxplot = unsupported_function("boxplot") + cumcount = unsupported_function("cumcount") + get_group = unsupported_function("get_group") + median = unsupported_function("median") + ngroup = unsupported_function("ngroup") + nth = unsupported_function("nth") + ohlc = unsupported_function("ohlc") + pct_change = unsupported_function("pct_change") + pipe = unsupported_function("pipe") + prod = unsupported_function("prod") + resample = unsupported_function("resample") + sem = unsupported_function("sem") + tail = unsupported_function("tail") class _MissingPandasLikeSeriesGroupBy(object): # Properties - corr = unsupported_property('corr') - cov = unsupported_property('cov') - dtype = unsupported_property('dtype') - groups = unsupported_property('groups') - hist = 
unsupported_property('hist') - indices = unsupported_property('indices') - is_monotonic_decreasing = unsupported_property('is_monotonic_decreasing') - is_monotonic_increasing = unsupported_property('is_monotonic_increasing') - mad = unsupported_property('mad') - ngroups = unsupported_property('ngroups') - plot = unsupported_property('plot') - quantile = unsupported_property('quantile') - skew = unsupported_property('skew') - tshift = unsupported_property('tshift') - unique = unsupported_property('unique') + corr = unsupported_property("corr") + cov = unsupported_property("cov") + dtype = unsupported_property("dtype") + groups = unsupported_property("groups") + hist = unsupported_property("hist") + indices = unsupported_property("indices") + is_monotonic_decreasing = unsupported_property("is_monotonic_decreasing") + is_monotonic_increasing = unsupported_property("is_monotonic_increasing") + mad = unsupported_property("mad") + ngroups = unsupported_property("ngroups") + plot = unsupported_property("plot") + quantile = unsupported_property("quantile") + skew = unsupported_property("skew") + tshift = unsupported_property("tshift") + unique = unsupported_property("unique") # Deprecated properties - take = unsupported_property('take', deprecated=True) + take = unsupported_property("take", deprecated=True) # Functions - agg = unsupported_function('agg') - aggregate = unsupported_function('aggregate') - cumcount = unsupported_function('cumcount') - describe = unsupported_function('describe') - filter = unsupported_function('filter') - get_group = unsupported_function('get_group') - median = unsupported_function('median') - ngroup = unsupported_function('ngroup') - nth = unsupported_function('nth') - ohlc = unsupported_function('ohlc') - pct_change = unsupported_function('pct_change') - pipe = unsupported_function('pipe') - prod = unsupported_function('prod') - resample = unsupported_function('resample') - sem = unsupported_function('sem') - tail = unsupported_function('tail') + agg = unsupported_function("agg") + aggregate = unsupported_function("aggregate") + cumcount = unsupported_function("cumcount") + describe = unsupported_function("describe") + filter = unsupported_function("filter") + get_group = unsupported_function("get_group") + median = unsupported_function("median") + ngroup = unsupported_function("ngroup") + nth = unsupported_function("nth") + ohlc = unsupported_function("ohlc") + pct_change = unsupported_function("pct_change") + pipe = unsupported_function("pipe") + prod = unsupported_function("prod") + resample = unsupported_function("resample") + sem = unsupported_function("sem") + tail = unsupported_function("tail") diff --git a/databricks/koalas/missing/indexes.py b/databricks/koalas/missing/indexes.py index 5054815..e5bcfef 100644 --- a/databricks/koalas/missing/indexes.py +++ b/databricks/koalas/missing/indexes.py @@ -18,65 +18,67 @@ def unsupported_function(method_name, deprecated=False, reason=""): - return _unsupported_function(class_name='pd.Index', method_name=method_name, - deprecated=deprecated, reason=reason) + return _unsupported_function( + class_name="pd.Index", method_name=method_name, deprecated=deprecated, reason=reason + ) def unsupported_property(property_name, deprecated=False, reason=""): - return _unsupported_property(class_name='pd.Index', property_name=property_name, - deprecated=deprecated, reason=reason) + return _unsupported_property( + class_name="pd.Index", property_name=property_name, deprecated=deprecated, reason=reason + ) class 
_MissingPandasLikeIndex(object): # Properties - nbytes = unsupported_property('nbytes') + nbytes = unsupported_property("nbytes") # Functions - argsort = unsupported_function('argsort') - asof = unsupported_function('asof') - asof_locs = unsupported_function('asof_locs') - delete = unsupported_function('delete') - difference = unsupported_function('difference') - droplevel = unsupported_function('droplevel') - factorize = unsupported_function('factorize') - format = unsupported_function('format') - get_indexer = unsupported_function('get_indexer') - get_indexer_for = unsupported_function('get_indexer_for') - get_indexer_non_unique = unsupported_function('get_indexer_non_unique') - get_level_values = unsupported_function('get_level_values') - get_loc = unsupported_function('get_loc') - get_slice_bound = unsupported_function('get_slice_bound') - get_value = unsupported_function('get_value') - groupby = unsupported_function('groupby') - holds_integer = unsupported_function('holds_integer') - insert = unsupported_function('insert') - intersection = unsupported_function('intersection') - is_ = unsupported_function('is_') - is_lexsorted_for_tuple = unsupported_function('is_lexsorted_for_tuple') - is_mixed = unsupported_function('is_mixed') - is_type_compatible = unsupported_function('is_type_compatible') - join = unsupported_function('join') - map = unsupported_function('map') - putmask = unsupported_function('putmask') - ravel = unsupported_function('ravel') - reindex = unsupported_function('reindex') - repeat = unsupported_function('repeat') - searchsorted = unsupported_function('searchsorted') - slice_indexer = unsupported_function('slice_indexer') - slice_locs = unsupported_function('slice_locs') - sortlevel = unsupported_function('sortlevel') - take = unsupported_function('take') - to_flat_index = unsupported_function('to_flat_index') - to_native_types = unsupported_function('to_native_types') - union = unsupported_function('union') - view = unsupported_function('view') - where = unsupported_function('where') + argsort = unsupported_function("argsort") + asof = unsupported_function("asof") + asof_locs = unsupported_function("asof_locs") + delete = unsupported_function("delete") + difference = unsupported_function("difference") + droplevel = unsupported_function("droplevel") + factorize = unsupported_function("factorize") + format = unsupported_function("format") + get_indexer = unsupported_function("get_indexer") + get_indexer_for = unsupported_function("get_indexer_for") + get_indexer_non_unique = unsupported_function("get_indexer_non_unique") + get_level_values = unsupported_function("get_level_values") + get_loc = unsupported_function("get_loc") + get_slice_bound = unsupported_function("get_slice_bound") + get_value = unsupported_function("get_value") + groupby = unsupported_function("groupby") + holds_integer = unsupported_function("holds_integer") + insert = unsupported_function("insert") + intersection = unsupported_function("intersection") + is_ = unsupported_function("is_") + is_lexsorted_for_tuple = unsupported_function("is_lexsorted_for_tuple") + is_mixed = unsupported_function("is_mixed") + is_type_compatible = unsupported_function("is_type_compatible") + join = unsupported_function("join") + map = unsupported_function("map") + putmask = unsupported_function("putmask") + ravel = unsupported_function("ravel") + reindex = unsupported_function("reindex") + repeat = unsupported_function("repeat") + searchsorted = unsupported_function("searchsorted") + slice_indexer = 
unsupported_function("slice_indexer") + slice_locs = unsupported_function("slice_locs") + sortlevel = unsupported_function("sortlevel") + take = unsupported_function("take") + to_flat_index = unsupported_function("to_flat_index") + to_native_types = unsupported_function("to_native_types") + union = unsupported_function("union") + view = unsupported_function("view") + where = unsupported_function("where") # Deprecated functions - get_values = unsupported_function('get_values', deprecated=True) - item = unsupported_function('item', deprecated=True) - set_value = unsupported_function('set_value') + get_values = unsupported_function("get_values", deprecated=True) + item = unsupported_function("item", deprecated=True) + set_value = unsupported_function("set_value") # Properties we won't support. values = common.values(unsupported_property) @@ -93,80 +95,82 @@ class _MissingPandasLikeIndex(object): class _MissingPandasLikeMultiIndex(object): # Deprecated properties - strides = unsupported_property('strides', deprecated=True) - data = unsupported_property('data', deprecated=True) - itemsize = unsupported_property('itemsize', deprecated=True) + strides = unsupported_property("strides", deprecated=True) + data = unsupported_property("data", deprecated=True) + itemsize = unsupported_property("itemsize", deprecated=True) # Functions - argsort = unsupported_function('argsort') - asof = unsupported_function('asof') - asof_locs = unsupported_function('asof_locs') - delete = unsupported_function('delete') - difference = unsupported_function('difference') - droplevel = unsupported_function('droplevel') - equal_levels = unsupported_function('equal_levels') - factorize = unsupported_function('factorize') - format = unsupported_function('format') - get_indexer = unsupported_function('get_indexer') - get_indexer_for = unsupported_function('get_indexer_for') - get_indexer_non_unique = unsupported_function('get_indexer_non_unique') - get_level_values = unsupported_function('get_level_values') - get_loc = unsupported_function('get_loc') - get_loc_level = unsupported_function('get_loc_level') - get_locs = unsupported_function('get_locs') - get_slice_bound = unsupported_function('get_slice_bound') - get_value = unsupported_function('get_value') - groupby = unsupported_function('groupby') - holds_integer = unsupported_function('holds_integer') - insert = unsupported_function('insert') - intersection = unsupported_function('intersection') - is_ = unsupported_function('is_') - is_lexsorted = unsupported_function('is_lexsorted') - is_lexsorted_for_tuple = unsupported_function('is_lexsorted_for_tuple') - is_mixed = unsupported_function('is_mixed') - is_type_compatible = unsupported_function('is_type_compatible') - join = unsupported_function('join') - map = unsupported_function('map') - putmask = unsupported_function('putmask') - ravel = unsupported_function('ravel') - reindex = unsupported_function('reindex') - remove_unused_levels = unsupported_function('remove_unused_levels') - reorder_levels = unsupported_function('reorder_levels') - repeat = unsupported_function('repeat') - searchsorted = unsupported_function('searchsorted') - set_codes = unsupported_function('set_codes') - set_levels = unsupported_function('set_levels') - slice_indexer = unsupported_function('slice_indexer') - slice_locs = unsupported_function('slice_locs') - sortlevel = unsupported_function('sortlevel') - take = unsupported_function('take') - to_flat_index = unsupported_function('to_flat_index') - to_native_types = 
unsupported_function('to_native_types') - truncate = unsupported_function('truncate') - union = unsupported_function('union') - view = unsupported_function('view') - where = unsupported_function('where') + argsort = unsupported_function("argsort") + asof = unsupported_function("asof") + asof_locs = unsupported_function("asof_locs") + delete = unsupported_function("delete") + difference = unsupported_function("difference") + droplevel = unsupported_function("droplevel") + equal_levels = unsupported_function("equal_levels") + factorize = unsupported_function("factorize") + format = unsupported_function("format") + get_indexer = unsupported_function("get_indexer") + get_indexer_for = unsupported_function("get_indexer_for") + get_indexer_non_unique = unsupported_function("get_indexer_non_unique") + get_level_values = unsupported_function("get_level_values") + get_loc = unsupported_function("get_loc") + get_loc_level = unsupported_function("get_loc_level") + get_locs = unsupported_function("get_locs") + get_slice_bound = unsupported_function("get_slice_bound") + get_value = unsupported_function("get_value") + groupby = unsupported_function("groupby") + holds_integer = unsupported_function("holds_integer") + insert = unsupported_function("insert") + intersection = unsupported_function("intersection") + is_ = unsupported_function("is_") + is_lexsorted = unsupported_function("is_lexsorted") + is_lexsorted_for_tuple = unsupported_function("is_lexsorted_for_tuple") + is_mixed = unsupported_function("is_mixed") + is_type_compatible = unsupported_function("is_type_compatible") + join = unsupported_function("join") + map = unsupported_function("map") + putmask = unsupported_function("putmask") + ravel = unsupported_function("ravel") + reindex = unsupported_function("reindex") + remove_unused_levels = unsupported_function("remove_unused_levels") + reorder_levels = unsupported_function("reorder_levels") + repeat = unsupported_function("repeat") + searchsorted = unsupported_function("searchsorted") + set_codes = unsupported_function("set_codes") + set_levels = unsupported_function("set_levels") + slice_indexer = unsupported_function("slice_indexer") + slice_locs = unsupported_function("slice_locs") + sortlevel = unsupported_function("sortlevel") + take = unsupported_function("take") + to_flat_index = unsupported_function("to_flat_index") + to_native_types = unsupported_function("to_native_types") + truncate = unsupported_function("truncate") + union = unsupported_function("union") + view = unsupported_function("view") + where = unsupported_function("where") # Deprecated functions - get_duplicates = unsupported_function('get_duplicates', deprecated=True) - get_values = unsupported_function('get_values', deprecated=True) - item = unsupported_function('item', deprecated=True) - set_value = unsupported_function('set_value', deprecated=True) + get_duplicates = unsupported_function("get_duplicates", deprecated=True) + get_values = unsupported_function("get_values", deprecated=True) + item = unsupported_function("item", deprecated=True) + set_value = unsupported_function("set_value", deprecated=True) # Functions we won't support. values = common.values(unsupported_property) array = common.array(unsupported_property) duplicated = common.duplicated(unsupported_property) codes = unsupported_property( - 'codes', + "codes", reason="'codes' requires to collect all data into the driver which is against the " - "design principle of Koalas. 
Alternatively, you could call 'to_pandas()' and" - " use 'codes' property in pandas.") + "design principle of Koalas. Alternatively, you could call 'to_pandas()' and" + " use 'codes' property in pandas.", + ) levels = unsupported_property( - 'levels', + "levels", reason="'levels' requires to collect all data into the driver which is against the " - "design principle of Koalas. Alternatively, you could call 'to_pandas()' and" - " use 'levels' property in pandas.") + "design principle of Koalas. Alternatively, you could call 'to_pandas()' and" + " use 'levels' property in pandas.", + ) __iter__ = common.__iter__(unsupported_function) # Properties we won't support. diff --git a/databricks/koalas/missing/series.py b/databricks/koalas/missing/series.py index e882708..e3720a9 100644 --- a/databricks/koalas/missing/series.py +++ b/databricks/koalas/missing/series.py @@ -18,89 +18,92 @@ def unsupported_function(method_name, deprecated=False, reason=""): - return _unsupported_function(class_name='pd.Series', method_name=method_name, - deprecated=deprecated, reason=reason) + return _unsupported_function( + class_name="pd.Series", method_name=method_name, deprecated=deprecated, reason=reason + ) def unsupported_property(property_name, deprecated=False, reason=""): - return _unsupported_property(class_name='pd.Series', property_name=property_name, - deprecated=deprecated, reason=reason) + return _unsupported_property( + class_name="pd.Series", property_name=property_name, deprecated=deprecated, reason=reason + ) class _MissingPandasLikeSeries(object): # Functions - align = unsupported_function('align') - argsort = unsupported_function('argsort') - asfreq = unsupported_function('asfreq') - asof = unsupported_function('asof') - at_time = unsupported_function('at_time') - autocorr = unsupported_function('autocorr') - between_time = unsupported_function('between_time') - bfill = unsupported_function('bfill') - combine = unsupported_function('combine') - combine_first = unsupported_function('combine_first') - cov = unsupported_function('cov') - divmod = unsupported_function('divmod') - dot = unsupported_function('dot') - droplevel = unsupported_function('droplevel') - ewm = unsupported_function('ewm') - factorize = unsupported_function('factorize') - ffill = unsupported_function('ffill') - filter = unsupported_function('filter') - first = unsupported_function('first') - infer_objects = unsupported_function('infer_objects') - interpolate = unsupported_function('interpolate') - item = unsupported_function('item') - items = unsupported_function('items') - iteritems = unsupported_function('iteritems') - last = unsupported_function('last') - last_valid_index = unsupported_function('last_valid_index') - mad = unsupported_function('mad') - prod = unsupported_function('prod') - product = unsupported_function('product') - rdivmod = unsupported_function('rdivmod') - reindex = unsupported_function('reindex') - reindex_like = unsupported_function('reindex_like') - rename_axis = unsupported_function('rename_axis') - reorder_levels = unsupported_function('reorder_levels') - repeat = unsupported_function('repeat') - resample = unsupported_function('resample') - searchsorted = unsupported_function('searchsorted') - sem = unsupported_function('sem') - set_axis = unsupported_function('set_axis') - slice_shift = unsupported_function('slice_shift') - squeeze = unsupported_function('squeeze') - swapaxes = unsupported_function('swapaxes') - swaplevel = unsupported_function('swaplevel') - tail = unsupported_function('tail') - 
take = unsupported_function('take') - to_hdf = unsupported_function('to_hdf') - to_period = unsupported_function('to_period') - to_sql = unsupported_function('to_sql') - to_timestamp = unsupported_function('to_timestamp') - tshift = unsupported_function('tshift') - tz_convert = unsupported_function('tz_convert') - tz_localize = unsupported_function('tz_localize') - unstack = unsupported_function('unstack') - view = unsupported_function('view') + align = unsupported_function("align") + argsort = unsupported_function("argsort") + asfreq = unsupported_function("asfreq") + asof = unsupported_function("asof") + at_time = unsupported_function("at_time") + autocorr = unsupported_function("autocorr") + between_time = unsupported_function("between_time") + bfill = unsupported_function("bfill") + combine = unsupported_function("combine") + combine_first = unsupported_function("combine_first") + cov = unsupported_function("cov") + divmod = unsupported_function("divmod") + dot = unsupported_function("dot") + droplevel = unsupported_function("droplevel") + ewm = unsupported_function("ewm") + factorize = unsupported_function("factorize") + ffill = unsupported_function("ffill") + filter = unsupported_function("filter") + first = unsupported_function("first") + infer_objects = unsupported_function("infer_objects") + interpolate = unsupported_function("interpolate") + item = unsupported_function("item") + items = unsupported_function("items") + iteritems = unsupported_function("iteritems") + last = unsupported_function("last") + last_valid_index = unsupported_function("last_valid_index") + mad = unsupported_function("mad") + prod = unsupported_function("prod") + product = unsupported_function("product") + rdivmod = unsupported_function("rdivmod") + reindex = unsupported_function("reindex") + reindex_like = unsupported_function("reindex_like") + rename_axis = unsupported_function("rename_axis") + reorder_levels = unsupported_function("reorder_levels") + repeat = unsupported_function("repeat") + resample = unsupported_function("resample") + searchsorted = unsupported_function("searchsorted") + sem = unsupported_function("sem") + set_axis = unsupported_function("set_axis") + slice_shift = unsupported_function("slice_shift") + squeeze = unsupported_function("squeeze") + swapaxes = unsupported_function("swapaxes") + swaplevel = unsupported_function("swaplevel") + tail = unsupported_function("tail") + take = unsupported_function("take") + to_hdf = unsupported_function("to_hdf") + to_period = unsupported_function("to_period") + to_sql = unsupported_function("to_sql") + to_timestamp = unsupported_function("to_timestamp") + tshift = unsupported_function("tshift") + tz_convert = unsupported_function("tz_convert") + tz_localize = unsupported_function("tz_localize") + unstack = unsupported_function("unstack") + view = unsupported_function("view") # Deprecated functions - convert_objects = unsupported_function('convert_objects', deprecated=True) - nonzero = unsupported_function('nonzero', deprecated=True) - reindex_axis = unsupported_function('reindex_axis', deprecated=True) - select = unsupported_function('select', deprecated=True) - get_values = unsupported_function('get_values', deprecated=True) + convert_objects = unsupported_function("convert_objects", deprecated=True) + nonzero = unsupported_function("nonzero", deprecated=True) + reindex_axis = unsupported_function("reindex_axis", deprecated=True) + select = unsupported_function("select", deprecated=True) + get_values = unsupported_function("get_values", 
deprecated=True) # Properties we won't support. values = common.values(unsupported_property) array = common.array(unsupported_property) duplicated = common.duplicated(unsupported_property) nbytes = unsupported_property( - 'nbytes', + "nbytes", reason="'nbytes' requires to compute whole dataset. You can calculate manually it, " - "with its 'itemsize', by explicitly executing its count. Use Spark's web UI " - "to monitor disk and memory usage of your application in general.") + "with its 'itemsize', by explicitly executing its count. Use Spark's web UI " + "to monitor disk and memory usage of your application in general.", + ) # Functions we won't support. memory_usage = common.memory_usage(unsupported_function) @@ -108,6 +111,7 @@ class _MissingPandasLikeSeries(object): to_xarray = common.to_xarray(unsupported_function) __iter__ = common.__iter__(unsupported_function) ravel = unsupported_function( - 'ravel', + "ravel", reason="If you want to collect your flattened underlying data as an NumPy array, " - "use 'to_numpy().ravel()' instead.") + "use 'to_numpy().ravel()' instead.", + ) diff --git a/databricks/koalas/missing/window.py b/databricks/koalas/missing/window.py index c1ae735..5947fb4 100644 --- a/databricks/koalas/missing/window.py +++ b/databricks/koalas/missing/window.py @@ -18,25 +18,39 @@ def unsupported_function_expanding(method_name, deprecated=False, reason=""): - return _unsupported_function(class_name='pandas.core.window.Expanding', method_name=method_name, - deprecated=deprecated, reason=reason) + return _unsupported_function( + class_name="pandas.core.window.Expanding", + method_name=method_name, + deprecated=deprecated, + reason=reason, + ) def unsupported_property_expanding(property_name, deprecated=False, reason=""): return _unsupported_property( - class_name='pandas.core.window.Expanding', property_name=property_name, - deprecated=deprecated, reason=reason) + class_name="pandas.core.window.Expanding", + property_name=property_name, + deprecated=deprecated, + reason=reason, + ) def unsupported_function_rolling(method_name, deprecated=False, reason=""): - return _unsupported_function(class_name='pandas.core.window.Rolling', method_name=method_name, - deprecated=deprecated, reason=reason) + return _unsupported_function( + class_name="pandas.core.window.Rolling", + method_name=method_name, + deprecated=deprecated, + reason=reason, + ) def unsupported_property_rolling(property_name, deprecated=False, reason=""): return _unsupported_property( - class_name='pandas.core.window.Rolling', property_name=property_name, - deprecated=deprecated, reason=reason) + class_name="pandas.core.window.Rolling", + property_name=property_name, + deprecated=deprecated, + reason=reason, + ) class _MissingPandasLikeExpanding(object): diff --git a/databricks/koalas/ml.py b/databricks/koalas/ml.py index 15ff1b7..d50caef 100644 --- a/databricks/koalas/ml.py +++ b/databricks/koalas/ml.py @@ -29,10 +29,10 @@ import databricks.koalas as ks -CORRELATION_OUTPUT_COLUMN = '__correlation_output__' +CORRELATION_OUTPUT_COLUMN = "__correlation_output__" -def corr(kdf: 'ks.DataFrame', method: str = 'pearson') -> pd.DataFrame: +def corr(kdf: "ks.DataFrame", method: str = "pearson") -> pd.DataFrame: """ The correlation matrix of all the numerical columns of this dataframe. 
@@ -49,7 +49,7 @@ def corr(kdf: 'ks.DataFrame', method: str = 'pearson') -> pd.DataFrame: A 1.0 -1.0 B -1.0 1.0 """ - assert method in ('pearson', 'spearman') + assert method in ("pearson", "spearman") ndf, column_labels = to_numeric_df(kdf) corr = Correlation.corr(ndf, CORRELATION_OUTPUT_COLUMN, method) pcorr = corr.toPandas() @@ -61,7 +61,7 @@ def corr(kdf: 'ks.DataFrame', method: str = 'pearson') -> pd.DataFrame: return pd.DataFrame(arr, columns=idx, index=idx) -def to_numeric_df(kdf: 'ks.DataFrame') -> Tuple[pyspark.sql.DataFrame, List[Tuple[str, ...]]]: +def to_numeric_df(kdf: "ks.DataFrame") -> Tuple[pyspark.sql.DataFrame, List[Tuple[str, ...]]]: """ Takes a dataframe and turns it into a dataframe containing a single numerical vector of doubles. This dataframe has a single field called '_1'. @@ -75,10 +75,13 @@ def to_numeric_df(kdf: 'ks.DataFrame') -> Tuple[pyspark.sql.DataFrame, List[Tupl (DataFrame[__correlation_output__: vector], [('A',), ('B',)]) """ # TODO, it should be more robust. - accepted_types = {np.dtype(dt) for dt in [np.int8, np.int16, np.int32, np.int64, - np.float32, np.float64, np.bool_]} - numeric_column_labels = [label for label in kdf._internal.column_labels - if kdf[label].dtype in accepted_types] + accepted_types = { + np.dtype(dt) + for dt in [np.int8, np.int16, np.int32, np.int64, np.float32, np.float64, np.bool_] + } + numeric_column_labels = [ + label for label in kdf._internal.column_labels if kdf[label].dtype in accepted_types + ] numeric_df = kdf._sdf.select(*[kdf._internal.scol_for(idx) for idx in numeric_column_labels]) va = VectorAssembler(inputCols=numeric_df.columns, outputCol=CORRELATION_OUTPUT_COLUMN) v = va.transform(numeric_df).select(CORRELATION_OUTPUT_COLUMN) diff --git a/databricks/koalas/mlflow.py b/databricks/koalas/mlflow.py index bea5acf..4ac6a07 100644 --- a/databricks/koalas/mlflow.py +++ b/databricks/koalas/mlflow.py @@ -37,6 +37,7 @@ class PythonModelWrapper(object): This wrapper acts as a predictor on koalas """ + def __init__(self, model_uri, return_type_hint): self._model_uri = model_uri # type: str self._return_type_hint = return_type_hint @@ -50,7 +51,7 @@ def _return_type(self) -> DataType: # return an integer or a categorical) # We can do the same for pytorch/tensorflow/keras models by looking at the output types. # However, this is probably better done in mlflow than here. - if hint == 'infer' or not hint: + if hint == "infer" or not hint: hint = np.float64 return as_spark_type(hint) @@ -90,12 +91,12 @@ def predict(self, data): # s = F.struct(*data.columns) # return_col = self._model_udf(s) column_labels = [(col,) for col in data._sdf.select(return_col).columns] - return Series(data._internal.copy(scol=return_col, - column_labels=column_labels), - anchor=data) + return Series( + data._internal.copy(scol=return_col, column_labels=column_labels), anchor=data + ) -def load_model(model_uri, predict_type='infer') -> PythonModelWrapper: +def load_model(model_uri, predict_type="infer") -> PythonModelWrapper: """ Loads an MLflow model into an wrapper that can be used both for pandas and Koalas DataFrame. 
diff --git a/databricks/koalas/namespace.py b/databricks/koalas/namespace.py index 7a057d2..b20aa27 100644 --- a/databricks/koalas/namespace.py +++ b/databricks/koalas/namespace.py @@ -29,8 +29,20 @@ from pandas.api.types import is_list_like from pyspark import sql as spark from pyspark.sql import functions as F -from pyspark.sql.types import ByteType, ShortType, IntegerType, LongType, FloatType, \ - DoubleType, BooleanType, TimestampType, DecimalType, StringType, DateType, StructType +from pyspark.sql.types import ( + ByteType, + ShortType, + IntegerType, + LongType, + FloatType, + DoubleType, + BooleanType, + TimestampType, + DecimalType, + StringType, + DateType, + StructType, +) from databricks import koalas as ks # For running doctests and reference resolution in PyCharm. from databricks.koalas.base import IndexOpsMixin @@ -41,13 +53,35 @@ from databricks.koalas.series import Series, _col -__all__ = ["from_pandas", "range", "read_csv", "read_delta", "read_table", "read_spark_io", - "read_parquet", "read_clipboard", "read_excel", "read_html", "to_datetime", - "get_dummies", "concat", "melt", "isna", "isnull", "notna", "notnull", - "read_sql_table", "read_sql_query", "read_sql", "read_json", "merge", "to_numeric"] - - -def from_pandas(pobj: Union['pd.DataFrame', 'pd.Series']) -> Union['Series', 'DataFrame']: +__all__ = [ + "from_pandas", + "range", + "read_csv", + "read_delta", + "read_table", + "read_spark_io", + "read_parquet", + "read_clipboard", + "read_excel", + "read_html", + "to_datetime", + "get_dummies", + "concat", + "melt", + "isna", + "isnull", + "notna", + "notnull", + "read_sql_table", + "read_sql_query", + "read_sql", + "read_json", + "merge", + "to_numeric", +] + + +def from_pandas(pobj: Union["pd.DataFrame", "pd.Series"]) -> Union["Series", "DataFrame"]: """Create a Koalas DataFrame or Series from a pandas DataFrame or Series. This is similar to Spark's `SparkSession.createDataFrame()` with pandas DataFrame, @@ -74,10 +108,9 @@ def from_pandas(pobj: Union['pd.DataFrame', 'pd.Series']) -> Union['Series', 'Da raise ValueError("Unknown data type: {}".format(type(pobj))) -def range(start: int, - end: Optional[int] = None, - step: int = 1, - num_partitions: Optional[int] = None) -> DataFrame: +def range( + start: int, end: Optional[int] = None, step: int = 1, num_partitions: Optional[int] = None +) -> DataFrame: """ Create a DataFrame with some range of numbers. @@ -128,9 +161,22 @@ def range(start: int, return DataFrame(sdf) -def read_csv(path, sep=',', header='infer', names=None, index_col=None, - usecols=None, squeeze=False, mangle_dupe_cols=True, dtype=None, - parse_dates=False, quotechar=None, escapechar=None, comment=None, **options): +def read_csv( + path, + sep=",", + header="infer", + names=None, + index_col=None, + usecols=None, + squeeze=False, + mangle_dupe_cols=True, + dtype=None, + parse_dates=False, + quotechar=None, + escapechar=None, + comment=None, + **options +): """Read CSV (comma-separated) file into DataFrame. 
Parameters @@ -207,7 +253,7 @@ def read_csv(path, sep=',', header='infer', names=None, index_col=None, reader.option("inferSchema", True) reader.option("sep", sep) - if header == 'infer': + if header == "infer": header = 0 if names is None else None if header == 0: reader.option("header", True) @@ -231,18 +277,22 @@ def read_csv(path, sep=',', header='infer', names=None, index_col=None, else: sdf = reader.csv(path) if header is None: - sdf = sdf.selectExpr(*["`%s` as `%s`" % (field.name, i) - for i, field in enumerate(sdf.schema)]) + sdf = sdf.selectExpr( + *["`%s` as `%s`" % (field.name, i) for i, field in enumerate(sdf.schema)] + ) if isinstance(names, list): names = list(names) if len(set(names)) != len(names): - raise ValueError('Found non-unique column index') + raise ValueError("Found non-unique column index") if len(names) != len(sdf.schema): - raise ValueError('The number of names [%s] does not match the number ' - 'of columns [%d]. Try names by a Spark SQL DDL-formatted ' - 'string.' % (len(sdf.schema), len(names))) - sdf = sdf.selectExpr(*["`%s` as `%s`" % (field.name, name) - for field, name in zip(sdf.schema, names)]) + raise ValueError( + "The number of names [%s] does not match the number " + "of columns [%d]. Try names by a Spark SQL DDL-formatted " + "string." % (len(sdf.schema), len(names)) + ) + sdf = sdf.selectExpr( + *["`%s` as `%s`" % (field.name, name) for field, name in zip(sdf.schema, names)] + ) if usecols is not None: if callable(usecols): @@ -250,17 +300,23 @@ def read_csv(path, sep=',', header='infer', names=None, index_col=None, missing = [] elif all(isinstance(col, int) for col in usecols): cols = [field.name for i, field in enumerate(sdf.schema) if i in usecols] - missing = [col for col in usecols - if col >= len(sdf.schema) or sdf.schema[col].name not in cols] + missing = [ + col + for col in usecols + if col >= len(sdf.schema) or sdf.schema[col].name not in cols + ] elif all(isinstance(col, str) for col in usecols): cols = [field.name for field in sdf.schema if field.name in usecols] missing = [col for col in usecols if col not in cols] else: - raise ValueError("'usecols' must either be list-like of all strings, " - "all unicode, all integers or a callable.") + raise ValueError( + "'usecols' must either be list-like of all strings, " + "all unicode, all integers or a callable." + ) if len(missing) > 0: - raise ValueError('Usecols do not match columns, columns expected but not ' - 'found: %s' % missing) + raise ValueError( + "Usecols do not match columns, columns expected but not " "found: %s" % missing + ) if len(cols) > 0: sdf = sdf.select(cols) @@ -319,11 +375,16 @@ def read_json(path: str, index_col: Optional[Union[str, List[str]]] = None, **op 0 a b 1 c d """ - return read_spark_io(path, format='json', index_col=index_col, **options) + return read_spark_io(path, format="json", index_col=index_col, **options) -def read_delta(path: str, version: Optional[str] = None, timestamp: Optional[str] = None, - index_col: Optional[Union[str, List[str]]] = None, **options) -> DataFrame: +def read_delta( + path: str, + version: Optional[str] = None, + timestamp: Optional[str] = None, + index_col: Optional[Union[str, List[str]]] = None, + **options +) -> DataFrame: """ Read a Delta Lake table on some file system and return a DataFrame. 
@@ -377,10 +438,10 @@ def read_delta(path: str, version: Optional[str] = None, timestamp: Optional[str 0 0 """ if version is not None: - options['versionAsOf'] = version + options["versionAsOf"] = version if timestamp is not None: - options['timestampAsOf'] = timestamp - return read_spark_io(path, format='delta', index_col=index_col, **options) + options["timestampAsOf"] = timestamp + return read_spark_io(path, format="delta", index_col=index_col, **options) def read_table(name: str, index_col: Optional[Union[str, List[str]]] = None) -> DataFrame: @@ -419,10 +480,13 @@ def read_table(name: str, index_col: Optional[Union[str, List[str]]] = None) -> return DataFrame(_InternalFrame(sdf=sdf, index_map=index_map)) -def read_spark_io(path: Optional[str] = None, format: Optional[str] = None, - schema: Union[str, 'StructType'] = None, - index_col: Optional[Union[str, List[str]]] = None, - **options) -> DataFrame: +def read_spark_io( + path: Optional[str] = None, + format: Optional[str] = None, + schema: Union[str, "StructType"] = None, + index_col: Optional[Union[str, List[str]]] = None, + **options +) -> DataFrame: """Load a DataFrame from a Spark data source. Parameters @@ -527,7 +591,7 @@ def read_parquet(path, columns=None, index_col=None) -> DataFrame: return DataFrame(_InternalFrame(sdf=sdf, index_map=index_map)) -def read_clipboard(sep=r'\s+', **kwargs): +def read_clipboard(sep=r"\s+", **kwargs): r""" Read text from clipboard and pass to read_csv. See read_csv for the full argument list @@ -549,11 +613,33 @@ def read_clipboard(sep=r'\s+', **kwargs): return from_pandas(pd.read_clipboard(sep, **kwargs)) -def read_excel(io, sheet_name=0, header=0, names=None, index_col=None, usecols=None, squeeze=False, - dtype=None, engine=None, converters=None, true_values=None, false_values=None, - skiprows=None, nrows=None, na_values=None, keep_default_na=True, verbose=False, - parse_dates=False, date_parser=None, thousands=None, comment=None, skipfooter=0, - convert_float=True, mangle_dupe_cols=True, **kwds): +def read_excel( + io, + sheet_name=0, + header=0, + names=None, + index_col=None, + usecols=None, + squeeze=False, + dtype=None, + engine=None, + converters=None, + true_values=None, + false_values=None, + skiprows=None, + nrows=None, + na_values=None, + keep_default_na=True, + verbose=False, + parse_dates=False, + date_parser=None, + thousands=None, + comment=None, + skipfooter=0, + convert_float=True, + mangle_dupe_cols=True, + **kwds +): """ Read an Excel file into a Koalas DataFrame. 
@@ -751,24 +837,55 @@ def read_excel(io, sheet_name=0, header=0, names=None, index_col=None, usecols=N 2 None NaN """ pdfs = pd.read_excel( - io=io, sheet_name=sheet_name, header=header, names=names, index_col=index_col, - usecols=usecols, squeeze=squeeze, dtype=dtype, engine=engine, converters=converters, - true_values=true_values, false_values=false_values, skiprows=skiprows, nrows=nrows, - na_values=na_values, keep_default_na=keep_default_na, verbose=verbose, - parse_dates=parse_dates, date_parser=date_parser, thousands=thousands, comment=comment, - skipfooter=skipfooter, convert_float=convert_float, mangle_dupe_cols=mangle_dupe_cols, - **kwds) + io=io, + sheet_name=sheet_name, + header=header, + names=names, + index_col=index_col, + usecols=usecols, + squeeze=squeeze, + dtype=dtype, + engine=engine, + converters=converters, + true_values=true_values, + false_values=false_values, + skiprows=skiprows, + nrows=nrows, + na_values=na_values, + keep_default_na=keep_default_na, + verbose=verbose, + parse_dates=parse_dates, + date_parser=date_parser, + thousands=thousands, + comment=comment, + skipfooter=skipfooter, + convert_float=convert_float, + mangle_dupe_cols=mangle_dupe_cols, + **kwds + ) if isinstance(pdfs, dict): return OrderedDict([(key, from_pandas(value)) for key, value in pdfs.items()]) else: return from_pandas(pdfs) -def read_html(io, match='.+', flavor=None, header=None, index_col=None, - skiprows=None, attrs=None, parse_dates=False, - thousands=',', encoding=None, - decimal='.', converters=None, na_values=None, - keep_default_na=True, displayed_only=True): +def read_html( + io, + match=".+", + flavor=None, + header=None, + index_col=None, + skiprows=None, + attrs=None, + parse_dates=False, + thousands=",", + encoding=None, + decimal=".", + converters=None, + na_values=None, + keep_default_na=True, + displayed_only=True, +): r"""Read HTML tables into a ``list`` of ``DataFrame`` objects. 
Parameters @@ -869,10 +986,22 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, DataFrame.to_html """ pdfs = pd.read_html( - io=io, match=match, flavor=flavor, header=header, index_col=index_col, skiprows=skiprows, - attrs=attrs, parse_dates=parse_dates, thousands=thousands, encoding=encoding, - decimal=decimal, converters=converters, na_values=na_values, - keep_default_na=keep_default_na, displayed_only=displayed_only) + io=io, + match=match, + flavor=flavor, + header=header, + index_col=index_col, + skiprows=skiprows, + attrs=attrs, + parse_dates=parse_dates, + thousands=thousands, + encoding=encoding, + decimal=decimal, + converters=converters, + na_values=na_values, + keep_default_na=keep_default_na, + displayed_only=displayed_only, + ) return [from_pandas(pdf) for pdf in pdfs] @@ -918,8 +1047,8 @@ def read_sql_table(table_name, con, schema=None, index_col=None, columns=None, * >>> ks.read_sql_table('table_name', 'jdbc:postgresql:db_name') # doctest: +SKIP """ reader = default_session().read - reader.option('dbtable', table_name) - reader.option('url', con) + reader.option("dbtable", table_name) + reader.option("url", con) if schema is not None: reader.schema(schema) reader.options(**options) @@ -971,8 +1100,8 @@ def read_sql_query(sql, con, index_col=None, **options): >>> ks.read_sql_query('SELECT * FROM table_name', 'jdbc:postgresql:db_name') # doctest: +SKIP """ reader = default_session().read - reader.option('query', sql) - reader.option('url', con) + reader.option("query", sql) + reader.option("url", con) reader.options(**options) sdf = reader.format("jdbc").load() index_map = _get_index_map(sdf, index_col) @@ -1025,14 +1154,15 @@ def read_sql(sql, con, index_col=None, columns=None, **options): >>> ks.read_sql('SELECT * FROM table_name', 'jdbc:postgresql:db_name') # doctest: +SKIP """ striped = sql.strip() - if ' ' not in striped: # TODO: identify the table name or not more precisely. + if " " not in striped: # TODO: identify the table name or not more precisely. return read_sql_table(sql, con, index_col=index_col, columns=columns, **options) else: return read_sql_query(sql, con, index_col=index_col, **options) -def to_datetime(arg, errors='raise', format=None, unit=None, infer_datetime_format=False, - origin='unix'): +def to_datetime( + arg, errors="raise", format=None, unit=None, infer_datetime_format=False, origin="unix" +): """ Convert argument to datetime. 
@@ -1153,34 +1283,50 @@ def to_datetime(arg, errors='raise', format=None, unit=None, infer_datetime_form format=format, unit=unit, infer_datetime_format=infer_datetime_format, - origin=origin) + origin=origin, + ) if isinstance(arg, DataFrame): return _to_datetime2( - arg_year=arg['year'], - arg_month=arg['month'], - arg_day=arg['day'], + arg_year=arg["year"], + arg_month=arg["month"], + arg_day=arg["day"], errors=errors, format=format, unit=unit, infer_datetime_format=infer_datetime_format, - origin=origin) + origin=origin, + ) if isinstance(arg, dict): return _to_datetime2( - arg_year=arg['year'], - arg_month=arg['month'], - arg_day=arg['day'], + arg_year=arg["year"], + arg_month=arg["month"], + arg_day=arg["day"], errors=errors, format=format, unit=unit, infer_datetime_format=infer_datetime_format, - origin=origin) + origin=origin, + ) return pd.to_datetime( - arg, errors=errors, format=format, unit=unit, infer_datetime_format=infer_datetime_format, - origin=origin) - - -def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, columns=None, sparse=False, - drop_first=False, dtype=None): + arg, + errors=errors, + format=format, + unit=unit, + infer_datetime_format=infer_datetime_format, + origin=origin, + ) + + +def get_dummies( + data, + prefix=None, + prefix_sep="_", + dummy_na=False, + columns=None, + sparse=False, + drop_first=False, + dtype=None, +): """ Convert categorical variable into dummy/indicator variables, also known as one hot encoding. @@ -1271,7 +1417,7 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, columns=None, raise TypeError("Input must be a list-like for parameter `columns`") if dtype is None: - dtype = 'byte' + dtype = "byte" if isinstance(data, Series): if prefix is not None: @@ -1282,34 +1428,49 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, columns=None, else: if isinstance(prefix, str): raise NotImplementedError( - "get_dummies currently does not support prefix as string types") + "get_dummies currently does not support prefix as string types" + ) kdf = data.copy() if columns is None: - column_labels = [label for label in kdf._internal.column_labels - if isinstance(kdf._internal.spark_type_for(label), - _get_dummies_default_accept_types)] + column_labels = [ + label + for label in kdf._internal.column_labels + if isinstance( + kdf._internal.spark_type_for(label), _get_dummies_default_accept_types + ) + ] else: if isinstance(columns, (str, tuple)): if isinstance(columns, str): key = (columns,) else: key = columns - column_labels = [label for label in kdf._internal.column_labels - if label[:len(key)] == key] + column_labels = [ + label for label in kdf._internal.column_labels if label[: len(key)] == key + ] if len(column_labels) == 0: raise KeyError(column_labels) if prefix is None: - prefix = [str(label[len(key):]) if len(label) > len(key) + 1 - else label[len(key)] if len(label) == len(key) + 1 else '' - for label in column_labels] - elif (any(isinstance(col, str) for col in columns) - and any(isinstance(col, tuple) for col in columns)): - raise ValueError('Expected tuple, got str') + prefix = [ + str(label[len(key) :]) + if len(label) > len(key) + 1 + else label[len(key)] + if len(label) == len(key) + 1 + else "" + for label in column_labels + ] + elif any(isinstance(col, str) for col in columns) and any( + isinstance(col, tuple) for col in columns + ): + raise ValueError("Expected tuple, got str") else: - column_labels = [label for key in columns - for label in kdf._internal.column_labels - if label == 
key or label[0] == key] + column_labels = [ + label + for key in columns + for label in kdf._internal.column_labels + if label == key or label[0] == key + ] if len(column_labels) == 0: if columns is None: return kdf @@ -1319,47 +1480,56 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, columns=None, prefix = [str(label) if len(label) > 1 else label[0] for label in column_labels] column_labels_set = set(column_labels) - remaining_columns = [kdf[label].rename(name_like_string(label)) - for label in kdf._internal.column_labels - if label not in column_labels_set] - - if any(not isinstance(kdf._internal.spark_type_for(label), _get_dummies_acceptable_types) - for label in column_labels): + remaining_columns = [ + kdf[label].rename(name_like_string(label)) + for label in kdf._internal.column_labels + if label not in column_labels_set + ] + + if any( + not isinstance(kdf._internal.spark_type_for(label), _get_dummies_acceptable_types) + for label in column_labels + ): raise NotImplementedError( - "get_dummies currently only accept {} values" - .format(', '.join([t.typeName() for t in _get_dummies_acceptable_types]))) + "get_dummies currently only accept {} values".format( + ", ".join([t.typeName() for t in _get_dummies_acceptable_types]) + ) + ) if prefix is not None and len(column_labels) != len(prefix): raise ValueError( - "Length of 'prefix' ({}) did not match the length of the columns being encoded ({})." - .format(len(prefix), len(column_labels))) + "Length of 'prefix' ({}) did not match the length of " + "the columns being encoded ({}).".format(len(prefix), len(column_labels)) + ) - all_values = _reduce_spark_multi(kdf._sdf, - [F.collect_set(kdf._internal.scol_for(label)) - for label in column_labels]) + all_values = _reduce_spark_multi( + kdf._sdf, [F.collect_set(kdf._internal.scol_for(label)) for label in column_labels] + ) for i, label in enumerate(column_labels): values = sorted(all_values[i]) if drop_first: values = values[1:] def column_name(value): - if prefix is None or prefix[i] == '': + if prefix is None or prefix[i] == "": return str(value) else: - return '{}{}{}'.format(prefix[i], prefix_sep, value) + return "{}{}{}".format(prefix[i], prefix_sep, value) for value in values: - remaining_columns.append((kdf[label].notnull() & (kdf[label] == value)) - .astype(dtype) - .rename(column_name(value))) + remaining_columns.append( + (kdf[label].notnull() & (kdf[label] == value)) + .astype(dtype) + .rename(column_name(value)) + ) if dummy_na: - remaining_columns.append(kdf[label].isnull().astype(dtype).rename(column_name('nan'))) + remaining_columns.append(kdf[label].isnull().astype(dtype).rename(column_name("nan"))) return kdf[remaining_columns] # TODO: there are many parameters to implement and support. See Pandas's pd.concat. -def concat(objs, axis=0, join='outer', ignore_index=False): +def concat(objs, axis=0, join="outer", ignore_index=False): """ Concatenate pandas objects along a particular axis with optional set logic along the other axes. 
@@ -1476,26 +1646,33 @@ def concat(objs, axis=0, join='outer', ignore_index=False): 0 c 3 1 d 4 """ - if isinstance(objs, (DataFrame, IndexOpsMixin)) or \ - not isinstance(objs, Iterable): # TODO: support dict - raise TypeError('first argument must be an iterable of koalas ' - 'objects, you passed an object of type ' - '"{name}"'.format(name=type(objs).__name__)) + if isinstance(objs, (DataFrame, IndexOpsMixin)) or not isinstance( + objs, Iterable + ): # TODO: support dict + raise TypeError( + "first argument must be an iterable of koalas " + "objects, you passed an object of type " + '"{name}"'.format(name=type(objs).__name__) + ) axis = validate_axis(axis) if axis != 0: raise NotImplementedError('axis should be either 0 or "index" currently.') if len(objs) == 0: - raise ValueError('No objects to concatenate') + raise ValueError("No objects to concatenate") objs = list(filter(lambda obj: obj is not None, objs)) if len(objs) == 0: - raise ValueError('All objects passed were None') + raise ValueError("All objects passed were None") for obj in objs: if not isinstance(obj, (Series, DataFrame)): - raise TypeError('cannot concatenate object of type '"'{name}"'; only ks.Series ' - 'and ks.DataFrame are valid'.format(name=type(objs).__name__)) + raise TypeError( + "cannot concatenate object of type " + "'{name}" + "; only ks.Series " + "and ks.DataFrame are valid".format(name=type(objs).__name__) + ) # Series, Series ... # We should return Series if objects are all Series. @@ -1506,13 +1683,13 @@ def concat(objs, axis=0, join='outer', ignore_index=False): new_objs = [] for obj in objs: if isinstance(obj, Series): - obj = obj.rename('0').to_dataframe() + obj = obj.rename("0").to_dataframe() new_objs.append(obj) objs = new_objs column_labels_levels = set(obj._internal.column_labels_level for obj in objs) if len(column_labels_levels) != 1: - raise ValueError('MultiIndex columns should have the same levels') + raise ValueError("MultiIndex columns should have the same levels") # DataFrame, DataFrame, ... # All Series are converted into DataFrame and then compute concat. @@ -1522,19 +1699,21 @@ def concat(objs, axis=0, join='outer', ignore_index=False): for index_of_kdf in indices_of_kdfs: if index_of_first_kdf.names != index_of_kdf.names: raise ValueError( - 'Index type and names should be same in the objects to concatenate. ' - 'You passed different indices ' - '{index_of_first_kdf} and {index_of_kdf}'.format( - index_of_first_kdf=index_of_first_kdf.names, - index_of_kdf=index_of_kdf.names)) + "Index type and names should be same in the objects to concatenate. " + "You passed different indices " + "{index_of_first_kdf} and {index_of_kdf}".format( + index_of_first_kdf=index_of_first_kdf.names, index_of_kdf=index_of_kdf.names + ) + ) column_labelses_of_kdfs = [kdf._internal.column_labels for kdf in objs] if ignore_index: index_names_of_kdfs = [[] for _ in objs] else: index_names_of_kdfs = [kdf._internal.index_names for kdf in objs] - if (all(name == index_names_of_kdfs[0] for name in index_names_of_kdfs) - and all(idx == column_labelses_of_kdfs[0] for idx in column_labelses_of_kdfs)): + if all(name == index_names_of_kdfs[0] for name in index_names_of_kdfs) and all( + idx == column_labelses_of_kdfs[0] for idx in column_labelses_of_kdfs + ): # If all columns are in the same order and values, use it. 
kdfs = objs merged_columns = column_labelses_of_kdfs[0] @@ -1542,15 +1721,21 @@ def concat(objs, axis=0, join='outer', ignore_index=False): if join == "inner": interested_columns = set.intersection(*map(set, column_labelses_of_kdfs)) # Keep the column order with its firsts DataFrame. - merged_columns = sorted(list(map( - lambda c: column_labelses_of_kdfs[0][column_labelses_of_kdfs[0].index(c)], - interested_columns))) + merged_columns = sorted( + list( + map( + lambda c: column_labelses_of_kdfs[0][column_labelses_of_kdfs[0].index(c)], + interested_columns, + ) + ) + ) kdfs = [kdf[merged_columns] for kdf in objs] elif join == "outer": # If there are columns unmatched, just sort the column names. - merged_columns = \ - sorted(list(set(itertools.chain.from_iterable(column_labelses_of_kdfs)))) + merged_columns = sorted( + list(set(itertools.chain.from_iterable(column_labelses_of_kdfs))) + ) kdfs = [] for kdf in objs: @@ -1561,29 +1746,37 @@ def concat(objs, axis=0, join='outer', ignore_index=False): for label in columns_to_add: sdf = sdf.withColumn(name_like_string(label), F.lit(None)) - data_columns = (kdf._internal.data_columns - + [name_like_string(label) for label in columns_to_add]) - kdf = DataFrame(kdf._internal.copy( - sdf=sdf, - column_labels=(kdf._internal.column_labels + columns_to_add), - column_scols=[scol_for(sdf, col) for col in data_columns])) + data_columns = kdf._internal.data_columns + [ + name_like_string(label) for label in columns_to_add + ] + kdf = DataFrame( + kdf._internal.copy( + sdf=sdf, + column_labels=(kdf._internal.column_labels + columns_to_add), + column_scols=[scol_for(sdf, col) for col in data_columns], + ) + ) kdfs.append(kdf[merged_columns]) else: - raise ValueError( - "Only can inner (intersect) or outer (union) join the other axis.") + raise ValueError("Only can inner (intersect) or outer (union) join the other axis.") if ignore_index: sdfs = [kdf._sdf.select(kdf._internal.column_scols) for kdf in kdfs] else: - sdfs = [kdf._sdf.select(kdf._internal.index_scols + kdf._internal.column_scols) - for kdf in kdfs] + sdfs = [ + kdf._sdf.select(kdf._internal.index_scols + kdf._internal.column_scols) for kdf in kdfs + ] concatenated = reduce(lambda x, y: x.union(y), sdfs) index_map = None if ignore_index else kdfs[0]._internal.index_map - result_kdf = DataFrame(kdfs[0]._internal.copy( - sdf=concatenated, index_map=index_map, - column_scols=[scol_for(concatenated, col) for col in kdfs[0]._internal.data_columns])) + result_kdf = DataFrame( + kdfs[0]._internal.copy( + sdf=concatenated, + index_map=index_map, + column_scols=[scol_for(concatenated, col) for col in kdfs[0]._internal.data_columns], + ) + ) if should_return_series: # If all input were Series, we should return Series. 
@@ -1592,8 +1785,7 @@ def concat(objs, axis=0, join='outer', ignore_index=False): return result_kdf -def melt(frame, id_vars=None, value_vars=None, var_name=None, - value_name='value'): +def melt(frame, id_vars=None, value_vars=None, var_name=None, value_name="value"): return DataFrame.melt(frame, id_vars, value_vars, var_name, value_name) @@ -1752,12 +1944,17 @@ def notna(obj): notnull = notna -def merge(obj, right: 'DataFrame', how: str = 'inner', - on: Union[str, List[str], Tuple[str, ...], List[Tuple[str, ...]]] = None, - left_on: Union[str, List[str], Tuple[str, ...], List[Tuple[str, ...]]] = None, - right_on: Union[str, List[str], Tuple[str, ...], List[Tuple[str, ...]]] = None, - left_index: bool = False, right_index: bool = False, - suffixes: Tuple[str, str] = ('_x', '_y')) -> 'DataFrame': +def merge( + obj, + right: "DataFrame", + how: str = "inner", + on: Union[str, List[str], Tuple[str, ...], List[Tuple[str, ...]]] = None, + left_on: Union[str, List[str], Tuple[str, ...], List[Tuple[str, ...]]] = None, + right_on: Union[str, List[str], Tuple[str, ...], List[Tuple[str, ...]]] = None, + left_index: bool = False, + right_index: bool = False, + suffixes: Tuple[str, str] = ("_x", "_y"), +) -> "DataFrame": """ Merge DataFrame objects with a database-style join. @@ -1869,8 +2066,15 @@ def merge(obj, right: 'DataFrame', how: str = 'inner', instead of NaN. """ return obj.merge( - right, how=how, on=on, left_on=left_on, right_on=right_on, - left_index=left_index, right_index=right_index, suffixes=suffixes) + right, + how=how, + on=on, + left_on=left_on, + right_on=right_on, + left_index=left_index, + right_index=right_index, + suffixes=suffixes, + ) def to_numeric(arg): @@ -1940,28 +2144,31 @@ def to_numeric(arg): 1.0 """ if isinstance(arg, Series): - return arg._with_new_scol(arg._internal.scol.cast('float')) + return arg._with_new_scol(arg._internal.scol.cast("float")) else: return pd.to_numeric(arg) # @pandas_wraps(return_col=np.datetime64) @pandas_wraps -def _to_datetime1(arg, errors, format, unit, infer_datetime_format, - origin) -> Series[np.datetime64]: +def _to_datetime1( + arg, errors, format, unit, infer_datetime_format, origin +) -> Series[np.datetime64]: return pd.to_datetime( arg, errors=errors, format=format, unit=unit, infer_datetime_format=infer_datetime_format, - origin=origin) + origin=origin, + ) # @pandas_wraps(return_col=np.datetime64) @pandas_wraps -def _to_datetime2(arg_year, arg_month, arg_day, - errors, format, unit, infer_datetime_format, origin) -> Series[np.datetime64]: +def _to_datetime2( + arg_year, arg_month, arg_day, errors, format, unit, infer_datetime_format, origin +) -> Series[np.datetime64]: arg = dict(year=arg_year, month=arg_month, day=arg_day) for key in arg: if arg[key] is None: @@ -1972,11 +2179,11 @@ def _to_datetime2(arg_year, arg_month, arg_day, format=format, unit=unit, infer_datetime_format=infer_datetime_format, - origin=origin) + origin=origin, + ) -def _get_index_map(sdf: spark.DataFrame, - index_col: Optional[Union[str, List[str]]] = None): +def _get_index_map(sdf: spark.DataFrame, index_col: Optional[Union[str, List[str]]] = None): if index_col is not None: if isinstance(index_col, str): index_col = [index_col] @@ -1991,9 +2198,14 @@ def _get_index_map(sdf: spark.DataFrame, return index_map -_get_dummies_default_accept_types = ( - DecimalType, StringType, DateType -) +_get_dummies_default_accept_types = (DecimalType, StringType, DateType) _get_dummies_acceptable_types = _get_dummies_default_accept_types + ( - ByteType, ShortType, 
IntegerType, LongType, FloatType, DoubleType, BooleanType, TimestampType + ByteType, + ShortType, + IntegerType, + LongType, + FloatType, + DoubleType, + BooleanType, + TimestampType, ) diff --git a/databricks/koalas/numpy_compat.py b/databricks/koalas/numpy_compat.py index 8591b3c..3e9bb12 100644 --- a/databricks/koalas/numpy_compat.py +++ b/databricks/koalas/numpy_compat.py @@ -21,92 +21,97 @@ from pyspark.sql.types import DoubleType, LongType, BooleanType -unary_np_spark_mappings = OrderedDict({ - 'abs': F.abs, - 'absolute': F.abs, - 'arccos': F.acos, - 'arccosh': F.pandas_udf(lambda s: np.arccosh(s), DoubleType()), - 'arcsin': F.asin, - 'arcsinh': F.pandas_udf(lambda s: np.arcsinh(s), DoubleType()), - 'arctan': F.atan, - 'arctanh': F.pandas_udf(lambda s: np.arctanh(s), DoubleType()), - 'bitwise_not': F.bitwiseNOT, - 'cbrt': F.cbrt, - 'ceil': F.ceil, - 'conj': lambda _: NotImplemented, # It requires complex type which Koalas does not support yet - 'conjugate': lambda _: NotImplemented, # It requires complex type - 'cos': F.cos, - 'cosh': F.pandas_udf(lambda s: np.cosh(s), DoubleType()), - 'deg2rad': F.pandas_udf(lambda s: np.deg2rad(s), DoubleType()), - 'degrees': F.degrees, - 'exp': F.exp, - 'exp2': F.pandas_udf(lambda s: np.exp2(s), DoubleType()), - 'expm1': F.expm1, - 'fabs': F.pandas_udf(lambda s: np.fabs(s), DoubleType()), - 'floor': F.floor, - 'frexp': lambda _: NotImplemented, # 'frexp' output lengths become different - # and it cannot be supported via pandas UDF. - 'invert': F.pandas_udf(lambda s: np.invert(s), DoubleType()), - 'isfinite': lambda c: c != float("inf"), - 'isinf': lambda c: c == float("inf"), - 'isnan': F.isnan, - 'isnat': lambda c: NotImplemented, # Koalas and PySpark does not have Nat concept. - 'log': F.log, - 'log10': F.log10, - 'log1p': F.log1p, - 'log2': F.pandas_udf(lambda s: np.log2(s), DoubleType()), - 'logical_not': lambda c: ~(c.cast(BooleanType())), - 'matmul': lambda _: NotImplemented, # Can return a NumPy array in pandas. 
- 'negative': lambda c: c * -1, - 'positive': lambda c: c, - 'rad2deg': F.pandas_udf(lambda s: np.rad2deg(s), DoubleType()), - 'radians': F.radians, - 'reciprocal': F.pandas_udf(lambda s: np.reciprocal(s), DoubleType()), - 'rint': F.pandas_udf(lambda s: np.rint(s), DoubleType()), - 'sign': lambda c: F.when(c == 0, 0).when(c < 0, -1).otherwise(1), - 'signbit': lambda c: F.when(c < 0, True).otherwise(False), - 'sin': F.sin, - 'sinh': F.pandas_udf(lambda s: np.sinh(s), DoubleType()), - 'spacing': F.pandas_udf(lambda s: np.spacing(s), DoubleType()), - 'sqrt': F.sqrt, - 'square': F.pandas_udf(lambda s: np.square(s), DoubleType()), - 'tan': F.tan, - 'tanh': F.pandas_udf(lambda s: np.tanh(s), DoubleType()), - 'trunc': F.pandas_udf(lambda s: np.trunc(s), DoubleType()), -}) - -binary_np_spark_mappings = OrderedDict({ - 'arctan2': F.atan2, - 'bitwise_and': lambda c1, c2: c1.bitwiseAND(c2), - 'bitwise_or': lambda c1, c2: c1.bitwiseOR(c2), - 'bitwise_xor': lambda c1, c2: c1.bitwiseXOR(c2), - 'copysign': F.pandas_udf(lambda s1, s2: np.copysign(s1, s2), DoubleType()), - 'float_power': F.pandas_udf(lambda s1, s2: np.float_power(s1, s2), DoubleType()), - 'floor_divide': F.pandas_udf(lambda s1, s2: np.floor_divide(s1, s2), DoubleType()), - 'fmax': F.pandas_udf(lambda s1, s2: np.fmax(s1, s2), DoubleType()), - 'fmin': F.pandas_udf(lambda s1, s2: np.fmin(s1, s2), DoubleType()), - 'fmod': F.pandas_udf(lambda s1, s2: np.fmod(s1, s2), DoubleType()), - 'gcd': F.pandas_udf(lambda s1, s2: np.gcd(s1, s2), DoubleType()), - 'heaviside': F.pandas_udf(lambda s1, s2: np.heaviside(s1, s2), DoubleType()), - 'hypot': F.hypot, - 'lcm': F.pandas_udf(lambda s1, s2: np.lcm(s1, s2), DoubleType()), - 'ldexp': F.pandas_udf(lambda s1, s2: np.ldexp(s1, s2), DoubleType()), - 'left_shift': F.pandas_udf(lambda s1, s2: np.left_shift(s1, s2), LongType()), - 'logaddexp': F.pandas_udf(lambda s1, s2: np.logaddexp(s1, s2), DoubleType()), - 'logaddexp2': F.pandas_udf(lambda s1, s2: np.logaddexp2(s1, s2), DoubleType()), - 'logical_and': lambda c1, c2: c1.cast(BooleanType()) & c2.cast(BooleanType()), - 'logical_or': lambda c1, c2: c1.cast(BooleanType()) | c2.cast(BooleanType()), - 'logical_xor': lambda c1, c2: ( - # mimics xor by logical operators. 
- (c1.cast(BooleanType()) | c2.cast(BooleanType())) - & (~(c1.cast(BooleanType())) | ~(c2.cast(BooleanType()))) - ), - 'maximum': F.greatest, - 'minimum': F.least, - 'modf': F.pandas_udf(lambda s1, s2: np.modf(s1, s2), DoubleType()), - 'nextafter': F.pandas_udf(lambda s1, s2: np.nextafter(s1, s2), DoubleType()), - 'right_shift': F.pandas_udf(lambda s1, s2: np.right_shift(s1, s2), LongType()), -}) +unary_np_spark_mappings = OrderedDict( + { + "abs": F.abs, + "absolute": F.abs, + "arccos": F.acos, + "arccosh": F.pandas_udf(lambda s: np.arccosh(s), DoubleType()), + "arcsin": F.asin, + "arcsinh": F.pandas_udf(lambda s: np.arcsinh(s), DoubleType()), + "arctan": F.atan, + "arctanh": F.pandas_udf(lambda s: np.arctanh(s), DoubleType()), + "bitwise_not": F.bitwiseNOT, + "cbrt": F.cbrt, + "ceil": F.ceil, + # It requires complex type which Koalas does not support yet + "conj": lambda _: NotImplemented, + "conjugate": lambda _: NotImplemented, # It requires complex type + "cos": F.cos, + "cosh": F.pandas_udf(lambda s: np.cosh(s), DoubleType()), + "deg2rad": F.pandas_udf(lambda s: np.deg2rad(s), DoubleType()), + "degrees": F.degrees, + "exp": F.exp, + "exp2": F.pandas_udf(lambda s: np.exp2(s), DoubleType()), + "expm1": F.expm1, + "fabs": F.pandas_udf(lambda s: np.fabs(s), DoubleType()), + "floor": F.floor, + "frexp": lambda _: NotImplemented, # 'frexp' output lengths become different + # and it cannot be supported via pandas UDF. + "invert": F.pandas_udf(lambda s: np.invert(s), DoubleType()), + "isfinite": lambda c: c != float("inf"), + "isinf": lambda c: c == float("inf"), + "isnan": F.isnan, + "isnat": lambda c: NotImplemented, # Koalas and PySpark does not have Nat concept. + "log": F.log, + "log10": F.log10, + "log1p": F.log1p, + "log2": F.pandas_udf(lambda s: np.log2(s), DoubleType()), + "logical_not": lambda c: ~(c.cast(BooleanType())), + "matmul": lambda _: NotImplemented, # Can return a NumPy array in pandas. 
+ "negative": lambda c: c * -1, + "positive": lambda c: c, + "rad2deg": F.pandas_udf(lambda s: np.rad2deg(s), DoubleType()), + "radians": F.radians, + "reciprocal": F.pandas_udf(lambda s: np.reciprocal(s), DoubleType()), + "rint": F.pandas_udf(lambda s: np.rint(s), DoubleType()), + "sign": lambda c: F.when(c == 0, 0).when(c < 0, -1).otherwise(1), + "signbit": lambda c: F.when(c < 0, True).otherwise(False), + "sin": F.sin, + "sinh": F.pandas_udf(lambda s: np.sinh(s), DoubleType()), + "spacing": F.pandas_udf(lambda s: np.spacing(s), DoubleType()), + "sqrt": F.sqrt, + "square": F.pandas_udf(lambda s: np.square(s), DoubleType()), + "tan": F.tan, + "tanh": F.pandas_udf(lambda s: np.tanh(s), DoubleType()), + "trunc": F.pandas_udf(lambda s: np.trunc(s), DoubleType()), + } +) + +binary_np_spark_mappings = OrderedDict( + { + "arctan2": F.atan2, + "bitwise_and": lambda c1, c2: c1.bitwiseAND(c2), + "bitwise_or": lambda c1, c2: c1.bitwiseOR(c2), + "bitwise_xor": lambda c1, c2: c1.bitwiseXOR(c2), + "copysign": F.pandas_udf(lambda s1, s2: np.copysign(s1, s2), DoubleType()), + "float_power": F.pandas_udf(lambda s1, s2: np.float_power(s1, s2), DoubleType()), + "floor_divide": F.pandas_udf(lambda s1, s2: np.floor_divide(s1, s2), DoubleType()), + "fmax": F.pandas_udf(lambda s1, s2: np.fmax(s1, s2), DoubleType()), + "fmin": F.pandas_udf(lambda s1, s2: np.fmin(s1, s2), DoubleType()), + "fmod": F.pandas_udf(lambda s1, s2: np.fmod(s1, s2), DoubleType()), + "gcd": F.pandas_udf(lambda s1, s2: np.gcd(s1, s2), DoubleType()), + "heaviside": F.pandas_udf(lambda s1, s2: np.heaviside(s1, s2), DoubleType()), + "hypot": F.hypot, + "lcm": F.pandas_udf(lambda s1, s2: np.lcm(s1, s2), DoubleType()), + "ldexp": F.pandas_udf(lambda s1, s2: np.ldexp(s1, s2), DoubleType()), + "left_shift": F.pandas_udf(lambda s1, s2: np.left_shift(s1, s2), LongType()), + "logaddexp": F.pandas_udf(lambda s1, s2: np.logaddexp(s1, s2), DoubleType()), + "logaddexp2": F.pandas_udf(lambda s1, s2: np.logaddexp2(s1, s2), DoubleType()), + "logical_and": lambda c1, c2: c1.cast(BooleanType()) & c2.cast(BooleanType()), + "logical_or": lambda c1, c2: c1.cast(BooleanType()) | c2.cast(BooleanType()), + "logical_xor": lambda c1, c2: ( + # mimics xor by logical operators. + (c1.cast(BooleanType()) | c2.cast(BooleanType())) + & (~(c1.cast(BooleanType())) | ~(c2.cast(BooleanType()))) + ), + "maximum": F.greatest, + "minimum": F.least, + "modf": F.pandas_udf(lambda s1, s2: np.modf(s1, s2), DoubleType()), + "nextafter": F.pandas_udf(lambda s1, s2: np.nextafter(s1, s2), DoubleType()), + "right_shift": F.pandas_udf(lambda s1, s2: np.right_shift(s1, s2), LongType()), + } +) # Copied from pandas. 
@@ -184,17 +189,20 @@ def maybe_dispatch_ufunc_to_spark_func( op_name = ufunc.__name__ - if (method == "__call__" - and (op_name in unary_np_spark_mappings or op_name in binary_np_spark_mappings) - and kwargs.get("out") is None): + if ( + method == "__call__" + and (op_name in unary_np_spark_mappings or op_name in binary_np_spark_mappings) + and kwargs.get("out") is None + ): - np_spark_map_func = ( - unary_np_spark_mappings.get(op_name) - or binary_np_spark_mappings.get(op_name)) + np_spark_map_func = unary_np_spark_mappings.get(op_name) or binary_np_spark_mappings.get( + op_name + ) def convert_arguments(*args): args = [ # type: ignore - F.lit(inp) if not isinstance(inp, Column) else inp for inp in args] # type: ignore + F.lit(inp) if not isinstance(inp, Column) else inp for inp in args + ] # type: ignore return np_spark_map_func(*args) return _column_op(convert_arguments)(*inputs) # type: ignore diff --git a/databricks/koalas/plot.py b/databricks/koalas/plot.py index c4c3471..1a1bda7 100644 --- a/databricks/koalas/plot.py +++ b/databricks/koalas/plot.py @@ -31,20 +31,42 @@ from databricks.koalas.config import get_option -if LooseVersion(pd.__version__) < LooseVersion('0.25'): - from pandas.plotting._core import _all_kinds, BarPlot, BoxPlot, HistPlot, MPLPlot, PiePlot, \ - AreaPlot, LinePlot, BarhPlot, ScatterPlot, KdePlot +if LooseVersion(pd.__version__) < LooseVersion("0.25"): + from pandas.plotting._core import ( + _all_kinds, + BarPlot, + BoxPlot, + HistPlot, + MPLPlot, + PiePlot, + AreaPlot, + LinePlot, + BarhPlot, + ScatterPlot, + KdePlot, + ) else: from pandas.plotting._core import PlotAccessor - from pandas.plotting._matplotlib import BarPlot, BoxPlot, HistPlot, PiePlot, AreaPlot, \ - LinePlot, BarhPlot, ScatterPlot, KdePlot + from pandas.plotting._matplotlib import ( + BarPlot, + BoxPlot, + HistPlot, + PiePlot, + AreaPlot, + LinePlot, + BarhPlot, + ScatterPlot, + KdePlot, + ) from pandas.plotting._matplotlib.core import MPLPlot + _all_kinds = PlotAccessor._all_kinds class TopNPlot: def get_top_n(self, data): from databricks.koalas import DataFrame, Series + max_rows = get_option("plotting.max_rows") # Simply use the first 1k elements and make it into a pandas dataframe # For categorical variables, it is likely called from df.x.value_counts().plot.xxx(). 
@@ -64,18 +86,25 @@ def set_result_text(self, ax): assert hasattr(self, "partial") if self.partial: - ax.text(1, 1, 'showing top {} elements only'.format(max_rows), - size=6, ha='right', va='bottom', - transform=ax.transAxes) + ax.text( + 1, + 1, + "showing top {} elements only".format(max_rows), + size=6, + ha="right", + va="bottom", + transform=ax.transAxes, + ) class SampledPlot: def get_sampled(self, data): from databricks.koalas import DataFrame, Series + fraction = get_option("plotting.sample_ratio") if fraction is None: fraction = 1 / (len(data) / get_option("plotting.max_rows")) - fraction = min(1., fraction) + fraction = min(1.0, fraction) self.fraction = fraction if isinstance(data, (DataFrame, Series)): @@ -91,9 +120,14 @@ def set_result_text(self, ax): if self.fraction < 1: ax.text( - 1, 1, 'showing the sampled result by fraction %s' % self.fraction, - size=6, ha='right', va='bottom', - transform=ax.transAxes) + 1, + 1, + "showing the sampled result by fraction %s" % self.fraction, + size=6, + ha="right", + va="bottom", + transform=ax.transAxes, + ) class KoalasBarPlot(BarPlot, TopNPlot): @@ -106,43 +140,71 @@ def _plot(self, ax, x, y, w, start=0, log=False, **kwds): class KoalasBoxPlot(BoxPlot): - def boxplot(self, ax, bxpstats, notch=None, sym=None, vert=None, - whis=None, positions=None, widths=None, patch_artist=None, - bootstrap=None, usermedians=None, conf_intervals=None, - meanline=None, showmeans=None, showcaps=None, - showbox=None, showfliers=None, boxprops=None, - labels=None, flierprops=None, medianprops=None, - meanprops=None, capprops=None, whiskerprops=None, - manage_xticks=True, autorange=False, zorder=None, - precision=None): - + def boxplot( + self, + ax, + bxpstats, + notch=None, + sym=None, + vert=None, + whis=None, + positions=None, + widths=None, + patch_artist=None, + bootstrap=None, + usermedians=None, + conf_intervals=None, + meanline=None, + showmeans=None, + showcaps=None, + showbox=None, + showfliers=None, + boxprops=None, + labels=None, + flierprops=None, + medianprops=None, + meanprops=None, + capprops=None, + whiskerprops=None, + manage_xticks=True, + autorange=False, + zorder=None, + precision=None, + ): def _update_dict(dictionary, rc_name, properties): """ Loads properties in the dictionary from rc file if not already in the dictionary""" - rc_str = 'boxplot.{0}.{1}' + rc_str = "boxplot.{0}.{1}" if dictionary is None: dictionary = dict() for prop_dict in properties: - dictionary.setdefault(prop_dict, - matplotlib.rcParams[rc_str.format(rc_name, prop_dict)]) + dictionary.setdefault( + prop_dict, matplotlib.rcParams[rc_str.format(rc_name, prop_dict)] + ) return dictionary # Common property dictionaries loading from rc - flier_props = ['color', 'marker', 'markerfacecolor', 'markeredgecolor', - 'markersize', 'linestyle', 'linewidth'] - default_props = ['color', 'linewidth', 'linestyle'] - - boxprops = _update_dict(boxprops, 'boxprops', default_props) - whiskerprops = _update_dict(whiskerprops, 'whiskerprops', - default_props) - capprops = _update_dict(capprops, 'capprops', default_props) - medianprops = _update_dict(medianprops, 'medianprops', default_props) - meanprops = _update_dict(meanprops, 'meanprops', default_props) - flierprops = _update_dict(flierprops, 'flierprops', flier_props) + flier_props = [ + "color", + "marker", + "markerfacecolor", + "markeredgecolor", + "markersize", + "linestyle", + "linewidth", + ] + default_props = ["color", "linewidth", "linestyle"] + + boxprops = _update_dict(boxprops, "boxprops", default_props) + 
whiskerprops = _update_dict(whiskerprops, "whiskerprops", default_props) + capprops = _update_dict(capprops, "capprops", default_props) + medianprops = _update_dict(medianprops, "medianprops", default_props) + meanprops = _update_dict(meanprops, "meanprops", default_props) + flierprops = _update_dict(flierprops, "flierprops", flier_props) if patch_artist: - boxprops['linestyle'] = 'solid' - boxprops['edgecolor'] = boxprops.pop('color') + boxprops["linestyle"] = "solid" + boxprops["edgecolor"] = boxprops.pop("color") # if non-default sym value, put it into the flier dictionary # the logic for providing the default symbol ('b+') now lives @@ -153,9 +215,9 @@ def _update_dict(dictionary, rc_name, properties): # no-flier case, which should really be done with # 'showfliers=False' but none-the-less deal with it to keep back # compatibility - if sym == '': + if sym == "": # blow away existing dict and make one for invisible markers - flierprops = dict(linestyle='none', marker='', color='none') + flierprops = dict(linestyle="none", marker="", color="none") # turn the fliers off just to be safe showfliers = False # now process the symbol string @@ -165,60 +227,72 @@ def _update_dict(dictionary, rc_name, properties): _, marker, color = _process_plot_format(sym) # if we have a marker, use it if marker is not None: - flierprops['marker'] = marker + flierprops["marker"] = marker # if we have a color, use it if color is not None: # assume that if color is passed in the user want # filled symbol, if the users want more control use # flierprops - flierprops['color'] = color - flierprops['markerfacecolor'] = color - flierprops['markeredgecolor'] = color + flierprops["color"] = color + flierprops["markerfacecolor"] = color + flierprops["markeredgecolor"] = color # replace medians if necessary: if usermedians is not None: - if (len(np.ravel(usermedians)) != len(bxpstats) or - np.shape(usermedians)[0] != len(bxpstats)): - raise ValueError('usermedians length not compatible with x') + if len(np.ravel(usermedians)) != len(bxpstats) or np.shape(usermedians)[0] != len( + bxpstats + ): + raise ValueError("usermedians length not compatible with x") else: # reassign medians as necessary for stats, med in zip(bxpstats, usermedians): if med is not None: - stats['med'] = med + stats["med"] = med if conf_intervals is not None: if np.shape(conf_intervals)[0] != len(bxpstats): - err_mess = 'conf_intervals length not compatible with x' + err_mess = "conf_intervals length not compatible with x" raise ValueError(err_mess) else: for stats, ci in zip(bxpstats, conf_intervals): if ci is not None: if len(ci) != 2: - raise ValueError('each confidence interval must ' - 'have two values') + raise ValueError("each confidence interval must " "have two values") else: if ci[0] is not None: - stats['cilo'] = ci[0] + stats["cilo"] = ci[0] if ci[1] is not None: - stats['cihi'] = ci[1] - - artists = ax.bxp(bxpstats, positions=positions, widths=widths, - vert=vert, patch_artist=patch_artist, - shownotches=notch, showmeans=showmeans, - showcaps=showcaps, showbox=showbox, - boxprops=boxprops, flierprops=flierprops, - medianprops=medianprops, meanprops=meanprops, - meanline=meanline, showfliers=showfliers, - capprops=capprops, whiskerprops=whiskerprops, - manage_xticks=manage_xticks, zorder=zorder) + stats["cihi"] = ci[1] + + artists = ax.bxp( + bxpstats, + positions=positions, + widths=widths, + vert=vert, + patch_artist=patch_artist, + shownotches=notch, + showmeans=showmeans, + showcaps=showcaps, + showbox=showbox, + boxprops=boxprops, + 
flierprops=flierprops, + medianprops=medianprops, + meanprops=meanprops, + meanline=meanline, + showfliers=showfliers, + capprops=capprops, + whiskerprops=whiskerprops, + manage_xticks=manage_xticks, + zorder=zorder, + ) return artists - def _plot(self, ax, bxpstats, column_num=None, return_type='axes', **kwds): + def _plot(self, ax, bxpstats, column_num=None, return_type="axes", **kwds): bp = self.boxplot(ax, bxpstats, **kwds) - if return_type == 'dict': + if return_type == "dict": return bp, bp - elif return_type == 'both': + elif return_type == "both": return self.BP(ax=ax, lines=bp), bp else: return ax, bp @@ -231,12 +305,12 @@ def _compute_plot_data(self): self.kwds.update(KoalasBoxPlot.rc_defaults(**self.kwds)) # Gets some important kwds - showfliers = self.kwds.get('showfliers', False) - whis = self.kwds.get('whis', 1.5) - labels = self.kwds.get('labels', [colname]) + showfliers = self.kwds.get("showfliers", False) + whis = self.kwds.get("whis", 1.5) + labels = self.kwds.get("labels", [colname]) # This one is Koalas specific to control precision for approx_percentile - precision = self.kwds.get('precision', 0.01) + precision = self.kwds.get("precision", 0.01) # # Computes mean, median, Q1 and Q3 with approx_percentile and precision col_stats, col_fences = KoalasBoxPlot._compute_stats(data, colname, whis, precision) @@ -254,14 +328,16 @@ def _compute_plot_data(self): # Builds bxpstats dict stats = [] - item = {'mean': col_stats['mean'], - 'med': col_stats['med'], - 'q1': col_stats['q1'], - 'q3': col_stats['q3'], - 'whislo': whiskers[0], - 'whishi': whiskers[1], - 'fliers': fliers, - 'label': labels[0]} + item = { + "mean": col_stats["mean"], + "med": col_stats["med"], + "q1": col_stats["q1"], + "q3": col_stats["q3"], + "whislo": whiskers[0], + "whishi": whiskers[1], + "fliers": fliers, + "label": labels[0], + } stats.append(item) self.data = {labels[0]: stats} @@ -272,13 +348,19 @@ def _make_plot(self): kwds = self.kwds.copy() for stats in bxpstats: - if len(stats['fliers']) > 1000: - stats['fliers'] = stats['fliers'][:1000] - ax.text(1, 1, 'showing top 1,000 fliers only', size=6, ha='right', va='bottom', - transform=ax.transAxes) - - ret, bp = self._plot(ax, bxpstats, column_num=0, - return_type=self.return_type, **kwds) + if len(stats["fliers"]) > 1000: + stats["fliers"] = stats["fliers"][:1000] + ax.text( + 1, + 1, + "showing top 1,000 fliers only", + size=6, + ha="right", + va="bottom", + transform=ax.transAxes, + ) + + ret, bp = self._plot(ax, bxpstats, column_num=0, return_type=self.return_type, **kwds) self.maybe_color_bp(bp) self._return_obj = ret @@ -289,64 +371,87 @@ def _make_plot(self): self._set_ticklabels(ax, labels) @staticmethod - def rc_defaults(notch=None, vert=None, whis=None, - patch_artist=None, bootstrap=None, meanline=None, - showmeans=None, showcaps=None, showbox=None, - showfliers=None, **kwargs): + def rc_defaults( + notch=None, + vert=None, + whis=None, + patch_artist=None, + bootstrap=None, + meanline=None, + showmeans=None, + showcaps=None, + showbox=None, + showfliers=None, + **kwargs + ): # Missing arguments default to rcParams. 
if whis is None: - whis = matplotlib.rcParams['boxplot.whiskers'] + whis = matplotlib.rcParams["boxplot.whiskers"] if bootstrap is None: - bootstrap = matplotlib.rcParams['boxplot.bootstrap'] + bootstrap = matplotlib.rcParams["boxplot.bootstrap"] if notch is None: - notch = matplotlib.rcParams['boxplot.notch'] + notch = matplotlib.rcParams["boxplot.notch"] if vert is None: - vert = matplotlib.rcParams['boxplot.vertical'] + vert = matplotlib.rcParams["boxplot.vertical"] if patch_artist is None: - patch_artist = matplotlib.rcParams['boxplot.patchartist'] + patch_artist = matplotlib.rcParams["boxplot.patchartist"] if meanline is None: - meanline = matplotlib.rcParams['boxplot.meanline'] + meanline = matplotlib.rcParams["boxplot.meanline"] if showmeans is None: - showmeans = matplotlib.rcParams['boxplot.showmeans'] + showmeans = matplotlib.rcParams["boxplot.showmeans"] if showcaps is None: - showcaps = matplotlib.rcParams['boxplot.showcaps'] + showcaps = matplotlib.rcParams["boxplot.showcaps"] if showbox is None: - showbox = matplotlib.rcParams['boxplot.showbox'] + showbox = matplotlib.rcParams["boxplot.showbox"] if showfliers is None: - showfliers = matplotlib.rcParams['boxplot.showfliers'] - - return dict(whis=whis, bootstrap=bootstrap, notch=notch, vert=vert, - patch_artist=patch_artist, meanline=meanline, showmeans=showmeans, - showcaps=showcaps, showbox=showbox, showfliers=showfliers) + showfliers = matplotlib.rcParams["boxplot.showfliers"] + + return dict( + whis=whis, + bootstrap=bootstrap, + notch=notch, + vert=vert, + patch_artist=patch_artist, + meanline=meanline, + showmeans=showmeans, + showcaps=showcaps, + showbox=showbox, + showfliers=showfliers, + ) @staticmethod def _compute_stats(data, colname, whis, precision): # Computes mean, median, Q1 and Q3 with approx_percentile and precision - pdf = (data._kdf._sdf - .agg(*[F.expr('approx_percentile({}, {}, {})'.format(colname, q, - int(1. 
/ precision))) - .alias('{}_{}%'.format(colname, int(q * 100))) - for q in [.25, .50, .75]], - F.mean(colname).alias('{}_mean'.format(colname))).toPandas()) + pdf = data._kdf._sdf.agg( + *[ + F.expr( + "approx_percentile({}, {}, {})".format(colname, q, int(1.0 / precision)) + ).alias("{}_{}%".format(colname, int(q * 100))) + for q in [0.25, 0.50, 0.75] + ], + F.mean(colname).alias("{}_mean".format(colname)) + ).toPandas() # Computes IQR and Tukey's fences - iqr = '{}_iqr'.format(colname) - p75 = '{}_75%'.format(colname) - p25 = '{}_25%'.format(colname) + iqr = "{}_iqr".format(colname) + p75 = "{}_75%".format(colname) + p25 = "{}_25%".format(colname) pdf.loc[:, iqr] = pdf.loc[:, p75] - pdf.loc[:, p25] - pdf.loc[:, '{}_lfence'.format(colname)] = pdf.loc[:, p25] - whis * pdf.loc[:, iqr] - pdf.loc[:, '{}_ufence'.format(colname)] = pdf.loc[:, p75] + whis * pdf.loc[:, iqr] + pdf.loc[:, "{}_lfence".format(colname)] = pdf.loc[:, p25] - whis * pdf.loc[:, iqr] + pdf.loc[:, "{}_ufence".format(colname)] = pdf.loc[:, p75] + whis * pdf.loc[:, iqr] - qnames = ['25%', '50%', '75%', 'mean', 'lfence', 'ufence'] - col_summ = pdf[['{}_{}'.format(colname, q) for q in qnames]] + qnames = ["25%", "50%", "75%", "mean", "lfence", "ufence"] + col_summ = pdf[["{}_{}".format(colname, q) for q in qnames]] col_summ.columns = qnames - lfence, ufence = col_summ['lfence'], col_summ['ufence'] + lfence, ufence = col_summ["lfence"], col_summ["ufence"] - stats = {'mean': col_summ['mean'].values[0], - 'med': col_summ['50%'].values[0], - 'q1': col_summ['25%'].values[0], - 'q3': col_summ['75%'].values[0]} + stats = { + "mean": col_summ["mean"].values[0], + "med": col_summ["50%"].values[0], + "q1": col_summ["25%"].values[0], + "q3": col_summ["75%"].values[0], + } return stats, (lfence.values[0], ufence.values[0]) @@ -355,29 +460,31 @@ def _outliers(data, colname, lfence, ufence): # Builds expression to identify outliers expression = F.col(colname).between(lfence, ufence) # Creates a column to flag rows as outliers or not - return data._kdf._sdf.withColumn('__{}_outlier'.format(colname), ~expression) + return data._kdf._sdf.withColumn("__{}_outlier".format(colname), ~expression) @staticmethod def _calc_whiskers(colname, outliers): # Computes min and max values of non-outliers - the whiskers - minmax = (outliers - .filter('not __{}_outlier'.format(colname)) - .agg(F.min(colname).alias('min'), - F.max(colname).alias('max')) - .toPandas()) - return minmax.iloc[0][['min', 'max']].values + minmax = ( + outliers.filter("not __{}_outlier".format(colname)) + .agg(F.min(colname).alias("min"), F.max(colname).alias("max")) + .toPandas() + ) + return minmax.iloc[0][["min", "max"]].values @staticmethod def _get_fliers(colname, outliers): # Filters only the outliers, should "showfliers" be True - fliers_df = outliers.filter('__{}_outlier'.format(colname)) + fliers_df = outliers.filter("__{}_outlier".format(colname)) # If shows fliers, takes the top 1k with highest absolute values - fliers = (fliers_df - .select(F.abs(F.col('`{}`'.format(colname))).alias(colname)) - .orderBy(F.desc('`{}`'.format(colname))) - .limit(1001) - .toPandas()[colname].values) + fliers = ( + fliers_df.select(F.abs(F.col("`{}`".format(colname))).alias(colname)) + .orderBy(F.desc("`{}`".format(colname))) + .limit(1001) + .toPandas()[colname] + .values + ) return fliers @@ -395,13 +502,15 @@ def _compute_plot_data(self): if isinstance(data, Series): data = data.to_frame() - numeric_data = data.select_dtypes(include=['byte', 'decimal', 'integer', 'float', - 'long', 'double', 
np.datetime64]) + numeric_data = data.select_dtypes( + include=["byte", "decimal", "integer", "float", "long", "double", np.datetime64] + ) # no empty frames or series allowed if len(numeric_data.columns) == 0: - raise TypeError('Empty {0!r}: no numeric data to ' - 'plot'.format(numeric_data.__class__.__name__)) + raise TypeError( + "Empty {0!r}: no numeric data to " "plot".format(numeric_data.__class__.__name__) + ) if is_integer(self.bins): # computes boundaries for the column @@ -426,11 +535,11 @@ def _make_plot(self): kwds = self.kwds.copy() label = pprint_thing(label if len(label) > 1 else label[0]) - kwds['label'] = label + kwds["label"] = label style, kwds = self._apply_style_colors(colors, kwds, i, label) if style is not None: - kwds['style'] = style + kwds["style"] = style # 'y' is a Spark DataFrame that selects one column. # here, we manually calculates the weights separately via Spark @@ -438,19 +547,16 @@ def _make_plot(self): y = KoalasHistPlot._compute_hist(y, self.bins) # now y is a pandas Series. kwds = self._make_plot_keywords(kwds, y) - artists = self._plot(ax, y, column_num=i, - stacking_id=stacking_id, **kwds) + artists = self._plot(ax, y, column_num=i, stacking_id=stacking_id, **kwds) self._add_legend_handle(artists[0], label, index=i) @classmethod - def _plot(cls, ax, y, style=None, bins=None, bottom=0, column_num=0, - stacking_id=None, **kwds): + def _plot(cls, ax, y, style=None, bins=None, bottom=0, column_num=0, stacking_id=None, **kwds): if column_num == 0: cls._initialize_stacker(ax, stacking_id, len(bins) - 1) base = np.zeros(len(bins) - 1) - bottom = bottom + \ - cls._get_stacked_values(ax, stacking_id, base, kwds['label']) + bottom = bottom + cls._get_stacked_values(ax, stacking_id, base, kwds["label"]) # Since the counts were computed already, we use them as weights and just generate # one entry for each bin @@ -483,34 +589,32 @@ def _compute_hist(sdf, bins): colname = sdf.columns[-1] - bucket_name = '__{}_bucket'.format(colname) + bucket_name = "__{}_bucket".format(colname) # creates a Bucketizer to get corresponding bin of each value - bucketizer = Bucketizer(splits=bins, - inputCol=colname, - outputCol=bucket_name, - handleInvalid="skip") + bucketizer = Bucketizer( + splits=bins, inputCol=colname, outputCol=bucket_name, handleInvalid="skip" + ) # after bucketing values, groups and counts them - result = (bucketizer - .transform(sdf) - .select(bucket_name) - .groupby(bucket_name) - .agg(F.count('*').alias('count')) - .toPandas() - .sort_values(by=bucket_name)) + result = ( + bucketizer.transform(sdf) + .select(bucket_name) + .groupby(bucket_name) + .agg(F.count("*").alias("count")) + .toPandas() + .sort_values(by=bucket_name) + ) # generates a pandas DF with one row for each bin # we need this as some of the bins may be empty - indexes = pd.DataFrame({bucket_name: np.arange(0, len(bins) - 1), - 'bucket': bins[:-1]}) + indexes = pd.DataFrame({bucket_name: np.arange(0, len(bins) - 1), "bucket": bins[:-1]}) # merges the bins with counts on it and fills remaining ones with zeros - pdf = indexes.merge(result, how='left', on=[bucket_name]).fillna(0)[['count']] + pdf = indexes.merge(result, how="left", on=[bucket_name]).fillna(0)[["count"]] pdf.columns = [bucket_name] return pdf[bucket_name] class KoalasPiePlot(PiePlot, TopNPlot): - def __init__(self, data, **kwargs): super(KoalasPiePlot, self).__init__(self.get_top_n(data), **kwargs) @@ -538,7 +642,6 @@ def _make_plot(self): class KoalasBarhPlot(BarhPlot, TopNPlot): - def __init__(self, data, **kwargs): 
super(KoalasBarhPlot, self).__init__(self.get_top_n(data), **kwargs) @@ -548,7 +651,6 @@ def _make_plot(self): class KoalasScatterPlot(ScatterPlot, TopNPlot): - def __init__(self, data, x, y, **kwargs): super().__init__(self.get_top_n(data), x, y, **kwargs) @@ -565,13 +667,15 @@ def _compute_plot_data(self): if isinstance(data, Series): data = data.to_frame() - numeric_data = data.select_dtypes(include=['byte', 'decimal', 'integer', 'float', - 'long', 'double', np.datetime64]) + numeric_data = data.select_dtypes( + include=["byte", "decimal", "integer", "float", "long", "double", np.datetime64] + ) # no empty frames or series allowed if len(numeric_data.columns) == 0: - raise TypeError('Empty {0!r}: no numeric data to ' - 'plot'.format(numeric_data.__class__.__name__)) + raise TypeError( + "Empty {0!r}: no numeric data to " "plot".format(numeric_data.__class__.__name__) + ) self.data = numeric_data @@ -591,47 +695,36 @@ def _make_plot(self): kwds = self.kwds.copy() label = pprint_thing(label if len(label) > 1 else label[0]) - kwds['label'] = label + kwds["label"] = label style, kwds = self._apply_style_colors(colors, kwds, i, label) if style is not None: - kwds['style'] = style + kwds["style"] = style kwds = self._make_plot_keywords(kwds, y) - artists = self._plot(ax, y, column_num=i, - stacking_id=stacking_id, **kwds) + artists = self._plot(ax, y, column_num=i, stacking_id=stacking_id, **kwds) self._add_legend_handle(artists[0], label, index=i) def _get_ind(self, y): # 'y' is a Spark DataFrame that selects one column. if self.ind is None: - min_val, max_val = y.select( - F.min(y.columns[-1]), F.max(y.columns[-1])).first() + min_val, max_val = y.select(F.min(y.columns[-1]), F.max(y.columns[-1])).first() sample_range = max_val - min_val - ind = np.linspace( - min_val - 0.5 * sample_range, - max_val + 0.5 * sample_range, - 1000, - ) + ind = np.linspace(min_val - 0.5 * sample_range, max_val + 0.5 * sample_range, 1000,) elif is_integer(self.ind): - min_val, max_val = y.select( - F.min(y.columns[-1]), F.max(y.columns[-1])).first() + min_val, max_val = y.select(F.min(y.columns[-1]), F.max(y.columns[-1])).first() sample_range = np.nanmax(y) - np.nanmin(y) - ind = np.linspace( - min_val - 0.5 * sample_range, - max_val + 0.5 * sample_range, - self.ind, - ) + ind = np.linspace(min_val - 0.5 * sample_range, max_val + 0.5 * sample_range, self.ind,) else: ind = self.ind return ind @classmethod def _plot( - cls, ax, y, style=None, bw_method=None, ind=None, - column_num=None, stacking_id=None, **kwds): + cls, ax, y, style=None, bw_method=None, ind=None, column_num=None, stacking_id=None, **kwds + ): # 'y' is a Spark DataFrame that selects one column. 
# Using RDD is slow so we might have to change it to Dataset based implementation @@ -661,17 +754,36 @@ def _plot( KoalasScatterPlot, KoalasKdePlot, ] -_plot_klass = {getattr(klass, '_kind'): klass for klass in _klasses} - - -def plot_series(data, kind='line', ax=None, # Series unique - figsize=None, use_index=True, title=None, grid=None, - legend=False, style=None, logx=False, logy=False, loglog=False, - xticks=None, yticks=None, xlim=None, ylim=None, - rot=None, fontsize=None, colormap=None, table=False, - yerr=None, xerr=None, - label=None, secondary_y=False, # Series unique - **kwds): +_plot_klass = {getattr(klass, "_kind"): klass for klass in _klasses} + + +def plot_series( + data, + kind="line", + ax=None, # Series unique + figsize=None, + use_index=True, + title=None, + grid=None, + legend=False, + style=None, + logx=False, + logy=False, + loglog=False, + xticks=None, + yticks=None, + xlim=None, + ylim=None, + rot=None, + fontsize=None, + colormap=None, + table=False, + yerr=None, + xerr=None, + label=None, + secondary_y=False, # Series unique + **kwds +): """ Make plots of Series using matplotlib / pylab. @@ -768,30 +880,74 @@ def plot_series(data, kind='line', ax=None, # Series unique # so it calls modified _plot below import matplotlib.pyplot as plt + if ax is None and len(plt.get_fignums()) > 0: ax = None with plt.rc_context(): ax = plt.gca() ax = MPLPlot._get_ax_layer(ax) - return _plot(data, kind=kind, ax=ax, - figsize=figsize, use_index=use_index, title=title, - grid=grid, legend=legend, - style=style, logx=logx, logy=logy, loglog=loglog, - xticks=xticks, yticks=yticks, xlim=xlim, ylim=ylim, - rot=rot, fontsize=fontsize, colormap=colormap, table=table, - yerr=yerr, xerr=xerr, - label=label, secondary_y=secondary_y, - **kwds) - - -def plot_frame(data, x=None, y=None, kind='line', ax=None, - subplots=None, sharex=None, sharey=False, layout=None, - figsize=None, use_index=True, title=None, grid=None, - legend=True, style=None, logx=False, logy=False, - loglog=False, xticks=None, yticks=None, xlim=None, - ylim=None, rot=None, fontsize=None, colormap=None, - table=False, yerr=None, xerr=None, secondary_y=False, - sort_columns=False, **kwds): + return _plot( + data, + kind=kind, + ax=ax, + figsize=figsize, + use_index=use_index, + title=title, + grid=grid, + legend=legend, + style=style, + logx=logx, + logy=logy, + loglog=loglog, + xticks=xticks, + yticks=yticks, + xlim=xlim, + ylim=ylim, + rot=rot, + fontsize=fontsize, + colormap=colormap, + table=table, + yerr=yerr, + xerr=xerr, + label=label, + secondary_y=secondary_y, + **kwds + ) + + +def plot_frame( + data, + x=None, + y=None, + kind="line", + ax=None, + subplots=None, + sharex=None, + sharey=False, + layout=None, + figsize=None, + use_index=True, + title=None, + grid=None, + legend=True, + style=None, + logx=False, + logy=False, + loglog=False, + xticks=None, + yticks=None, + xlim=None, + ylim=None, + rot=None, + fontsize=None, + colormap=None, + table=False, + yerr=None, + xerr=None, + secondary_y=False, + sort_columns=False, + **kwds +): """ Make plots of DataFrames using matplotlib / pylab. @@ -893,32 +1049,56 @@ def plot_frame(data, x=None, y=None, kind='line', ax=None, From 0 (left/bottom-end) to 1 (right/top-end). 
Default is 0.5 (center) """ - return _plot(data, kind=kind, x=x, y=y, ax=ax, - figsize=figsize, use_index=use_index, title=title, - grid=grid, legend=legend, subplots=subplots, - style=style, logx=logx, logy=logy, loglog=loglog, - xticks=xticks, yticks=yticks, xlim=xlim, ylim=ylim, - rot=rot, fontsize=fontsize, colormap=colormap, table=table, - yerr=yerr, xerr=xerr, sharex=sharex, sharey=sharey, - secondary_y=secondary_y, layout=layout, sort_columns=sort_columns, - **kwds) + return _plot( + data, + kind=kind, + x=x, + y=y, + ax=ax, + figsize=figsize, + use_index=use_index, + title=title, + grid=grid, + legend=legend, + subplots=subplots, + style=style, + logx=logx, + logy=logy, + loglog=loglog, + xticks=xticks, + yticks=yticks, + xlim=xlim, + ylim=ylim, + rot=rot, + fontsize=fontsize, + colormap=colormap, + table=table, + yerr=yerr, + xerr=xerr, + sharex=sharex, + sharey=sharey, + secondary_y=secondary_y, + layout=layout, + sort_columns=sort_columns, + **kwds + ) -def _plot(data, x=None, y=None, subplots=False, - ax=None, kind='line', **kwds): +def _plot(data, x=None, y=None, subplots=False, ax=None, kind="line", **kwds): from databricks.koalas import DataFrame + # function copied from pandas.plotting._core # and adapted to handle Koalas DataFrame and Series kind = kind.lower().strip() - kind = {'density': 'kde'}.get(kind, kind) + kind = {"density": "kde"}.get(kind, kind) if kind in _all_kinds: klass = _plot_klass[kind] else: raise ValueError("%r is not a valid plot kind" % kind) # scatter and hexbin are inherited from PlanePlot which require x and y - if kind in ('scatter', 'hexbin'): + if kind in ("scatter", "hexbin"): plot_obj = klass(data, x, y, subplots=subplots, ax=ax, kind=kind, **kwds) else: @@ -948,22 +1128,61 @@ class KoalasSeriesPlotMethods(PandasObject): def __init__(self, data): self.data = data - def __call__(self, kind='line', ax=None, - figsize=None, use_index=True, title=None, grid=None, - legend=False, style=None, logx=False, logy=False, - loglog=False, xticks=None, yticks=None, - xlim=None, ylim=None, - rot=None, fontsize=None, colormap=None, table=False, - yerr=None, xerr=None, - label=None, secondary_y=False, **kwds): - return plot_series(self.data, kind=kind, ax=ax, figsize=figsize, - use_index=use_index, title=title, grid=grid, - legend=legend, style=style, logx=logx, logy=logy, - loglog=loglog, xticks=xticks, yticks=yticks, - xlim=xlim, ylim=ylim, rot=rot, fontsize=fontsize, - colormap=colormap, table=table, yerr=yerr, - xerr=xerr, label=label, secondary_y=secondary_y, - **kwds) + def __call__( + self, + kind="line", + ax=None, + figsize=None, + use_index=True, + title=None, + grid=None, + legend=False, + style=None, + logx=False, + logy=False, + loglog=False, + xticks=None, + yticks=None, + xlim=None, + ylim=None, + rot=None, + fontsize=None, + colormap=None, + table=False, + yerr=None, + xerr=None, + label=None, + secondary_y=False, + **kwds + ): + return plot_series( + self.data, + kind=kind, + ax=ax, + figsize=figsize, + use_index=use_index, + title=title, + grid=grid, + legend=legend, + style=style, + logx=logx, + logy=logy, + loglog=loglog, + xticks=xticks, + yticks=yticks, + xlim=xlim, + ylim=ylim, + rot=rot, + fontsize=fontsize, + colormap=colormap, + table=table, + yerr=yerr, + xerr=xerr, + label=label, + secondary_y=secondary_y, + **kwds + ) + __call__.__doc__ = plot_series.__doc__ def line(self, x=None, y=None, **kwargs): @@ -1031,7 +1250,7 @@ def bar(self, **kwds): >>> s = ks.Series([1, 3, 2]) >>> ax = s.plot.bar() """ - return self(kind='bar', **kwds) + 
return self(kind="bar", **kwds) def barh(self, **kwds): """ @@ -1068,7 +1287,7 @@ def barh(self, **kwds): >>> df = ks.DataFrame({'lab': ['A', 'B', 'C'], 'val': [10, 30, 20]}) >>> plot = df.val.plot.barh() """ - return self(kind='barh', **kwds) + return self(kind="barh", **kwds) def box(self, **kwds): """ @@ -1113,7 +1332,7 @@ def box(self, **kwds): >>> df = ks.DataFrame(data, columns=list('ABCD')) >>> ax = df['A'].plot.box() """ - return self(kind='box', **kwds) + return self(kind="box", **kwds) def hist(self, bins=10, **kwds): """ @@ -1141,7 +1360,7 @@ def hist(self, bins=10, **kwds): >>> s = ks.Series([1, 3, 2]) >>> ax = s.plot.hist() """ - return self(kind='hist', bins=bins, **kwds) + return self(kind="hist", bins=bins, **kwds) def kde(self, bw_method=None, ind=None, **kwargs): """ @@ -1232,7 +1451,7 @@ def area(self, **kwds): ... freq='M')) >>> plot = df.sales.plot.area() """ - return self(kind='area', **kwds) + return self(kind="area", **kwds) def pie(self, **kwds): """ @@ -1273,7 +1492,7 @@ def pie(self, **kwds): >>> plot = df.mass.plot.pie(subplots=True, figsize=(6, 3)) """ - return self(kind='pie', **kwds) + return self(kind="pie", **kwds) class KoalasFramePlotMethods(PandasObject): @@ -1285,25 +1504,76 @@ class KoalasFramePlotMethods(PandasObject): with the ``kind`` argument: ``df.plot(kind='hist')`` is equivalent to ``df.plot.hist()`` """ + def __init__(self, data): self.data = data - def __call__(self, x=None, y=None, kind='line', ax=None, - subplots=None, sharex=None, sharey=False, layout=None, - figsize=None, use_index=True, title=None, grid=None, - legend=True, style=None, logx=False, logy=False, - loglog=False, xticks=None, yticks=None, xlim=None, - ylim=None, rot=None, fontsize=None, colormap=None, - table=False, yerr=None, xerr=None, secondary_y=False, - sort_columns=False, **kwds): - return plot_frame(self.data, x=x, y=y, kind=kind, ax=ax, - subplots=subplots, sharex=sharex, sharey=sharey, layout=layout, - figsize=figsize, use_index=use_index, title=title, grid=grid, - legend=legend, style=style, logx=logx, logy=logy, - loglog=loglog, xticks=xticks, yticks=yticks, xlim=xlim, - ylim=ylim, rot=rot, fontsize=fontsize, colormap=colormap, - table=table, yerr=yerr, xerr=xerr, secondary_y=secondary_y, - sort_columns=sort_columns, **kwds) + def __call__( + self, + x=None, + y=None, + kind="line", + ax=None, + subplots=None, + sharex=None, + sharey=False, + layout=None, + figsize=None, + use_index=True, + title=None, + grid=None, + legend=True, + style=None, + logx=False, + logy=False, + loglog=False, + xticks=None, + yticks=None, + xlim=None, + ylim=None, + rot=None, + fontsize=None, + colormap=None, + table=False, + yerr=None, + xerr=None, + secondary_y=False, + sort_columns=False, + **kwds + ): + return plot_frame( + self.data, + x=x, + y=y, + kind=kind, + ax=ax, + subplots=subplots, + sharex=sharex, + sharey=sharey, + layout=layout, + figsize=figsize, + use_index=use_index, + title=title, + grid=grid, + legend=legend, + style=style, + logx=logx, + logy=logy, + loglog=loglog, + xticks=xticks, + yticks=yticks, + xlim=xlim, + ylim=ylim, + rot=rot, + fontsize=fontsize, + colormap=colormap, + table=table, + yerr=yerr, + xerr=xerr, + secondary_y=secondary_y, + sort_columns=sort_columns, + **kwds + ) def line(self, x=None, y=None, **kwargs): """ @@ -1358,7 +1628,7 @@ def line(self, x=None, y=None, **kwargs): >>> lines = df.plot.line(x='pig', y='horse') """ - return self(kind='line', x=x, y=y, **kwargs) + return self(kind="line", x=x, y=y, **kwargs) def kde(self, bw_method=None, 
ind=None, **kwargs): """ @@ -1452,13 +1722,9 @@ def pie(self, y=None, **kwds): from databricks.koalas import DataFrame # Pandas will raise an error if y is None and subplots if not True - if ( - isinstance(self.data, DataFrame) - and y is None - and not kwds.get("subplots", False) - ): + if isinstance(self.data, DataFrame) and y is None and not kwds.get("subplots", False): raise ValueError("pie requires either y column or 'subplots=True'") - return self(kind='pie', y=y, **kwds) + return self(kind="pie", y=y, **kwds) def area(self, x=None, y=None, stacked=True, **kwds): """ @@ -1499,7 +1765,7 @@ def area(self, x=None, y=None, stacked=True, **kwds): ... freq='M')) >>> plot = df.plot.area() """ - return self(kind='area', x=x, y=y, stacked=stacked, **kwds) + return self(kind="area", x=x, y=y, stacked=stacked, **kwds) def bar(self, x=None, y=None, **kwds): """ @@ -1570,7 +1836,7 @@ def bar(self, x=None, y=None, **kwds): >>> ax = df.plot.bar(x='lifespan', rot=0) """ - return self(kind='bar', x=x, y=y, **kwds) + return self(kind="bar", x=x, y=y, **kwds) def barh(self, x=None, y=None, **kwargs): """ @@ -1647,13 +1913,13 @@ def barh(self, x=None, y=None, **kwargs): ... 'lifespan': lifespan}, index=index) >>> ax = df.plot.barh(x='lifespan') """ - return self(kind='barh', x=x, y=y, **kwargs) + return self(kind="barh", x=x, y=y, **kwargs) def hexbin(self, **kwds): - return _unsupported_function(class_name='pd.DataFrame', method_name='hexbin')() + return _unsupported_function(class_name="pd.DataFrame", method_name="hexbin")() def box(self, **kwds): - return _unsupported_function(class_name='pd.DataFrame', method_name='box')() + return _unsupported_function(class_name="pd.DataFrame", method_name="box")() def hist(self, bins=10, **kwds): """ @@ -1700,7 +1966,7 @@ def hist(self, bins=10, **kwds): >>> df = ks.from_pandas(df) >>> ax = df.plot.hist(bins=12, alpha=0.5) """ - return self(kind='hist', bins=bins, **kwds) + return self(kind="hist", bins=bins, **kwds) def scatter(self, x, y, s=None, c=None, **kwds): """ diff --git a/databricks/koalas/series.py b/databricks/koalas/series.py index 8d03d15..fef783c 100644 --- a/databricks/koalas/series.py +++ b/databricks/koalas/series.py @@ -40,14 +40,22 @@ from databricks.koalas.exceptions import SparkPandasIndexingError from databricks.koalas.frame import DataFrame from databricks.koalas.generic import _Frame -from databricks.koalas.internal import (_InternalFrame, NATURAL_ORDER_COLUMN_NAME, - SPARK_DEFAULT_INDEX_NAME) +from databricks.koalas.internal import ( + _InternalFrame, + NATURAL_ORDER_COLUMN_NAME, + SPARK_DEFAULT_INDEX_NAME, +) from databricks.koalas.missing.series import _MissingPandasLikeSeries from databricks.koalas.plot import KoalasSeriesPlotMethods from databricks.koalas.ml import corr -from databricks.koalas.utils import (validate_arguments_and_invoke_function, scol_for, - combine_frames, name_like_string, validate_axis, - validate_bool_kwarg) +from databricks.koalas.utils import ( + validate_arguments_and_invoke_function, + scol_for, + combine_frames, + name_like_string, + validate_axis, + validate_bool_kwarg, +) from databricks.koalas.datetimes import DatetimeMethods from databricks.koalas.strings import StringMethods @@ -309,8 +317,9 @@ class Series(_Frame, IndexOpsMixin, Generic[T]): Copy input data """ - def __init__(self, data=None, index=None, dtype=None, name=None, copy=False, fastpath=False, - anchor=None): + def __init__( + self, data=None, index=None, dtype=None, name=None, copy=False, fastpath=False, anchor=None + ): if isinstance(data, 
_InternalFrame): assert dtype is None assert name is None @@ -328,12 +337,14 @@ def __init__(self, data=None, index=None, dtype=None, name=None, copy=False, fas s = data else: s = pd.Series( - data=data, index=index, dtype=dtype, name=name, copy=copy, fastpath=fastpath) + data=data, index=index, dtype=dtype, name=name, copy=copy, fastpath=fastpath + ) kdf = DataFrame(s) - IndexOpsMixin.__init__(self, - kdf._internal.copy(scol=kdf._internal.column_scols[0]), kdf) + IndexOpsMixin.__init__( + self, kdf._internal.copy(scol=kdf._internal.column_scols[0]), kdf + ) - def _with_new_scol(self, scol: spark.Column) -> 'Series': + def _with_new_scol(self, scol: spark.Column) -> "Series": """ Copy Koalas Series with the new Spark Column. @@ -378,31 +389,34 @@ def add(self, other): return (self + other).rename(self.name) add.__doc__ = _flex_doc_SERIES.format( - desc='Addition', + desc="Addition", op_name="+", equiv="series + other", - reverse='radd', - series_examples=_add_example_SERIES) + reverse="radd", + series_examples=_add_example_SERIES, + ) def radd(self, other): return (other + self).rename(self.name) radd.__doc__ = _flex_doc_SERIES.format( - desc='Reverse Addition', + desc="Reverse Addition", op_name="+", equiv="other + series", - reverse='add', - series_examples=_add_example_SERIES) + reverse="add", + series_examples=_add_example_SERIES, + ) def div(self, other): return (self / other).rename(self.name) div.__doc__ = _flex_doc_SERIES.format( - desc='Floating division', + desc="Floating division", op_name="/", equiv="series / other", - reverse='rdiv', - series_examples=_div_example_SERIES) + reverse="rdiv", + series_examples=_div_example_SERIES, + ) divide = div @@ -410,41 +424,45 @@ def rdiv(self, other): return (other / self).rename(self.name) rdiv.__doc__ = _flex_doc_SERIES.format( - desc='Reverse Floating division', + desc="Reverse Floating division", op_name="/", equiv="other / series", - reverse='div', - series_examples=_div_example_SERIES) + reverse="div", + series_examples=_div_example_SERIES, + ) def truediv(self, other): return (self / other).rename(self.name) truediv.__doc__ = _flex_doc_SERIES.format( - desc='Floating division', + desc="Floating division", op_name="/", equiv="series / other", - reverse='rtruediv', - series_examples=_div_example_SERIES) + reverse="rtruediv", + series_examples=_div_example_SERIES, + ) def rtruediv(self, other): return (other / self).rename(self.name) rtruediv.__doc__ = _flex_doc_SERIES.format( - desc='Reverse Floating division', + desc="Reverse Floating division", op_name="/", equiv="other / series", - reverse='truediv', - series_examples=_div_example_SERIES) + reverse="truediv", + series_examples=_div_example_SERIES, + ) def mul(self, other): return (self * other).rename(self.name) mul.__doc__ = _flex_doc_SERIES.format( - desc='Multiplication', + desc="Multiplication", op_name="*", equiv="series * other", - reverse='rmul', - series_examples=_mul_example_SERIES) + reverse="rmul", + series_examples=_mul_example_SERIES, + ) multiply = mul @@ -452,21 +470,23 @@ def rmul(self, other): return (other * self).rename(self.name) rmul.__doc__ = _flex_doc_SERIES.format( - desc='Reverse Multiplication', + desc="Reverse Multiplication", op_name="*", equiv="other * series", - reverse='mul', - series_examples=_mul_example_SERIES) + reverse="mul", + series_examples=_mul_example_SERIES, + ) def sub(self, other): return (self - other).rename(self.name) sub.__doc__ = _flex_doc_SERIES.format( - desc='Subtraction', + desc="Subtraction", op_name="-", equiv="series - other", 
- reverse='rsub', - series_examples=_sub_example_SERIES) + reverse="rsub", + series_examples=_sub_example_SERIES, + ) subtract = sub @@ -474,71 +494,78 @@ def rsub(self, other): return (other - self).rename(self.name) rsub.__doc__ = _flex_doc_SERIES.format( - desc='Reverse Subtraction', + desc="Reverse Subtraction", op_name="-", equiv="other - series", - reverse='sub', - series_examples=_sub_example_SERIES) + reverse="sub", + series_examples=_sub_example_SERIES, + ) def mod(self, other): return (self % other).rename(self.name) mod.__doc__ = _flex_doc_SERIES.format( - desc='Modulo', - op_name='%', - equiv='series % other', - reverse='rmod', - series_examples=_mod_example_SERIES) + desc="Modulo", + op_name="%", + equiv="series % other", + reverse="rmod", + series_examples=_mod_example_SERIES, + ) def rmod(self, other): return (other % self).rename(self.name) rmod.__doc__ = _flex_doc_SERIES.format( - desc='Reverse Modulo', - op_name='%', - equiv='other % series', - reverse='mod', - series_examples=_mod_example_SERIES) + desc="Reverse Modulo", + op_name="%", + equiv="other % series", + reverse="mod", + series_examples=_mod_example_SERIES, + ) def pow(self, other): return (self ** other).rename(self.name) pow.__doc__ = _flex_doc_SERIES.format( - desc='Exponential power of series', - op_name='**', - equiv='series ** other', - reverse='rpow', - series_examples=_pow_example_SERIES) + desc="Exponential power of series", + op_name="**", + equiv="series ** other", + reverse="rpow", + series_examples=_pow_example_SERIES, + ) def rpow(self, other): return (other ** self).rename(self.name) rpow.__doc__ = _flex_doc_SERIES.format( - desc='Reverse Exponential power', - op_name='**', - equiv='other ** series', - reverse='pow', - series_examples=_pow_example_SERIES) + desc="Reverse Exponential power", + op_name="**", + equiv="other ** series", + reverse="pow", + series_examples=_pow_example_SERIES, + ) def floordiv(self, other): return (self // other).rename(self.name) floordiv.__doc__ = _flex_doc_SERIES.format( - desc='Integer division', - op_name='//', - equiv='series // other', - reverse='rfloordiv', - series_examples=_floordiv_example_SERIES) + desc="Integer division", + op_name="//", + equiv="series // other", + reverse="rfloordiv", + series_examples=_floordiv_example_SERIES, + ) def rfloordiv(self, other): return (other // self).rename(self.name) rfloordiv.__doc__ = _flex_doc_SERIES.format( - desc='Reverse Integer division', - op_name='//', - equiv='other // series', - reverse='floordiv', - series_examples=_floordiv_example_SERIES) + desc="Reverse Integer division", + op_name="//", + equiv="other // series", + reverse="floordiv", + series_examples=_floordiv_example_SERIES, + ) # Comparison Operators def eq(self, other): @@ -759,7 +786,7 @@ def between(self, left, right, inclusive=True): lmask = self > left rmask = self < right - return (lmask & rmask) + return lmask & rmask # TODO: arg should support Series # TODO: NaN and None @@ -853,7 +880,7 @@ def map(self, arg): else: return self.apply(arg) - def astype(self, dtype) -> 'Series': + def astype(self, dtype) -> "Series": """ Cast a Koalas object to a specified dtype ``dtype``. 
@@ -885,6 +912,7 @@ def astype(self, dtype) -> 'Series': Name: 0, dtype: int64 """ from databricks.koalas.typedef import as_spark_type + spark_type = as_spark_type(dtype) if not spark_type: raise ValueError("Type {} not understood".format(dtype)) @@ -897,7 +925,8 @@ def getField(self, name): fnames = self.spark_type.fieldNames() if name not in fnames: raise AttributeError( - "Field {} not found, possible values are {}".format(name, ", ".join(fnames))) + "Field {} not found, possible values are {}".format(name, ", ".join(fnames)) + ) return self._with_new_scol(self._scol.getField(name)) def alias(self, name): @@ -907,7 +936,7 @@ def alias(self, name): @property def shape(self): """Return a tuple of the shape of the underlying data.""" - return len(self), + return (len(self),) @property def name(self) -> Union[str, Tuple[str, ...]]: @@ -964,8 +993,9 @@ def rename(self, index: Union[str, Tuple[str, ...]] = None, **kwargs): scol = self._scol.alias(name_like_string(index)) internal = self._internal.copy( scol=scol, - column_labels=[index if index is None or isinstance(index, tuple) else (index,)]) - if kwargs.get('inplace', False): + column_labels=[index if index is None or isinstance(index, tuple) else (index,)], + ) + if kwargs.get("inplace", False): self._internal = internal return self else: @@ -1006,8 +1036,8 @@ def is_unique(self): # This workaround is in order to calculate the distinct count including nulls in # single pass. Note that COUNT(DISTINCT expr) in Spark is designed to ignore nulls. return self._internal._sdf.select( - (F.count(scol) == F.countDistinct(scol)) & - (F.count(F.when(scol.isNull(), 1).otherwise(None)) <= 1) + (F.count(scol) == F.countDistinct(scol)) + & (F.count(F.when(scol.isNull(), 1).otherwise(None)) <= 1) ).collect()[0][0] def reset_index(self, level=None, drop=False, name=None, inplace=False): @@ -1085,7 +1115,7 @@ def reset_index(self, level=None, drop=False, name=None, inplace=False): """ inplace = validate_bool_kwarg(inplace, "inplace") if inplace and not drop: - raise TypeError('Cannot reset_index inplace on a Series to create a DataFrame') + raise TypeError("Cannot reset_index inplace on a Series to create a DataFrame") if name is not None: kdf = self.rename(name).to_dataframe() @@ -1140,23 +1170,34 @@ def to_frame(self, name: Union[str, Tuple[str, ...]] = None) -> spark.DataFrame: sdf = renamed._internal.spark_internal_df column_labels = None # type: Optional[List[Tuple[str, ...]]] if renamed._internal.column_labels[0] is None: - column_labels = [('0',)] + column_labels = [("0",)] column_label_names = None else: column_labels = renamed._internal.column_labels column_label_names = renamed._internal.column_label_names - internal = _InternalFrame(sdf=sdf, - index_map=renamed._internal.index_map, - column_labels=column_labels, - column_scols=[scol_for(sdf, sdf.columns[-1])], - column_label_names=column_label_names) + internal = _InternalFrame( + sdf=sdf, + index_map=renamed._internal.index_map, + column_labels=column_labels, + column_scols=[scol_for(sdf, sdf.columns[-1])], + column_label_names=column_label_names, + ) return DataFrame(internal) to_dataframe = to_frame - def to_string(self, buf=None, na_rep='NaN', float_format=None, header=True, - index=True, length=False, dtype=False, name=False, - max_rows=None): + def to_string( + self, + buf=None, + na_rep="NaN", + float_format=None, + header=True, + index=True, + length=False, + dtype=False, + name=False, + max_rows=None, + ): """ Render a string representation of the Series. 
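# --- Aside: a minimal, self-contained sketch (illustrative only, not part of this patch)
# --- of the single-pass uniqueness check reformatted in is_unique above. COUNT(DISTINCT)
# --- ignores nulls, so nulls are counted separately; the data and column name are made up.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([(1,), (2,), (None,)], "x: int")
is_unique = sdf.select(
    (F.count("x") == F.countDistinct("x"))
    & (F.count(F.when(F.col("x").isNull(), 1).otherwise(None)) <= 1)
).collect()[0][0]
print(is_unique)  # True: non-null values are distinct and there is at most one null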
@@ -1212,7 +1253,8 @@ def to_string(self, buf=None, na_rep='NaN', float_format=None, header=True, kseries = self return validate_arguments_and_invoke_function( - kseries._to_internal_pandas(), self.to_string, pd.Series.to_string, args) + kseries._to_internal_pandas(), self.to_string, pd.Series.to_string, args + ) def to_clipboard(self, excel=True, sep=None, **kwargs): # Docstring defined below by reusing DataFrame.to_clipboard's. @@ -1220,7 +1262,8 @@ def to_clipboard(self, excel=True, sep=None, **kwargs): kseries = self return validate_arguments_and_invoke_function( - kseries._to_internal_pandas(), self.to_clipboard, pd.Series.to_clipboard, args) + kseries._to_internal_pandas(), self.to_clipboard, pd.Series.to_clipboard, args + ) to_clipboard.__doc__ = DataFrame.to_clipboard.__doc__ @@ -1263,17 +1306,37 @@ def to_dict(self, into=dict): args = locals() kseries = self return validate_arguments_and_invoke_function( - kseries._to_internal_pandas(), self.to_dict, pd.Series.to_dict, args) - - def to_latex(self, buf=None, columns=None, col_space=None, header=True, index=True, - na_rep='NaN', formatters=None, float_format=None, sparsify=None, index_names=True, - bold_rows=False, column_format=None, longtable=None, escape=None, encoding=None, - decimal='.', multicolumn=None, multicolumn_format=None, multirow=None): + kseries._to_internal_pandas(), self.to_dict, pd.Series.to_dict, args + ) + + def to_latex( + self, + buf=None, + columns=None, + col_space=None, + header=True, + index=True, + na_rep="NaN", + formatters=None, + float_format=None, + sparsify=None, + index_names=True, + bold_rows=False, + column_format=None, + longtable=None, + escape=None, + encoding=None, + decimal=".", + multicolumn=None, + multicolumn_format=None, + multirow=None, + ): args = locals() kseries = self return validate_arguments_and_invoke_function( - kseries._to_internal_pandas(), self.to_latex, pd.Series.to_latex, args) + kseries._to_internal_pandas(), self.to_latex, pd.Series.to_latex, args + ) to_latex.__doc__ = DataFrame.to_latex.__doc__ @@ -1445,7 +1508,7 @@ def _fillna(self, value=None, method=None, axis=None, inplace=False, limit=None, raise NotImplementedError("fillna currently only works for axis=0 or axis='index'") if (value is None) and (method is None): raise ValueError("Must specify a fillna 'value' or 'method' parameter.") - if (method is not None) and (method not in ['ffill', 'pad', 'backfill', 'bfill']): + if (method is not None) and (method not in ["ffill", "pad", "backfill", "bfill"]): raise ValueError("Expecting 'pad', 'ffill', 'backfill' or 'bfill'.") if self.isnull().sum() == 0: if inplace: @@ -1461,17 +1524,17 @@ def _fillna(self, value=None, method=None, axis=None, inplace=False, limit=None, if not isinstance(value, (float, int, str, bool)): raise TypeError("Unsupported type %s" % type(value)) if limit is not None: - raise ValueError('limit parameter for value is not support now') + raise ValueError("limit parameter for value is not support now") scol = F.when(scol.isNull(), value).otherwise(scol) else: - if method in ['ffill', 'pad']: + if method in ["ffill", "pad"]: func = F.last - end = (Window.currentRow - 1) + end = Window.currentRow - 1 if limit is not None: begin = Window.currentRow - limit else: begin = Window.unboundedPreceding - elif method in ['bfill', 'backfill']: + elif method in ["bfill", "backfill"]: func = F.first begin = Window.currentRow + 1 if limit is not None: @@ -1479,8 +1542,11 @@ def _fillna(self, value=None, method=None, axis=None, inplace=False, limit=None, else: end = 
Window.unboundedFollowing - window = Window.partitionBy(*part_cols).orderBy(NATURAL_ORDER_COLUMN_NAME) \ + window = ( + Window.partitionBy(*part_cols) + .orderBy(NATURAL_ORDER_COLUMN_NAME) .rowsBetween(begin, end) + ) scol = F.when(scol.isNull(), func(scol, True).over(window)).otherwise(scol) kseries = self._with_new_scol(scol).rename(column_name) if inplace: @@ -1540,7 +1606,7 @@ def dropna(self, axis=0, inplace=False, **kwargs): else: return kseries - def clip(self, lower: Union[float, int] = None, upper: Union[float, int] = None) -> 'Series': + def clip(self, lower: Union[float, int] = None, upper: Union[float, int] = None) -> "Series": """ Trim values at input threshold(s). @@ -1575,10 +1641,12 @@ def clip(self, lower: Union[float, int] = None, upper: Union[float, int] = None) """ return _col(self.to_dataframe().clip(lower, upper)) - def drop(self, - labels=None, - index: Union[str, Tuple[str, ...], List[str], List[Tuple[str, ...]]] = None, - level=None): + def drop( + self, + labels=None, + index: Union[str, Tuple[str, ...], List[str], List[Tuple[str, ...]]] = None, + level=None, + ): """ Return Series with specified index labels removed. @@ -1704,24 +1772,38 @@ def drop(self, elif isinstance(index, tuple): index = [index] else: - if not (all((isinstance(idxes, str) for idxes in index)) or - all((isinstance(idxes, tuple) for idxes in index))): - raise ValueError("If the given index is a list, it " - "should only contains names as strings, " - "or a list of tuples that contain " - "index names as strings") - index = [idxes if isinstance(idxes, tuple) else (idxes,) # type: ignore - for idxes in index] + if not ( + all((isinstance(idxes, str) for idxes in index)) + or all((isinstance(idxes, tuple) for idxes in index)) + ): + raise ValueError( + "If the given index is a list, it " + "should only contains names as strings, " + "or a list of tuples that contain " + "index names as strings" + ) + new_index = [] + for idxes in index: + if isinstance(idxes, tuple): + new_index.append(idxes) + else: + new_index.append((idxes,)) + index = new_index drop_index_scols = [] for idxes in index: try: - index_scols = [self._internal.index_scols[lvl] == idx - for lvl, idx in enumerate(idxes, level)] + index_scols = [ + self._internal.index_scols[lvl] == idx + for lvl, idx in enumerate(idxes, level) + ] except IndexError: if level_param is None: - raise KeyError("Key length ({}) exceeds index depth ({})" - .format(len(self._internal.index_scols), len(idxes))) + raise KeyError( + "Key length ({}) exceeds index depth ({})".format( + len(self._internal.index_scols), len(idxes) + ) + ) else: return self drop_index_scols.append(reduce(lambda x, y: x & y, index_scols)) @@ -1797,15 +1879,18 @@ def unique(self): Name: (x, a), dtype: int64 """ sdf = self._internal.sdf.select(self._scol).distinct() - internal = _InternalFrame(sdf=sdf, - index_map=None, - column_labels=[self._internal.column_labels[0]], - column_scols=[scol_for(sdf, self._internal.data_columns[0])], - column_label_names=self._internal.column_label_names) + internal = _InternalFrame( + sdf=sdf, + index_map=None, + column_labels=[self._internal.column_labels[0]], + column_scols=[scol_for(sdf, self._internal.data_columns[0])], + column_label_names=self._internal.column_label_names, + ) return _col(DataFrame(internal)) - def sort_values(self, ascending: bool = True, inplace: bool = False, - na_position: str = 'last') -> Union['Series', None]: + def sort_values( + self, ascending: bool = True, inplace: bool = False, na_position: str = "last" + ) -> 
Union["Series", None]: """ Sort by the values. @@ -1898,8 +1983,11 @@ def sort_values(self, ascending: bool = True, inplace: bool = False, Name: 0, dtype: object """ inplace = validate_bool_kwarg(inplace, "inplace") - kseries = _col(self.to_dataframe().sort_values(by=self.name, ascending=ascending, - na_position=na_position)) + kseries = _col( + self.to_dataframe().sort_values( + by=self.name, ascending=ascending, na_position=na_position + ) + ) if inplace: self._internal = kseries._internal self._kdf = kseries._kdf @@ -1907,10 +1995,15 @@ def sort_values(self, ascending: bool = True, inplace: bool = False, else: return kseries - def sort_index(self, axis: int = 0, - level: Optional[Union[int, List[int]]] = None, ascending: bool = True, - inplace: bool = False, kind: str = None, na_position: str = 'last') \ - -> Optional['Series']: + def sort_index( + self, + axis: int = 0, + level: Optional[Union[int, List[int]]] = None, + ascending: bool = True, + inplace: bool = False, + kind: str = None, + na_position: str = "last", + ) -> Optional["Series"]: """ Sort object by labels (along an axis) @@ -1986,8 +2079,11 @@ def sort_index(self, axis: int = 0, Name: 0, dtype: int64 """ inplace = validate_bool_kwarg(inplace, "inplace") - kseries = _col(self.to_dataframe().sort_index(axis=axis, level=level, ascending=ascending, - kind=kind, na_position=na_position)) + kseries = _col( + self.to_dataframe().sort_index( + axis=axis, level=level, ascending=ascending, kind=kind, na_position=na_position + ) + ) if inplace: self._internal = kseries._internal self._kdf = kseries._kdf @@ -2039,9 +2135,13 @@ def add_prefix(self, prefix): kdf = self.to_dataframe() internal = kdf._internal sdf = internal.sdf - sdf = sdf.select([F.concat(F.lit(prefix), - internal.scol_for(index_column)).alias(index_column) - for index_column in internal.index_columns] + internal.column_scols) + sdf = sdf.select( + [ + F.concat(F.lit(prefix), internal.scol_for(index_column)).alias(index_column) + for index_column in internal.index_columns + ] + + internal.column_scols + ) kdf._internal = internal.with_new_sdf(sdf) return _col(kdf) @@ -2089,13 +2189,17 @@ def add_suffix(self, suffix): kdf = self.to_dataframe() internal = kdf._internal sdf = internal.sdf - sdf = sdf.select([F.concat(internal.scol_for(index_column), - F.lit(suffix)).alias(index_column) - for index_column in internal.index_columns] + internal.column_scols) + sdf = sdf.select( + [ + F.concat(internal.scol_for(index_column), F.lit(suffix)).alias(index_column) + for index_column in internal.index_columns + ] + + internal.column_scols + ) kdf._internal = internal.with_new_sdf(sdf) return _col(kdf) - def corr(self, other, method='pearson'): + def corr(self, other, method="pearson"): """ Compute correlation with `other` Series, excluding missing values. @@ -2134,13 +2238,13 @@ def corr(self, other, method='pearson'): """ # This implementation is suboptimal because it computes more than necessary, # but it should be a start - columns = ['__corr_arg1__', '__corr_arg2__'] + columns = ["__corr_arg1__", "__corr_arg2__"] kdf = self._kdf.assign(__corr_arg1__=self, __corr_arg2__=other)[columns] kdf.columns = columns c = corr(kdf, method=method) return c.loc[tuple(columns)] - def nsmallest(self, n: int = 5) -> 'Series': + def nsmallest(self, n: int = 5) -> "Series": """ Return the smallest `n` elements. 
@@ -2200,7 +2304,7 @@ def nsmallest(self, n: int = 5) -> 'Series': """ return _col(self.to_frame().nsmallest(n=n, columns=self.name)) - def nlargest(self, n: int = 5) -> 'Series': + def nlargest(self, n: int = 5) -> "Series": """ Return the largest `n` elements. @@ -2288,8 +2392,9 @@ def count(self): """ return self._reduce_for_stat_function(_Frame._count_expr, name="count") - def append(self, to_append: 'Series', ignore_index: bool = False, - verify_integrity: bool = False) -> 'Series': + def append( + self, to_append: "Series", ignore_index: bool = False, verify_integrity: bool = False + ) -> "Series": """ Concatenate two or more Series. @@ -2340,13 +2445,20 @@ def append(self, to_append: 'Series', ignore_index: bool = False, 5 6 Name: 0, dtype: int64 """ - return _col(self.to_dataframe().append(to_append.to_dataframe(), ignore_index, - verify_integrity)) - - def sample(self, n: Optional[int] = None, frac: Optional[float] = None, replace: bool = False, - random_state: Optional[int] = None) -> 'Series': - return _col(self.to_dataframe().sample( - n=n, frac=frac, replace=replace, random_state=random_state)) + return _col( + self.to_dataframe().append(to_append.to_dataframe(), ignore_index, verify_integrity) + ) + + def sample( + self, + n: Optional[int] = None, + frac: Optional[float] = None, + replace: bool = False, + random_state: Optional[int] = None, + ) -> "Series": + return _col( + self.to_dataframe().sample(n=n, frac=frac, replace=replace, random_state=random_state) + ) sample.__doc__ = DataFrame.sample.__doc__ @@ -2480,8 +2592,7 @@ def apply(self, func, args=(), **kwds): transformed = pser.apply(func, *args, **kwds) kser = Series(transformed) - wrapped = ks.pandas_wraps( - return_col=as_python_type(kser.spark_type))(apply_each) + wrapped = ks.pandas_wraps(return_col=as_python_type(kser.spark_type))(apply_each) else: wrapped = ks.pandas_wraps(return_col=return_sig)(apply_each) return wrapped(self, *args, **kwds).rename(self.name) @@ -2722,10 +2833,10 @@ def quantile(self, q=0.5, accuracy=10000): for v in q if isinstance(q, list) else [q]: if not isinstance(v, float): raise ValueError( - "q must be a float of an array of floats; however, [%s] found." % type(v)) + "q must be a float of an array of floats; however, [%s] found." % type(v) + ) if v < 0.0 or v > 1.0: - raise ValueError( - "percentiles should all be in the interval [0, 1].") + raise ValueError("percentiles should all be in the interval [0, 1].") if isinstance(q, list): quantiles = q @@ -2743,16 +2854,20 @@ def quantile(self, q=0.5, accuracy=10000): sdf = self._internal._sdf args = ", ".join(map(str, quantiles)) percentile_col = F.expr( - "approx_percentile(`%s`, array(%s), %s)" % (self.name, args, accuracy)) + "approx_percentile(`%s`, array(%s), %s)" % (self.name, args, accuracy) + ) sdf = sdf.select(percentile_col.alias("percentiles")) internal_index_column = SPARK_DEFAULT_INDEX_NAME value_column = "value" cols = [] for i, quantile in enumerate(quantiles): - cols.append(F.struct( - F.lit("%s" % quantile).alias(internal_index_column), - F.expr("percentiles[%s]" % i).alias(value_column))) + cols.append( + F.struct( + F.lit("%s" % quantile).alias(internal_index_column), + F.expr("percentiles[%s]" % i).alias(value_column), + ) + ) sdf = sdf.select(F.array(*cols).alias("arrays")) # And then, explode it and manually set the index. 
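# --- Aside: an illustrative, standalone sketch of the approx_percentile pattern the
# --- quantile code above relies on (simplified: the real code also packs an index label
# --- into each struct before exploding). Column name and data here are hypothetical.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
sdf = spark.range(100).toDF("x")
percentiles = sdf.select(
    F.expr("approx_percentile(x, array(0.25, 0.5, 0.75), 10000)").alias("percentiles")
)
percentiles.select(F.explode("percentiles").alias("value")).show()
# one row per requested quantile of the column `x`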
@@ -2771,16 +2886,18 @@ def quantile(self, q=0.5, accuracy=10000): index_map=[(internal_index_column, None)], column_labels=None, column_scols=[scol_for(sdf, value_column)], - column_label_names=None) + column_label_names=None, + ) return DataFrame(internal)[value_column].rename(self.name) else: return self._reduce_for_stat_function( lambda _: F.expr("approx_percentile(`%s`, %s, %s)" % (self.name, q, accuracy)), - name="median") + name="median", + ) # TODO: add axis, numeric_only, pct, na_option parameter - def rank(self, method='average', ascending=True): + def rank(self, method="average", ascending=True): """ Compute numerical data ranks (1 through n) along axis. Equal values are assigned a rank that is the average of the ranks of those values. @@ -2860,46 +2977,55 @@ def rank(self, method='average', ascending=True): """ return self._rank(method, ascending) - def _rank(self, method='average', ascending=True, part_cols=()): - if method not in ['average', 'min', 'max', 'first', 'dense']: + def _rank(self, method="average", ascending=True, part_cols=()): + if method not in ["average", "min", "max", "first", "dense"]: msg = "method must be one of 'average', 'min', 'max', 'first', 'dense'" raise ValueError(msg) if len(self._internal.index_columns) > 1: - raise ValueError('rank do not support index now') + raise ValueError("rank do not support index now") if ascending: asc_func = lambda scol: scol.asc() else: asc_func = lambda scol: scol.desc() - if method == 'first': - window = Window.orderBy( - asc_func(self._internal.scol), asc_func(F.col(NATURAL_ORDER_COLUMN_NAME)) - ).partitionBy(*part_cols).rowsBetween(Window.unboundedPreceding, Window.currentRow) + if method == "first": + window = ( + Window.orderBy( + asc_func(self._internal.scol), asc_func(F.col(NATURAL_ORDER_COLUMN_NAME)) + ) + .partitionBy(*part_cols) + .rowsBetween(Window.unboundedPreceding, Window.currentRow) + ) scol = F.row_number().over(window) - elif method == 'dense': - window = Window.orderBy(asc_func(self._internal.scol)).partitionBy(*part_cols) \ + elif method == "dense": + window = ( + Window.orderBy(asc_func(self._internal.scol)) + .partitionBy(*part_cols) .rowsBetween(Window.unboundedPreceding, Window.currentRow) + ) scol = F.dense_rank().over(window) else: - if method == 'average': + if method == "average": stat_func = F.mean - elif method == 'min': + elif method == "min": stat_func = F.min - elif method == 'max': + elif method == "max": stat_func = F.max - window1 = Window.orderBy( - asc_func(self._internal.scol) - ).partitionBy(*part_cols).rowsBetween(Window.unboundedPreceding, Window.currentRow) - window2 = Window.partitionBy( - [self._internal.scol] + list(part_cols) - ).rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing) + window1 = ( + Window.orderBy(asc_func(self._internal.scol)) + .partitionBy(*part_cols) + .rowsBetween(Window.unboundedPreceding, Window.currentRow) + ) + window2 = Window.partitionBy([self._internal.scol] + list(part_cols)).rowsBetween( + Window.unboundedPreceding, Window.unboundedFollowing + ) scol = stat_func(F.row_number().over(window1)).over(window2) kser = self._with_new_scol(scol).rename(self.name) return kser.astype(np.float64) - def describe(self, percentiles: Optional[List[float]] = None) -> 'Series': + def describe(self, percentiles: Optional[List[float]] = None) -> "Series": return _col(self.to_dataframe().describe(percentiles)) describe.__doc__ = DataFrame.describe.__doc__ @@ -2974,9 +3100,12 @@ def diff(self, periods=1): def _diff(self, periods, part_cols=()): if not 
isinstance(periods, int): - raise ValueError('periods should be an int; however, got [%s]' % type(periods)) - window = Window.partitionBy(*part_cols).orderBy(NATURAL_ORDER_COLUMN_NAME) \ + raise ValueError("periods should be an int; however, got [%s]" % type(periods)) + window = ( + Window.partitionBy(*part_cols) + .orderBy(NATURAL_ORDER_COLUMN_NAME) .rowsBetween(-periods, -periods) + ) scol = self._scol - F.lag(self._scol, periods).over(window) return self._with_new_scol(scol).rename(self.name) @@ -3320,19 +3449,22 @@ def pop(self, item): if isinstance(item, str): item = (item,) if not all(isinstance(index, str) for index in item): - raise ValueError("'key' should have index names as only strings " - "or a tuple that contain index names as only strings") + raise ValueError( + "'key' should have index names as only strings " + "or a tuple that contain index names as only strings" + ) if len(self._internal._index_map) < len(item): - raise KeyError("Key length ({}) exceeds index depth ({})" - .format(len(item), len(self._internal.index_map))) - - cols = (self._internal.index_scols[len(item):] + - [self._internal.scol_for(self._internal.column_labels[0])]) - rows = [self._internal.scols[level] == index - for level, index in enumerate(item)] - sdf = self._internal.sdf \ - .select(cols) \ - .filter(reduce(lambda x, y: x & y, rows)) + raise KeyError( + "Key length ({}) exceeds index depth ({})".format( + len(item), len(self._internal.index_map) + ) + ) + + cols = self._internal.index_scols[len(item) :] + [ + self._internal.scol_for(self._internal.column_labels[0]) + ] + rows = [self._internal.scols[level] == index for level, index in enumerate(item)] + sdf = self._internal.sdf.select(cols).filter(reduce(lambda x, y: x & y, rows)) if len(self._internal._index_map) == len(item): # if sdf has one column and one data, return data only without frame @@ -3348,15 +3480,13 @@ def pop(self, item): internal = _InternalFrame(sdf=sdf, index_map=[(SPARK_DEFAULT_INDEX_NAME, None)]) return _col(DataFrame(internal)) - internal = self._internal.copy( - sdf=sdf, - index_map=self._internal._index_map[len(item):]) + internal = self._internal.copy(sdf=sdf, index_map=self._internal._index_map[len(item) :]) self._internal = self.drop(item)._internal return _col(DataFrame(internal)) - def copy(self) -> 'Series': + def copy(self) -> "Series": """ Make a copy of this object's indices and data. @@ -3461,7 +3591,7 @@ def truncate(self, before=None, after=None, copy=True): return result.copy() if copy else result - def mode(self, dropna=True) -> 'Series': + def mode(self, dropna=True) -> "Series": """ Return the mode(s) of the dataset. @@ -3537,7 +3667,7 @@ def mode(self, dropna=True) -> 'Series': sdf_count = ser_count._internal.sdf most_value = ser_count.max() sdf_most_value = sdf_count.filter("count == {}".format(most_value)) - sdf = sdf_most_value.select(F.col(SPARK_DEFAULT_INDEX_NAME).alias('0')) + sdf = sdf_most_value.select(F.col(SPARK_DEFAULT_INDEX_NAME).alias("0")) internal = _InternalFrame(sdf=sdf, index_map=None) result = _col(DataFrame(internal)) @@ -3577,7 +3707,7 @@ def keys(self): return self.index # TODO: 'regex', 'method' parameter - def replace(self, to_replace=None, value=None, regex=False) -> 'Series': + def replace(self, to_replace=None, value=None, regex=False) -> "Series": """ Replace values given in to_replace with value. Values of the Series are replaced with other values dynamically. 
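# --- Aside: an illustrative sketch (not the Koalas implementation) of the
# --- lag-over-an-ordered-window idea behind _diff above; names and data are hypothetical.
from pyspark.sql import SparkSession, Window, functions as F

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([(0, 1.0), (1, 4.0), (2, 9.0)], ["ord", "x"])
w = Window.orderBy("ord")
sdf.select((F.col("x") - F.lag("x", 1).over(w)).alias("diff")).show()
# diff: null, 3.0, 5.0 -- each row minus the previous row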
@@ -3732,14 +3862,16 @@ def replace(self, to_replace=None, value=None, regex=False) -> 'Series': if to_replace is None: return self if not isinstance(to_replace, (str, list, dict, int, float)): - raise ValueError( - "'to_replace' should be one of str, list, dict, int, float") + raise ValueError("'to_replace' should be one of str, list, dict, int, float") if regex: raise NotImplementedError("replace currently not support for regex") if isinstance(to_replace, list) and isinstance(value, list): if not len(to_replace) == len(value): - raise ValueError("Replacement lists must match in length. Expecting {} got {}" - .format(len(to_replace), len(value))) + raise ValueError( + "Replacement lists must match in length. Expecting {} got {}".format( + len(to_replace), len(value) + ) + ) to_replace = {k: v for k, v in zip(to_replace, value)} if isinstance(to_replace, dict): is_start = True @@ -3832,20 +3964,24 @@ def update(self, other): raise ValueError("'other' must be a Series") index_scol_names = [index_map[0] for index_map in self._internal.index_map] - combined = combine_frames(self.to_frame(), other.to_frame(), how='leftouter') + combined = combine_frames(self.to_frame(), other.to_frame(), how="leftouter") combined_sdf = combined._sdf this_col = "__this_%s" % str( - self._internal.column_name_for(self._internal.column_labels[0])) + self._internal.column_name_for(self._internal.column_labels[0]) + ) that_col = "__that_%s" % str( - self._internal.column_name_for(other._internal.column_labels[0])) - cond = F.when(scol_for(combined_sdf, that_col).isNotNull(), - scol_for(combined_sdf, that_col)) \ - .otherwise(combined_sdf[this_col]) \ - .alias(str(self._internal.column_name_for(self._internal.column_labels[0]))) + self._internal.column_name_for(other._internal.column_labels[0]) + ) + cond = ( + F.when(scol_for(combined_sdf, that_col).isNotNull(), scol_for(combined_sdf, that_col)) + .otherwise(combined_sdf[this_col]) + .alias(str(self._internal.column_name_for(self._internal.column_labels[0]))) + ) internal = _InternalFrame( sdf=combined_sdf.select(index_scol_names + [cond]), index_map=self._internal.index_map, - column_labels=self._internal.column_labels) + column_labels=self._internal.column_labels, + ) self_updated = _col(ks.DataFrame(internal)) self._internal = self_updated._internal self._kdf = self_updated._kdf @@ -3910,15 +4046,15 @@ def where(self, cond, other=np.nan): assert isinstance(cond, Series) # We should check the DataFrame from both `cond` and `other`. - should_try_ops_on_diff_frame = ( - cond._kdf is not self._kdf or - (isinstance(other, Series) and other._kdf is not self._kdf)) + should_try_ops_on_diff_frame = cond._kdf is not self._kdf or ( + isinstance(other, Series) and other._kdf is not self._kdf + ) if should_try_ops_on_diff_frame: # Try to perform it with 'compute.ops_on_diff_frame' option. 
kdf = self.to_frame() - kdf['__tmp_cond_col__'] = cond - kdf['__tmp_other_col__'] = other + kdf["__tmp_cond_col__"] = cond + kdf["__tmp_other_col__"] = other # above logic makes a Spark DataFrame looks like below: # +-----------------+---+----------------+-----------------+ @@ -3930,19 +4066,24 @@ def where(self, cond, other=np.nan): # | 2| 2| true| 300| # | 4| 4| true| 500| # +-----------------+---+----------------+-----------------+ - condition = F.when( - kdf['__tmp_cond_col__']._scol, kdf[self._internal.column_labels[0]]._scol - ).otherwise(kdf['__tmp_other_col__']._scol).alias(self._internal.data_columns[0]) - - internal = kdf._internal.with_new_columns([condition], - column_labels=self._internal.column_labels) + condition = ( + F.when(kdf["__tmp_cond_col__"]._scol, kdf[self._internal.column_labels[0]]._scol) + .otherwise(kdf["__tmp_other_col__"]._scol) + .alias(self._internal.data_columns[0]) + ) + + internal = kdf._internal.with_new_columns( + [condition], column_labels=self._internal.column_labels + ) return _col(DataFrame(internal)) else: if isinstance(other, Series): other = other._scol - condition = F.when( - cond._scol, self._scol - ).otherwise(other).alias(self._internal.data_columns[0]) + condition = ( + F.when(cond._scol, self._scol) + .otherwise(other) + .alias(self._internal.data_columns[0]) + ) return self._with_new_scol(condition) def mask(self, cond, other=np.nan): @@ -4076,14 +4217,13 @@ def xs(self, key, level=None): if level is None: level = 0 - cols = (self._internal.index_scols[:level] + - self._internal.index_scols[level+len(key):] + - [self._internal.scol_for(self._internal.column_labels[0])]) - rows = [self._internal.scols[lvl] == index - for lvl, index in enumerate(key, level)] - sdf = self._internal.sdf \ - .select(cols) \ - .where(reduce(lambda x, y: x & y, rows)) + cols = ( + self._internal.index_scols[:level] + + self._internal.index_scols[level + len(key) :] + + [self._internal.scol_for(self._internal.column_labels[0])] + ) + rows = [self._internal.scols[lvl] == index for lvl, index in enumerate(key, level)] + sdf = self._internal.sdf.select(cols).where(reduce(lambda x, y: x & y, rows)) if len(self._internal._index_map) == len(key): # if sdf has one column and one data, return data only without frame @@ -4095,8 +4235,8 @@ def xs(self, key, level=None): index_cols = [col for col in sdf.columns if col not in self._internal.data_columns] index_map_dict = dict(self._internal.index_map) internal = self._internal.copy( - sdf=sdf, - index_map=[(index_col, index_map_dict[index_col]) for index_col in index_cols]) + sdf=sdf, index_map=[(index_col, index_map_dict[index_col]) for index_col in index_cols] + ) return _col(DataFrame(internal)) @@ -4156,9 +4296,11 @@ def pct_change(self, periods=1): def _cum(self, func, skipna, part_cols=()): # This is used to cummin, cummax, cumsum, etc. - window = Window.orderBy( - NATURAL_ORDER_COLUMN_NAME).partitionBy(*part_cols).rowsBetween( - Window.unboundedPreceding, Window.currentRow) + window = ( + Window.orderBy(NATURAL_ORDER_COLUMN_NAME) + .partitionBy(*part_cols) + .rowsBetween(Window.unboundedPreceding, Window.currentRow) + ) if skipna: # There is a behavior difference between pandas and PySpark. In case of cummax, @@ -4189,7 +4331,8 @@ def _cum(self, func, skipna, part_cols=()): scol = F.when( # Manually sets nulls given the column defined above. - self._scol.isNull(), F.lit(None) + self._scol.isNull(), + F.lit(None), ).otherwise(func(self._scol).over(window)) else: # Here, we use two Windows. 
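# --- Aside: an illustrative sketch (not the Koalas implementation) of the expanding
# --- window used by _cum above: an aggregate over unboundedPreceding..currentRow.
# --- Names and data are hypothetical.
from pyspark.sql import SparkSession, Window, functions as F

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([(0, 1.0), (1, 2.0), (2, 3.0)], ["ord", "x"])
w = Window.orderBy("ord").rowsBetween(Window.unboundedPreceding, Window.currentRow)
sdf.select(F.sum("x").over(w).alias("cumsum")).show()
# cumsum: 1.0, 3.0, 6.0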
@@ -4225,7 +4368,7 @@ def _cum(self, func, skipna, part_cols=()): # By going through with max, it sets True after the first time it meets null. F.max(self._scol.isNull()).over(window), # Manually sets nulls given the column defined above. - F.lit(None) + F.lit(None), ).otherwise(func(self._scol).over(window)) return self._with_new_scol(scol).rename(self.name) @@ -4236,8 +4379,9 @@ def _cumprod(self, skipna, part_cols=()): def cumprod(scol): @pandas_udf(returnType=self.spark_type) def negative_check(s): - assert len(s) == 0 or ((s > 0) | (s.isnull())).all(), \ + assert len(s) == 0 or ((s > 0) | (s.isnull())).all(), ( "values should be bigger than 0: %s" % s + ) return s return F.sum(F.log(negative_check(scol))) @@ -4268,17 +4412,18 @@ def _reduce_for_stat_function(self, sfun, name, axis=None, numeric_only=None): numeric_only : not used by this implementation, but passed down by stats functions """ from inspect import signature + axis = validate_axis(axis) if axis == 1: raise ValueError("Series does not support columns axis.") num_args = len(signature(sfun).parameters) col_sdf = self._scol col_type = self.spark_type - if isinstance(col_type, BooleanType) and sfun.__name__ not in ('min', 'max'): + if isinstance(col_type, BooleanType) and sfun.__name__ not in ("min", "max"): # Stat functions cannot be used with boolean values by default # Thus, cast to integer (true to 1 and false to 0) # Exclude the min and max methods though since those work with booleans - col_sdf = col_sdf.cast('integer') + col_sdf = col_sdf.cast("integer") if num_args == 1: # Only pass in the column if sfun accepts only one arg col_sdf = sfun(col_sdf) @@ -4295,8 +4440,11 @@ def __getitem__(self, key): try: return self.loc[key] except SparkPandasIndexingError: - raise KeyError("Key length ({}) exceeds index depth ({})" - .format(len(key), len(self._internal.index_map))) + raise KeyError( + "Key length ({}) exceeds index depth ({})".format( + len(key), len(self._internal.index_map) + ) + ) def __getattr__(self, item: str_type) -> Any: if item.startswith("__"): @@ -4332,8 +4480,9 @@ def __repr__(self): if match is not None: length = match.group("length") name = str(self.dtype.name) - footer = ("\nName: {name}, dtype: {dtype}\nShowing only the first {length}" - .format(length=length, name=self.name, dtype=pprint_thing(name))) + footer = "\nName: {name}, dtype: {dtype}\nShowing only the first {length}".format( + length=length, name=self.name, dtype=pprint_thing(name) + ) return rest + footer return pser.to_string(name=self.name, dtype=self.dtype) @@ -4341,13 +4490,13 @@ def __dir__(self): if not isinstance(self.spark_type, StructType): fields = [] else: - fields = [f for f in self.spark_type.fieldNames() if ' ' not in f] + fields = [f for f in self.spark_type.fieldNames() if " " not in f] return super(Series, self).__dir__() + fields def __iter__(self): return _MissingPandasLikeSeries.__iter__(self) - def _equals(self, other: 'Series') -> bool: + def _equals(self, other: "Series") -> bool: return self._scol._jc.equals(other._scol._jc) diff --git a/databricks/koalas/sql.py b/databricks/koalas/sql.py index 46c9a03..40b7429 100644 --- a/databricks/koalas/sql.py +++ b/databricks/koalas/sql.py @@ -167,6 +167,7 @@ def _get_ipython_scope(): """ try: from IPython import get_ipython + shell = get_ipython() return shell.user_ns except Exception as e: @@ -177,11 +178,11 @@ def _get_ipython_scope(): # Originally from pymysql package _escape_table = [chr(x) for x in range(128)] -_escape_table[0] = u'\\0' -_escape_table[ord('\\')] = u'\\\\' 
-_escape_table[ord('\n')] = u'\\n' -_escape_table[ord('\r')] = u'\\r' -_escape_table[ord('\032')] = u'\\Z' +_escape_table[0] = u"\\0" +_escape_table[ord("\\")] = u"\\\\" +_escape_table[ord("\n")] = u"\\n" +_escape_table[ord("\r")] = u"\\r" +_escape_table[ord("\032")] = u"\\Z" _escape_table[ord('"')] = u'\\"' _escape_table[ord("'")] = u"\\'" @@ -202,7 +203,6 @@ def escape_sql_string(value: str) -> str: class SQLProcessor(object): - def __init__(self, scope: Dict[str, Any], statement: str, session: SparkSession): self._scope = scope self._statement = statement @@ -266,8 +266,10 @@ def _convert(self, key) -> Any: return self._cached_vars[key] # Analyze: if key not in self._scope: - raise ValueError("The key {} in the SQL statement was not found in global," - " local or parameters variables".format(key)) + raise ValueError( + "The key {} in the SQL statement was not found in global," + " local or parameters variables".format(key) + ) var = self._scope[key] fillin = self._convert_var(var) self._cached_vars[key] = fillin diff --git a/databricks/koalas/strings.py b/databricks/koalas/strings.py index 0a536d8..c9ddd91 100644 --- a/databricks/koalas/strings.py +++ b/databricks/koalas/strings.py @@ -31,16 +31,15 @@ class StringMethods(object): """String methods for Koalas Series""" - def __init__(self, series: 'ks.Series'): + + def __init__(self, series: "ks.Series"): if not isinstance(series.spark_type, (StringType, BinaryType, ArrayType)): - raise ValueError( - "Cannot call StringMethods on type {}" - .format(series.spark_type)) + raise ValueError("Cannot call StringMethods on type {}".format(series.spark_type)) self._data = series self.name = self._data.name # Methods - def capitalize(self) -> 'ks.Series': + def capitalize(self) -> "ks.Series": """ Convert Strings in the series to be capitalized. @@ -61,13 +60,11 @@ def capitalize(self) -> 'ks.Series': 3 Swapcase Name: 0, dtype: object """ - return _wrap_accessor_pandas( - self, - lambda x: x.str.capitalize(), - StringType() - ).alias(self.name) + return _wrap_accessor_pandas(self, lambda x: x.str.capitalize(), StringType()).alias( + self.name + ) - def title(self) -> 'ks.Series': + def title(self) -> "ks.Series": """ Convert Strings in the series to be titlecase. @@ -88,13 +85,9 @@ def title(self) -> 'ks.Series': 3 Swapcase Name: 0, dtype: object """ - return _wrap_accessor_pandas( - self, - lambda x: x.str.title(), - StringType() - ).alias(self.name) + return _wrap_accessor_pandas(self, lambda x: x.str.title(), StringType()).alias(self.name) - def lower(self) -> 'ks.Series': + def lower(self) -> "ks.Series": """ Convert strings in the Series/Index to all lowercase. @@ -115,13 +108,9 @@ def lower(self) -> 'ks.Series': 3 swapcase Name: 0, dtype: object """ - return _wrap_accessor_pandas( - self, - lambda x: x.str.lower(), - StringType() - ).alias(self.name) + return _wrap_accessor_pandas(self, lambda x: x.str.lower(), StringType()).alias(self.name) - def upper(self) -> 'ks.Series': + def upper(self) -> "ks.Series": """ Convert strings in the Series/Index to all uppercase. @@ -142,13 +131,9 @@ def upper(self) -> 'ks.Series': 3 SWAPCASE Name: 0, dtype: object """ - return _wrap_accessor_pandas( - self, - lambda x: x.str.upper(), - StringType() - ).alias(self.name) + return _wrap_accessor_pandas(self, lambda x: x.str.upper(), StringType()).alias(self.name) - def swapcase(self) -> 'ks.Series': + def swapcase(self) -> "ks.Series": """ Convert strings in the Series/Index to be swapcased. 
@@ -169,13 +154,11 @@ def swapcase(self) -> 'ks.Series': 3 sWaPcAsE Name: 0, dtype: object """ - return _wrap_accessor_pandas( - self, - lambda x: x.str.swapcase(), - StringType() - ).alias(self.name) + return _wrap_accessor_pandas(self, lambda x: x.str.swapcase(), StringType()).alias( + self.name + ) - def startswith(self, pattern, na=None) -> 'ks.Series': + def startswith(self, pattern, na=None) -> "ks.Series": """ Test if the start of each string element matches a pattern. @@ -221,12 +204,10 @@ def startswith(self, pattern, na=None) -> 'ks.Series': Name: 0, dtype: bool """ return _wrap_accessor_pandas( - self, - lambda x: x.str.startswith(pattern, na), - BooleanType() + self, lambda x: x.str.startswith(pattern, na), BooleanType() ).alias(self.name) - def endswith(self, pattern, na=None) -> 'ks.Series': + def endswith(self, pattern, na=None) -> "ks.Series": """ Test if the end of each string element matches a pattern. @@ -272,12 +253,10 @@ def endswith(self, pattern, na=None) -> 'ks.Series': Name: 0, dtype: bool """ return _wrap_accessor_pandas( - self, - lambda x: x.str.endswith(pattern, na), - BooleanType() + self, lambda x: x.str.endswith(pattern, na), BooleanType() ).alias(self.name) - def strip(self, to_strip=None) -> 'ks.Series': + def strip(self, to_strip=None) -> "ks.Series": """ Remove leading and trailing characters. @@ -323,13 +302,11 @@ def strip(self, to_strip=None) -> 'ks.Series': 2 None Name: 0, dtype: object """ - return _wrap_accessor_pandas( - self, - lambda x: x.str.strip(to_strip), - StringType() - ).alias(self.name) + return _wrap_accessor_pandas(self, lambda x: x.str.strip(to_strip), StringType()).alias( + self.name + ) - def lstrip(self, to_strip=None) -> 'ks.Series': + def lstrip(self, to_strip=None) -> "ks.Series": """ Remove leading characters. @@ -363,13 +340,11 @@ def lstrip(self, to_strip=None) -> 'ks.Series': 2 None Name: 0, dtype: object """ - return _wrap_accessor_pandas( - self, - lambda x: x.str.lstrip(to_strip), - StringType() - ).alias(self.name) + return _wrap_accessor_pandas(self, lambda x: x.str.lstrip(to_strip), StringType()).alias( + self.name + ) - def rstrip(self, to_strip=None) -> 'ks.Series': + def rstrip(self, to_strip=None) -> "ks.Series": """ Remove trailing characters. @@ -403,13 +378,11 @@ def rstrip(self, to_strip=None) -> 'ks.Series': 2 None Name: 0, dtype: object """ - return _wrap_accessor_pandas( - self, - lambda x: x.str.rstrip(to_strip), - StringType() - ).alias(self.name) + return _wrap_accessor_pandas(self, lambda x: x.str.rstrip(to_strip), StringType()).alias( + self.name + ) - def get(self, i) -> 'ks.Series': + def get(self, i) -> "ks.Series": """ Extract element from each string or string list/tuple in the Series at the specified position. @@ -457,13 +430,9 @@ def get(self, i) -> 'ks.Series': 1 None Name: 0, dtype: object """ - return _wrap_accessor_pandas( - self, - lambda x: x.str.get(i), - StringType() - ).alias(self.name) + return _wrap_accessor_pandas(self, lambda x: x.str.get(i), StringType()).alias(self.name) - def isalnum(self) -> 'ks.Series': + def isalnum(self) -> "ks.Series": """ Check whether all characters in each string are alphanumeric. 
@@ -493,13 +462,11 @@ def isalnum(self) -> 'ks.Series': 2 False Name: 0, dtype: bool """ - return _wrap_accessor_pandas( - self, - lambda x: x.str.isalnum(), - BooleanType() - ).alias(self.name) + return _wrap_accessor_pandas(self, lambda x: x.str.isalnum(), BooleanType()).alias( + self.name + ) - def isalpha(self) -> 'ks.Series': + def isalpha(self) -> "ks.Series": """ Check whether all characters in each string are alphabetic. @@ -518,13 +485,11 @@ def isalpha(self) -> 'ks.Series': 3 False Name: 0, dtype: bool """ - return _wrap_accessor_pandas( - self, - lambda x: x.str.isalpha(), - BooleanType() - ).alias(self.name) + return _wrap_accessor_pandas(self, lambda x: x.str.isalpha(), BooleanType()).alias( + self.name + ) - def isdigit(self) -> 'ks.Series': + def isdigit(self) -> "ks.Series": """ Check whether all characters in each string are digits. @@ -568,13 +533,11 @@ def isdigit(self) -> 'ks.Series': 3 False Name: 0, dtype: bool """ - return _wrap_accessor_pandas( - self, - lambda x: x.str.isdigit(), - BooleanType() - ).alias(self.name) + return _wrap_accessor_pandas(self, lambda x: x.str.isdigit(), BooleanType()).alias( + self.name + ) - def isspace(self) -> 'ks.Series': + def isspace(self) -> "ks.Series": """ Check whether all characters in each string are whitespaces. @@ -591,13 +554,11 @@ def isspace(self) -> 'ks.Series': 2 False Name: 0, dtype: bool """ - return _wrap_accessor_pandas( - self, - lambda x: x.str.isspace(), - BooleanType() - ).alias(self.name) + return _wrap_accessor_pandas(self, lambda x: x.str.isspace(), BooleanType()).alias( + self.name + ) - def islower(self) -> 'ks.Series': + def islower(self) -> "ks.Series": """ Check whether all characters in each string are lowercase. @@ -615,13 +576,11 @@ def islower(self) -> 'ks.Series': 3 False Name: 0, dtype: bool """ - return _wrap_accessor_pandas( - self, - lambda x: x.str.islower(), - BooleanType() - ).alias(self.name) + return _wrap_accessor_pandas(self, lambda x: x.str.islower(), BooleanType()).alias( + self.name + ) - def isupper(self) -> 'ks.Series': + def isupper(self) -> "ks.Series": """ Check whether all characters in each string are uppercase. @@ -639,13 +598,11 @@ def isupper(self) -> 'ks.Series': 3 False Name: 0, dtype: bool """ - return _wrap_accessor_pandas( - self, - lambda x: x.str.isupper(), - BooleanType() - ).alias(self.name) + return _wrap_accessor_pandas(self, lambda x: x.str.isupper(), BooleanType()).alias( + self.name + ) - def istitle(self) -> 'ks.Series': + def istitle(self) -> "ks.Series": """ Check whether all characters in each string are titlecase. @@ -669,13 +626,11 @@ def istitle(self) -> 'ks.Series': 3 False Name: 0, dtype: bool """ - return _wrap_accessor_pandas( - self, - lambda x: x.str.istitle(), - BooleanType() - ).alias(self.name) + return _wrap_accessor_pandas(self, lambda x: x.str.istitle(), BooleanType()).alias( + self.name + ) - def isnumeric(self) -> 'ks.Series': + def isnumeric(self) -> "ks.Series": """ Check whether all characters in each string are numeric. @@ -727,13 +682,11 @@ def isnumeric(self) -> 'ks.Series': 3 False Name: 0, dtype: bool """ - return _wrap_accessor_pandas( - self, - lambda x: x.str.isnumeric(), - BooleanType() - ).alias(self.name) + return _wrap_accessor_pandas(self, lambda x: x.str.isnumeric(), BooleanType()).alias( + self.name + ) - def isdecimal(self) -> 'ks.Series': + def isdecimal(self) -> "ks.Series": """ Check whether all characters in each string are decimals. 
@@ -777,19 +730,17 @@ def isdecimal(self) -> 'ks.Series': 3 False Name: 0, dtype: bool """ - return _wrap_accessor_pandas( - self, - lambda x: x.str.isdecimal(), - BooleanType() - ).alias(self.name) + return _wrap_accessor_pandas(self, lambda x: x.str.isdecimal(), BooleanType()).alias( + self.name + ) - def cat(self, others=None, sep=None, na_rep=None, join=None) -> 'ks.Series': + def cat(self, others=None, sep=None, na_rep=None, join=None) -> "ks.Series": """ Not supported. """ raise NotImplementedError() - def center(self, width, fillchar=' ') -> 'ks.Series': + def center(self, width, fillchar=" ") -> "ks.Series": """ Filling left and right side of strings in the Series/Index with an additional character. Equivalent to :func:`str.center`. @@ -820,12 +771,10 @@ def center(self, width, fillchar=' ') -> 'ks.Series': Name: 0, dtype: object """ return _wrap_accessor_pandas( - self, - lambda x: x.str.center(width, fillchar), - StringType() + self, lambda x: x.str.center(width, fillchar), StringType() ).alias(self.name) - def contains(self, pat, case=True, flags=0, na=None, regex=True) -> 'ks.Series': + def contains(self, pat, case=True, flags=0, na=None, regex=True) -> "ks.Series": """ Test if pattern or regex is contained within a string of a Series. @@ -937,12 +886,10 @@ def contains(self, pat, case=True, flags=0, na=None, regex=True) -> 'ks.Series': Name: 0, dtype: bool """ return _wrap_accessor_pandas( - self, - lambda x: x.str.contains(pat, case, flags, na, regex), - BooleanType() + self, lambda x: x.str.contains(pat, case, flags, na, regex), BooleanType() ).alias(self.name) - def count(self, pat, flags=0) -> 'ks.Series': + def count(self, pat, flags=0) -> "ks.Series": """ Count occurrences of pattern in each string of the Series. @@ -986,37 +933,35 @@ def count(self, pat, flags=0) -> 'ks.Series': 5 0 Name: 0, dtype: int32 """ - return _wrap_accessor_pandas( - self, - lambda x: x.str.count(pat, flags), - IntegerType() - ).alias(self.name) + return _wrap_accessor_pandas(self, lambda x: x.str.count(pat, flags), IntegerType()).alias( + self.name + ) - def decode(self, encoding, errors='strict') -> 'ks.Series': + def decode(self, encoding, errors="strict") -> "ks.Series": """ Not supported. """ raise NotImplementedError() - def encode(self, encoding, errors='strict') -> 'ks.Series': + def encode(self, encoding, errors="strict") -> "ks.Series": """ Not supported. """ raise NotImplementedError() - def extract(self, pat, flags=0, expand=True) -> 'ks.Series': + def extract(self, pat, flags=0, expand=True) -> "ks.Series": """ Not supported. """ raise NotImplementedError() - def extractall(self, pat, flags=0) -> 'ks.Series': + def extractall(self, pat, flags=0) -> "ks.Series": """ Not supported. """ raise NotImplementedError() - def find(self, sub, start=0, end=None) -> 'ks.Series': + def find(self, sub, start=0, end=None) -> "ks.Series": """ Return lowest indexes in each strings in the Series where the substring is fully contained between [start:end]. @@ -1066,12 +1011,10 @@ def find(self, sub, start=0, end=None) -> 'ks.Series': Name: 0, dtype: int32 """ return _wrap_accessor_pandas( - self, - lambda x: x.str.find(sub, start, end), - IntegerType() + self, lambda x: x.str.find(sub, start, end), IntegerType() ).alias(self.name) - def findall(self, pat, flags=0) -> 'ks.Series': + def findall(self, pat, flags=0) -> "ks.Series": """ Find all occurrences of pattern or regular expression in the Series. 
@@ -1150,12 +1093,10 @@ def findall(self, pat, flags=0) -> 'ks.Series': Name: 0, dtype: object """ return _wrap_accessor_pandas( - self, - lambda x: x.str.findall(pat, flags), - ArrayType(StringType(), containsNull=True) + self, lambda x: x.str.findall(pat, flags), ArrayType(StringType(), containsNull=True) ).alias(self.name) - def index(self, sub, start=0, end=None) -> 'ks.Series': + def index(self, sub, start=0, end=None) -> "ks.Series": """ Return lowest indexes in each strings where the substring is fully contained between [start:end]. @@ -1193,12 +1134,10 @@ def index(self, sub, start=0, end=None) -> 'ks.Series': >>> s.str.index('a', start=2) # doctest: +SKIP """ return _wrap_accessor_pandas( - self, - lambda x: x.str.index(sub, start, end), - LongType() + self, lambda x: x.str.index(sub, start, end), LongType() ).alias(self.name) - def join(self, sep) -> 'ks.Series': + def join(self, sep) -> "ks.Series": """ Join lists contained as elements in the Series with passed delimiter. @@ -1233,13 +1172,9 @@ def join(self, sep) -> 'ks.Series': 1 None Name: 0, dtype: object """ - return _wrap_accessor_pandas( - self, - lambda x: x.str.join(sep), - StringType() - ).alias(self.name) + return _wrap_accessor_pandas(self, lambda x: x.str.join(sep), StringType()).alias(self.name) - def len(self) -> 'ks.Series': + def len(self) -> "ks.Series": """ Computes the length of each element in the Series. @@ -1268,13 +1203,9 @@ def len(self) -> 'ks.Series': 1 0 Name: 0, dtype: int64 """ - return _wrap_accessor_pandas( - self, - lambda x: x.str.len(), - LongType() - ).alias(self.name) + return _wrap_accessor_pandas(self, lambda x: x.str.len(), LongType()).alias(self.name) - def ljust(self, width, fillchar=' ') -> 'ks.Series': + def ljust(self, width, fillchar=" ") -> "ks.Series": """ Filling right side of strings in the Series with an additional character. Equivalent to :func:`str.ljust`. @@ -1305,12 +1236,10 @@ def ljust(self, width, fillchar=' ') -> 'ks.Series': Name: 0, dtype: object """ return _wrap_accessor_pandas( - self, - lambda x: x.str.ljust(width, fillchar), - StringType() + self, lambda x: x.str.ljust(width, fillchar), StringType() ).alias(self.name) - def match(self, pat, case=True, flags=0, na=np.NaN) -> 'ks.Series': + def match(self, pat, case=True, flags=0, na=np.NaN) -> "ks.Series": """ Determine if each string matches a regular expression. @@ -1371,12 +1300,10 @@ def match(self, pat, case=True, flags=0, na=np.NaN) -> 'ks.Series': Name: 0, dtype: object """ return _wrap_accessor_pandas( - self, - lambda x: x.str.match(pat, case, flags, na), - BooleanType() + self, lambda x: x.str.match(pat, case, flags, na), BooleanType() ).alias(self.name) - def normalize(self, form) -> 'ks.Series': + def normalize(self, form) -> "ks.Series": """ Return the Unicode normal form for the strings in the Series. @@ -1393,13 +1320,11 @@ def normalize(self, form) -> 'ks.Series': Series of objects A Series of normalized strings. """ - return _wrap_accessor_pandas( - self, - lambda x: x.str.normalize(form), - StringType() - ).alias(self.name) + return _wrap_accessor_pandas(self, lambda x: x.str.normalize(form), StringType()).alias( + self.name + ) - def pad(self, width, side='left', fillchar=' ') -> 'ks.Series': + def pad(self, width, side="left", fillchar=" ") -> "ks.Series": """ Pad strings in the Series up to width. 
@@ -1442,18 +1367,16 @@ def pad(self, width, side='left', fillchar=' ') -> 'ks.Series': Name: 0, dtype: object """ return _wrap_accessor_pandas( - self, - lambda x: x.str.pad(width, side, fillchar), - StringType() + self, lambda x: x.str.pad(width, side, fillchar), StringType() ).alias(self.name) - def partition(self, sep=' ', expand=True) -> 'ks.Series': + def partition(self, sep=" ", expand=True) -> "ks.Series": """ Not supported. """ raise NotImplementedError() - def repeat(self, repeats) -> 'ks.Series': + def repeat(self, repeats) -> "ks.Series": """ Duplicate each string in the Series. @@ -1490,12 +1413,10 @@ def repeat(self, repeats) -> 'ks.Series': raise ValueError("repeats expects an int parameter") return _wrap_accessor_pandas( - self, - lambda x: x.str.repeat(repeats=repeats), - StringType() + self, lambda x: x.str.repeat(repeats=repeats), StringType() ).alias(self.name) - def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True) -> 'ks.Series': + def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True) -> "ks.Series": """ Replace occurrences of pattern/regex in the Series with some other string. Equivalent to :func:`str.replace` or :func:`re.sub`. @@ -1585,13 +1506,11 @@ def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True) -> 'ks.Series """ return _wrap_accessor_pandas( self, - lambda x: x.str.replace( - pat, repl, n=n, case=case, flags=flags, regex=regex - ), - StringType() + lambda x: x.str.replace(pat, repl, n=n, case=case, flags=flags, regex=regex), + StringType(), ).alias(self.name) - def rfind(self, sub, start=0, end=None) -> 'ks.Series': + def rfind(self, sub, start=0, end=None) -> "ks.Series": """ Return highest indexes in each strings in the Series where the substring is fully contained between [start:end]. @@ -1641,12 +1560,10 @@ def rfind(self, sub, start=0, end=None) -> 'ks.Series': Name: 0, dtype: int32 """ return _wrap_accessor_pandas( - self, - lambda x: x.str.rfind(sub, start, end), - IntegerType() + self, lambda x: x.str.rfind(sub, start, end), IntegerType() ).alias(self.name) - def rindex(self, sub, start=0, end=None) -> 'ks.Series': + def rindex(self, sub, start=0, end=None) -> "ks.Series": """ Return highest indexes in each strings where the substring is fully contained between [start:end]. @@ -1684,12 +1601,10 @@ def rindex(self, sub, start=0, end=None) -> 'ks.Series': >>> s.str.rindex('a', start=2) # doctest: +SKIP """ return _wrap_accessor_pandas( - self, - lambda x: x.str.rindex(sub, start, end), - LongType() + self, lambda x: x.str.rindex(sub, start, end), LongType() ).alias(self.name) - def rjust(self, width, fillchar=' ') -> 'ks.Series': + def rjust(self, width, fillchar=" ") -> "ks.Series": """ Filling left side of strings in the Series with an additional character. Equivalent to :func:`str.rjust`. @@ -1725,18 +1640,16 @@ def rjust(self, width, fillchar=' ') -> 'ks.Series': Name: 0, dtype: object """ return _wrap_accessor_pandas( - self, - lambda x: x.str.rjust(width, fillchar), - StringType() + self, lambda x: x.str.rjust(width, fillchar), StringType() ).alias(self.name) - def rpartition(self, sep=' ', expand=True) -> 'ks.Series': + def rpartition(self, sep=" ", expand=True) -> "ks.Series": """ Not supported. """ raise NotImplementedError() - def slice(self, start=None, stop=None, step=None) -> 'ks.Series': + def slice(self, start=None, stop=None, step=None) -> "ks.Series": """ Slice substrings from each element in the Series. 
@@ -1788,12 +1701,10 @@ def slice(self, start=None, stop=None, step=None) -> 'ks.Series': Name: 0, dtype: object """ return _wrap_accessor_pandas( - self, - lambda x: x.str.slice(start, stop, step), - StringType() + self, lambda x: x.str.slice(start, stop, step), StringType() ).alias(self.name) - def slice_replace(self, start=None, stop=None, repl=None) -> 'ks.Series': + def slice_replace(self, start=None, stop=None, repl=None) -> "ks.Series": """ Slice substrings from each element in the Series. @@ -1862,12 +1773,10 @@ def slice_replace(self, start=None, stop=None, repl=None) -> 'ks.Series': Name: 0, dtype: object """ return _wrap_accessor_pandas( - self, - lambda x: x.str.slice_replace(start, stop, repl), - StringType() + self, lambda x: x.str.slice_replace(start, stop, repl), StringType() ).alias(self.name) - def split(self, pat=None, n=-1, expand=False) -> 'ks.Series': + def split(self, pat=None, n=-1, expand=False) -> "ks.Series": """ Split strings around given separator/delimiter. @@ -1939,12 +1848,10 @@ def split(self, pat=None, n=-1, expand=False) -> 'ks.Series': raise NotImplementedError("expand=True is currently not supported.") return _wrap_accessor_pandas( - self, - lambda x: x.str.split(pat, n, expand), - ArrayType(StringType(), containsNull=True) + self, lambda x: x.str.split(pat, n, expand), ArrayType(StringType(), containsNull=True) ).alias(self.name) - def rsplit(self, pat=None, n=-1, expand=False) -> 'ks.Series': + def rsplit(self, pat=None, n=-1, expand=False) -> "ks.Series": """ Split strings around given separator/delimiter. @@ -2008,12 +1915,10 @@ def rsplit(self, pat=None, n=-1, expand=False) -> 'ks.Series': raise NotImplementedError("expand=True is currently not supported.") return _wrap_accessor_pandas( - self, - lambda x: x.str.rsplit(pat, n, expand), - ArrayType(StringType(), containsNull=True) + self, lambda x: x.str.rsplit(pat, n, expand), ArrayType(StringType(), containsNull=True) ).alias(self.name) - def translate(self, table) -> 'ks.Series': + def translate(self, table) -> "ks.Series": """ Map all characters in the string through the given mapping table. Equivalent to standard :func:`str.translate`. @@ -2041,13 +1946,11 @@ def translate(self, table) -> 'ks.Series': 2 bYrd Name: 0, dtype: object """ - return _wrap_accessor_pandas( - self, - lambda x: x.str.translate(table), - StringType() - ).alias(self.name) + return _wrap_accessor_pandas(self, lambda x: x.str.translate(table), StringType()).alias( + self.name + ) - def wrap(self, width, **kwargs) -> 'ks.Series': + def wrap(self, width, **kwargs) -> "ks.Series": """ Wrap long strings in the Series to be formatted in paragraphs with length less than a given width. @@ -2093,12 +1996,10 @@ def wrap(self, width, **kwargs) -> 'ks.Series': Name: 0, dtype: object """ return _wrap_accessor_pandas( - self, - lambda x: x.str.wrap(width, **kwargs), - StringType() + self, lambda x: x.str.wrap(width, **kwargs), StringType() ).alias(self.name) - def zfill(self, width) -> 'ks.Series': + def zfill(self, width) -> "ks.Series": """ Pad strings in the Series by prepending ‘0’ characters. @@ -2142,13 +2043,11 @@ def zfill(self, width) -> 'ks.Series': 3 None Name: 0, dtype: object """ - return _wrap_accessor_pandas( - self, - lambda x: x.str.zfill(width), - StringType() - ).alias(self.name) + return _wrap_accessor_pandas(self, lambda x: x.str.zfill(width), StringType()).alias( + self.name + ) - def get_dummies(self, sep='|'): + def get_dummies(self, sep="|"): """ Not supported. 
""" diff --git a/databricks/koalas/testing/utils.py b/databricks/koalas/testing/utils.py index 3cec82a..11a4fed 100644 --- a/databricks/koalas/testing/utils.py +++ b/databricks/koalas/testing/utils.py @@ -118,11 +118,10 @@ def function(self, *functions): class ReusedSQLTestCase(unittest.TestCase, SQLTestUtils): - @classmethod def setUpClass(cls): cls.spark = default_session() - cls.spark.conf.set('spark.sql.execution.arrow.enabled', True) + cls.spark.conf.set("spark.sql.execution.arrow.enabled", True) @classmethod def tearDownClass(cls): @@ -133,19 +132,25 @@ def tearDownClass(cls): def assertPandasEqual(self, left, right): if isinstance(left, pd.DataFrame) and isinstance(right, pd.DataFrame): - msg = ("DataFrames are not equal: " + - "\n\nLeft:\n%s\n%s" % (left, left.dtypes) + - "\n\nRight:\n%s\n%s" % (right, right.dtypes)) + msg = ( + "DataFrames are not equal: " + + "\n\nLeft:\n%s\n%s" % (left, left.dtypes) + + "\n\nRight:\n%s\n%s" % (right, right.dtypes) + ) self.assertTrue(left.equals(right), msg=msg) elif isinstance(left, pd.Series) and isinstance(right, pd.Series): - msg = ("Series are not equal: " + - "\n\nLeft:\n%s\n%s" % (left, left.dtype) + - "\n\nRight:\n%s\n%s" % (right, right.dtype)) + msg = ( + "Series are not equal: " + + "\n\nLeft:\n%s\n%s" % (left, left.dtype) + + "\n\nRight:\n%s\n%s" % (right, right.dtype) + ) self.assertTrue((left == right).all(), msg=msg) elif isinstance(left, pd.Index) and isinstance(right, pd.Index): - msg = ("Indices are not equal: " + - "\n\nLeft:\n%s\n%s" % (left, left.dtype) + - "\n\nRight:\n%s\n%s" % (right, right.dtype)) + msg = ( + "Indices are not equal: " + + "\n\nLeft:\n%s\n%s" % (left, left.dtype) + + "\n\nRight:\n%s\n%s" % (right, right.dtype) + ) self.assertTrue((left == right).all(), msg=msg) else: raise ValueError("Unexpected values: (%s, %s)" % (left, right)) @@ -159,9 +164,11 @@ def assertPandasAlmostEqual(self, left, right): dropping missing values (NaN, NaT, None) """ if isinstance(left, pd.DataFrame) and isinstance(right, pd.DataFrame): - msg = ("DataFrames are not almost equal: " + - "\n\nLeft:\n%s\n%s" % (left, left.dtypes) + - "\n\nRight:\n%s\n%s" % (right, right.dtypes)) + msg = ( + "DataFrames are not almost equal: " + + "\n\nLeft:\n%s\n%s" % (left, left.dtypes) + + "\n\nRight:\n%s\n%s" % (right, right.dtypes) + ) self.assertEqual(left.shape, right.shape, msg=msg) for lcol, rcol in zip(left.columns, right.columns): self.assertEqual(name_like_string(lcol), name_like_string(rcol), msg=msg) @@ -170,18 +177,22 @@ def assertPandasAlmostEqual(self, left, right): for lval, rval in zip(left[lcol].dropna(), right[rcol].dropna()): self.assertAlmostEqual(lval, rval, msg=msg) elif isinstance(left, pd.Series) and isinstance(left, pd.Series): - msg = ("Series are not almost equal: " + - "\n\nLeft:\n%s\n%s" % (left, left.dtype) + - "\n\nRight:\n%s\n%s" % (right, right.dtype)) + msg = ( + "Series are not almost equal: " + + "\n\nLeft:\n%s\n%s" % (left, left.dtype) + + "\n\nRight:\n%s\n%s" % (right, right.dtype) + ) self.assertEqual(len(left), len(right), msg=msg) for lnull, rnull in zip(left.isnull(), right.isnull()): self.assertEqual(lnull, rnull, msg=msg) for lval, rval in zip(left.dropna(), right.dropna()): self.assertAlmostEqual(lval, rval, msg=msg) elif isinstance(left, pd.Index) and isinstance(left, pd.Index): - msg = ("Indices are not almost equal: " + - "\n\nLeft:\n%s\n%s" % (left, left.dtype) + - "\n\nRight:\n%s\n%s" % (right, right.dtype)) + msg = ( + "Indices are not almost equal: " + + "\n\nLeft:\n%s\n%s" % (left, left.dtype) + + 
"\n\nRight:\n%s\n%s" % (right, right.dtype) + ) self.assertEqual(len(left), len(right), msg=msg) for lnull, rnull in zip(left.isnull(), right.isnull()): self.assertEqual(lnull, rnull, msg=msg) @@ -229,7 +240,6 @@ def _to_pandas(df): class TestUtils(object): - @contextmanager def temp_dir(self): tmp = tempfile.mkdtemp() @@ -245,7 +255,6 @@ def temp_file(self): class ComparisonTestBase(ReusedSQLTestCase): - @property def kdf(self): return ks.from_pandas(self.pdf) diff --git a/databricks/koalas/tests/test_config.py b/databricks/koalas/tests/test_config.py index 00c6561..5777443 100644 --- a/databricks/koalas/tests/test_config.py +++ b/databricks/koalas/tests/test_config.py @@ -21,80 +21,86 @@ class ConfigTest(ReusedSQLTestCase): - def setUp(self): - config._options_dict['test.config'] = Option(key='test.config', doc="", default="default") - - config._options_dict['test.config.list'] = Option( - key='test.config.list', doc="", default=[], types=list) - config._options_dict['test.config.float'] = Option( - key='test.config.float', doc="", default=1.2, types=float) - - config._options_dict['test.config.int'] = Option( - key='test.config.int', doc="", default=1, - types=int, check_func=(lambda v: v > 0, "bigger then 0")) - config._options_dict['test.config.int.none'] = Option( - key='test.config.int', doc="", default=None, types=(int, type(None))) + config._options_dict["test.config"] = Option(key="test.config", doc="", default="default") + + config._options_dict["test.config.list"] = Option( + key="test.config.list", doc="", default=[], types=list + ) + config._options_dict["test.config.float"] = Option( + key="test.config.float", doc="", default=1.2, types=float + ) + + config._options_dict["test.config.int"] = Option( + key="test.config.int", + doc="", + default=1, + types=int, + check_func=(lambda v: v > 0, "bigger then 0"), + ) + config._options_dict["test.config.int.none"] = Option( + key="test.config.int", doc="", default=None, types=(int, type(None)) + ) def tearDown(self): - ks.reset_option('test.config') - del config._options_dict['test.config'] - del config._options_dict['test.config.list'] - del config._options_dict['test.config.float'] - del config._options_dict['test.config.int'] - del config._options_dict['test.config.int.none'] + ks.reset_option("test.config") + del config._options_dict["test.config"] + del config._options_dict["test.config.list"] + del config._options_dict["test.config.float"] + del config._options_dict["test.config.int"] + del config._options_dict["test.config.int.none"] def test_get_set_reset_option(self): - self.assertEqual(ks.get_option('test.config'), 'default') + self.assertEqual(ks.get_option("test.config"), "default") - ks.set_option('test.config', 'value') - self.assertEqual(ks.get_option('test.config'), 'value') + ks.set_option("test.config", "value") + self.assertEqual(ks.get_option("test.config"), "value") - ks.reset_option('test.config') - self.assertEqual(ks.get_option('test.config'), 'default') + ks.reset_option("test.config") + self.assertEqual(ks.get_option("test.config"), "default") def test_get_set_reset_option_different_types(self): - ks.set_option('test.config.list', [1, 2, 3, 4]) - self.assertEqual(ks.get_option('test.config.list'), [1, 2, 3, 4]) + ks.set_option("test.config.list", [1, 2, 3, 4]) + self.assertEqual(ks.get_option("test.config.list"), [1, 2, 3, 4]) - ks.set_option('test.config.float', 5.0) - self.assertEqual(ks.get_option('test.config.float'), 5.0) + ks.set_option("test.config.float", 5.0) + 
self.assertEqual(ks.get_option("test.config.float"), 5.0) - ks.set_option('test.config.int', 123) - self.assertEqual(ks.get_option('test.config.int'), 123) + ks.set_option("test.config.int", 123) + self.assertEqual(ks.get_option("test.config.int"), 123) - self.assertEqual(ks.get_option('test.config.int.none'), None) # default None - ks.set_option('test.config.int.none', 123) - self.assertEqual(ks.get_option('test.config.int.none'), 123) - ks.set_option('test.config.int.none', None) - self.assertEqual(ks.get_option('test.config.int.none'), None) + self.assertEqual(ks.get_option("test.config.int.none"), None) # default None + ks.set_option("test.config.int.none", 123) + self.assertEqual(ks.get_option("test.config.int.none"), 123) + ks.set_option("test.config.int.none", None) + self.assertEqual(ks.get_option("test.config.int.none"), None) def test_different_types(self): with self.assertRaisesRegex(ValueError, "was "): - ks.set_option('test.config.list', 1) + ks.set_option("test.config.list", 1) with self.assertRaisesRegex(ValueError, "however, expected types are"): - ks.set_option('test.config.float', 'abc') + ks.set_option("test.config.float", "abc") with self.assertRaisesRegex(ValueError, "[]"): - ks.set_option('test.config.int', 'abc') + ks.set_option("test.config.int", "abc") with self.assertRaisesRegex(ValueError, "(, )"): - ks.set_option('test.config.int.none', 'abc') + ks.set_option("test.config.int.none", "abc") def test_check_func(self): with self.assertRaisesRegex(ValueError, "bigger then 0"): - ks.set_option('test.config.int', -1) + ks.set_option("test.config.int", -1) def test_unknown_option(self): - with self.assertRaisesRegex(config.OptionError, 'No such option'): - ks.get_option('unknown') + with self.assertRaisesRegex(config.OptionError, "No such option"): + ks.get_option("unknown") with self.assertRaisesRegex(config.OptionError, "Available options"): - ks.set_option('unknown', 'value') + ks.set_option("unknown", "value") with self.assertRaisesRegex(config.OptionError, "test.config"): - ks.reset_option('unknown') + ks.reset_option("unknown") def test_namespace_access(self): try: @@ -110,9 +116,11 @@ def test_namespace_access(self): self.assertRaisesRegex(config.OptionError, "No such option", lambda: ks.options.compu) self.assertRaisesRegex( - config.OptionError, "No such option", lambda: ks.options.compute.max) + config.OptionError, "No such option", lambda: ks.options.compute.max + ) self.assertRaisesRegex( - config.OptionError, "No such option", lambda: ks.options.max_rows1) + config.OptionError, "No such option", lambda: ks.options.max_rows1 + ) with self.assertRaisesRegex(config.OptionError, "No such option"): ks.options.compute.max = 0 diff --git a/databricks/koalas/tests/test_csv.py b/databricks/koalas/tests/test_csv.py index b5a717b..4d9a774 100644 --- a/databricks/koalas/tests/test_csv.py +++ b/databricks/koalas/tests/test_csv.py @@ -26,11 +26,10 @@ def normalize_text(s): - return '\n'.join(map(str.strip, s.strip().split('\n'))) + return "\n".join(map(str.strip, s.strip().split("\n"))) class CsvTest(ReusedSQLTestCase, TestUtils): - def setUp(self): self.tmp_dir = tempfile.mkdtemp(prefix=CsvTest.__name__) @@ -55,7 +54,8 @@ def csv_text(self): Frank,200 Alice,300 Edith,600 - """) + """ + ) @property def csv_text_2(self): @@ -66,7 +66,8 @@ def csv_text_2(self): item2,1,2 item3,1,2,3,4 item4,1 - """) + """ + ) @property def csv_text_with_comments(self): @@ -78,7 +79,9 @@ def csv_text_with_comments(self): Alice,400 Edith,600 # footer - """ % self.csv_text) + """ + % 
self.csv_text + ) @property def tab_delimited_csv_text(self): @@ -88,7 +91,8 @@ def tab_delimited_csv_text(self): Alice\t100 Bob\t-200 Charlie\t300 - """) + """ + ) @property def q_quoted_csv_text(self): @@ -98,7 +102,8 @@ def q_quoted_csv_text(self): QA,liceQ,Q100Q QB,obQ,Q-200Q QC,harlieQ,Q300Q - """) + """ + ) @property def e_escapeted_csv_text(self): @@ -108,19 +113,20 @@ def e_escapeted_csv_text(self): "AE"lice",100 "BE"ob",-200 "CE"harlie",300 - """) + """ + ) @contextmanager def csv_file(self, csv): with self.temp_file() as tmp: - with open(tmp, 'w') as f: + with open(tmp, "w") as f: f.write(csv) yield tmp def test_read_csv(self): with self.csv_file(self.csv_text) as fn: - def check(header='infer', names=None, usecols=None): + def check(header="infer", names=None, usecols=None): expected = pd.read_csv(fn, header=header, names=names, usecols=usecols) actual = ks.read_csv(fn, header=header, names=names, usecols=usecols) self.assertPandasAlmostEqual(expected, actual.toPandas()) @@ -128,129 +134,157 @@ def check(header='infer', names=None, usecols=None): check() check(header=None) check(header=0) - check(names=['n', 'a']) - check(header=0, names=['n', 'a']) + check(names=["n", "a"]) + check(header=0, names=["n", "a"]) check(usecols=[1]) check(usecols=[1, 0]) - check(usecols=['amount']) - check(usecols=['amount', 'name']) + check(usecols=["amount"]) + check(usecols=["amount", "name"]) check(usecols=[]) check(usecols=[1, 1]) - check(usecols=['amount', 'amount']) - check(names=['n', 'a'], usecols=['a']) + check(usecols=["amount", "amount"]) + check(names=["n", "a"], usecols=["a"]) # check with pyspark patch. expected = pd.read_csv(fn) actual = ks.read_csv(fn) self.assertPandasAlmostEqual(expected, actual.toPandas()) - self.assertRaisesRegex(ValueError, 'non-unique', - lambda: ks.read_csv(fn, names=['n', 'n'])) - self.assertRaisesRegex(ValueError, 'does not match the number.*3', - lambda: ks.read_csv(fn, names=['n', 'a', 'b'])) - self.assertRaisesRegex(ValueError, 'does not match the number.*3', - lambda: ks.read_csv(fn, header=0, names=['n', 'a', 'b'])) - self.assertRaisesRegex(ValueError, 'Usecols do not match.*3', - lambda: ks.read_csv(fn, usecols=[1, 3])) - self.assertRaisesRegex(ValueError, 'Usecols do not match.*col', - lambda: ks.read_csv(fn, usecols=['amount', 'col'])) - self.assertRaisesRegex(ValueError, 'Unknown header argument 1', - lambda: ks.read_csv(fn, header='1')) - expected_error_message = ("'usecols' must either be list-like of all strings, " - "all unicode, all integers or a callable.") - self.assertRaisesRegex(ValueError, expected_error_message, - lambda: ks.read_csv(fn, usecols=[1, 'amount'])) + self.assertRaisesRegex( + ValueError, "non-unique", lambda: ks.read_csv(fn, names=["n", "n"]) + ) + self.assertRaisesRegex( + ValueError, + "does not match the number.*3", + lambda: ks.read_csv(fn, names=["n", "a", "b"]), + ) + self.assertRaisesRegex( + ValueError, + "does not match the number.*3", + lambda: ks.read_csv(fn, header=0, names=["n", "a", "b"]), + ) + self.assertRaisesRegex( + ValueError, "Usecols do not match.*3", lambda: ks.read_csv(fn, usecols=[1, 3]) + ) + self.assertRaisesRegex( + ValueError, + "Usecols do not match.*col", + lambda: ks.read_csv(fn, usecols=["amount", "col"]), + ) + self.assertRaisesRegex( + ValueError, "Unknown header argument 1", lambda: ks.read_csv(fn, header="1") + ) + expected_error_message = ( + "'usecols' must either be list-like of all strings, " + "all unicode, all integers or a callable." 
+ ) + self.assertRaisesRegex( + ValueError, expected_error_message, lambda: ks.read_csv(fn, usecols=[1, "amount"]) + ) # check with index_col - expected = pd.read_csv(fn).set_index('name') - actual = ks.read_csv(fn, index_col='name') + expected = pd.read_csv(fn).set_index("name") + actual = ks.read_csv(fn, index_col="name") self.assertPandasAlmostEqual(expected, actual.toPandas()) def test_read_with_spark_schema(self): with self.csv_file(self.csv_text_2) as fn: actual = ks.read_csv(fn, names="A string, B string, C long, D long, E long") - expected = pd.read_csv(fn, names=['A', 'B', 'C', 'D', 'E']) + expected = pd.read_csv(fn, names=["A", "B", "C", "D", "E"]) self.assertEqual(repr(expected), repr(actual)) def test_read_csv_with_comment(self): with self.csv_file(self.csv_text_with_comments) as fn: - expected = pd.read_csv(fn, comment='#') - actual = ks.read_csv(fn, comment='#') + expected = pd.read_csv(fn, comment="#") + actual = ks.read_csv(fn, comment="#") self.assertPandasAlmostEqual(expected, actual.toPandas()) - self.assertRaisesRegex(ValueError, 'Only length-1 comment characters supported', - lambda: ks.read_csv(fn, comment='').show()) - self.assertRaisesRegex(ValueError, 'Only length-1 comment characters supported', - lambda: ks.read_csv(fn, comment='##').show()) - self.assertRaisesRegex(ValueError, 'Only length-1 comment characters supported', - lambda: ks.read_csv(fn, comment=1)) - self.assertRaisesRegex(ValueError, 'Only length-1 comment characters supported', - lambda: ks.read_csv(fn, comment=[1])) + self.assertRaisesRegex( + ValueError, + "Only length-1 comment characters supported", + lambda: ks.read_csv(fn, comment="").show(), + ) + self.assertRaisesRegex( + ValueError, + "Only length-1 comment characters supported", + lambda: ks.read_csv(fn, comment="##").show(), + ) + self.assertRaisesRegex( + ValueError, + "Only length-1 comment characters supported", + lambda: ks.read_csv(fn, comment=1), + ) + self.assertRaisesRegex( + ValueError, + "Only length-1 comment characters supported", + lambda: ks.read_csv(fn, comment=[1]), + ) def test_read_csv_with_sep(self): with self.csv_file(self.tab_delimited_csv_text) as fn: - expected = pd.read_csv(fn, sep='\t') - actual = ks.read_csv(fn, sep='\t') + expected = pd.read_csv(fn, sep="\t") + actual = ks.read_csv(fn, sep="\t") self.assertPandasAlmostEqual(expected, actual.toPandas()) def test_read_csv_with_squeeze(self): with self.csv_file(self.csv_text) as fn: - expected = pd.read_csv(fn, squeeze=True, usecols=['name']) - actual = ks.read_csv(fn, squeeze=True, usecols=['name']) + expected = pd.read_csv(fn, squeeze=True, usecols=["name"]) + actual = ks.read_csv(fn, squeeze=True, usecols=["name"]) self.assertPandasAlmostEqual(expected, actual.toPandas()) - expected = pd.read_csv(fn, squeeze=True, usecols=['name', 'amount']) - actual = ks.read_csv(fn, squeeze=True, usecols=['name', 'amount']) + expected = pd.read_csv(fn, squeeze=True, usecols=["name", "amount"]) + actual = ks.read_csv(fn, squeeze=True, usecols=["name", "amount"]) self.assertPandasAlmostEqual(expected, actual.toPandas()) def test_read_csv_with_mangle_dupe_cols(self): - self.assertRaisesRegex(ValueError, 'mangle_dupe_cols', - lambda: ks.read_csv('path', mangle_dupe_cols=False)) + self.assertRaisesRegex( + ValueError, "mangle_dupe_cols", lambda: ks.read_csv("path", mangle_dupe_cols=False) + ) def test_read_csv_with_parse_dates(self): - self.assertRaisesRegex(ValueError, 'parse_dates', - lambda: ks.read_csv('path', parse_dates=True)) + self.assertRaisesRegex( + ValueError, 
"parse_dates", lambda: ks.read_csv("path", parse_dates=True) + ) def test_read_csv_with_dtype(self): with self.csv_file(self.csv_text) as fn: self.assert_eq(ks.read_csv(fn), pd.read_csv(fn), almost=True) self.assert_eq(ks.read_csv(fn, dtype=str), pd.read_csv(fn, dtype=str)) - self.assert_eq(ks.read_csv(fn, dtype={'amount': 'int64'}), - pd.read_csv(fn, dtype={'amount': 'int64'})) + self.assert_eq( + ks.read_csv(fn, dtype={"amount": "int64"}), + pd.read_csv(fn, dtype={"amount": "int64"}), + ) def test_read_csv_with_quotechar(self): with self.csv_file(self.q_quoted_csv_text) as fn: - self.assert_eq(ks.read_csv(fn, quotechar='Q'), - pd.read_csv(fn, quotechar='Q'), almost=True) + self.assert_eq( + ks.read_csv(fn, quotechar="Q"), pd.read_csv(fn, quotechar="Q"), almost=True + ) def test_read_csv_with_escapechar(self): with self.csv_file(self.e_escapeted_csv_text) as fn: - self.assert_eq(ks.read_csv(fn, escapechar='E'), - pd.read_csv(fn, escapechar='E'), almost=True) + self.assert_eq( + ks.read_csv(fn, escapechar="E"), pd.read_csv(fn, escapechar="E"), almost=True + ) def test_to_csv(self): - pdf = pd.DataFrame({'aa': [1, 2, 3], 'bb': [4, 5, 6]}, index=[0, 1, 3]) + pdf = pd.DataFrame({"aa": [1, 2, 3], "bb": [4, 5, 6]}, index=[0, 1, 3]) kdf = ks.DataFrame(pdf) self.assert_eq(kdf.to_csv(), pdf.to_csv(index=False)) - self.assert_eq(kdf.to_csv(columns=['aa']), pdf.to_csv(columns=['aa'], index=False)) + self.assert_eq(kdf.to_csv(columns=["aa"]), pdf.to_csv(columns=["aa"], index=False)) self.assert_eq(kdf.aa.to_csv(), pdf.aa.to_csv(index=False, header=True)) - pdf = pd.DataFrame({ - 'a': [1, np.nan, 3], - 'b': ["one", "two", None], - }, index=[0, 1, 3]) + pdf = pd.DataFrame({"a": [1, np.nan, 3], "b": ["one", "two", None],}, index=[0, 1, 3]) kdf = ks.from_pandas(pdf) - self.assert_eq(kdf.to_csv(na_rep='null'), pdf.to_csv(na_rep='null', index=False)) - self.assert_eq(kdf.a.to_csv(na_rep='null'), - pdf.a.to_csv(na_rep='null', index=False, header=True)) + self.assert_eq(kdf.to_csv(na_rep="null"), pdf.to_csv(na_rep="null", index=False)) + self.assert_eq( + kdf.a.to_csv(na_rep="null"), pdf.a.to_csv(na_rep="null", index=False, header=True) + ) - pdf = pd.DataFrame({ - 'a': [1.0, 2.0, 3.0], - 'b': [4.0, 5.0, 6.0], - }, index=[0, 1, 3]) + pdf = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0],}, index=[0, 1, 3]) kdf = ks.from_pandas(pdf) @@ -259,7 +293,7 @@ def test_to_csv(self): self.assert_eq(kdf.to_csv(), pdf.to_csv(index=False)) def test_to_csv_with_path(self): - pdf = pd.DataFrame({'a': [1, 2, 3], 'b': ['a', 'b', 'c']}) + pdf = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) kdf = ks.DataFrame(pdf) kdf.to_csv(self.tmp_dir, num_files=1) @@ -272,11 +306,11 @@ def test_to_csv_with_path(self): self.assertEqual(f.read(), expected) def test_to_csv_with_path_and_basic_options(self): - pdf = pd.DataFrame({'aa': [1, 2, 3], 'bb': ['a', 'b', 'c']}) + pdf = pd.DataFrame({"aa": [1, 2, 3], "bb": ["a", "b", "c"]}) kdf = ks.DataFrame(pdf) - kdf.to_csv(self.tmp_dir, num_files=1, sep='|', header=False, columns=['aa']) - expected = pdf.to_csv(index=False, sep='|', header=False, columns=['aa']) + kdf.to_csv(self.tmp_dir, num_files=1, sep="|", header=False, columns=["aa"]) + expected = pdf.to_csv(index=False, sep="|", header=False, columns=["aa"]) output_paths = [path for path in os.listdir(self.tmp_dir) if path.startswith("part-")] assert len(output_paths) > 0 @@ -285,15 +319,15 @@ def test_to_csv_with_path_and_basic_options(self): self.assertEqual(f.read(), expected) def 
test_to_csv_with_path_and_basic_options_multiindex_columns(self): - pdf = pd.DataFrame({('x', 'a'): [1, 2, 3], ('y', 'b'): ['a', 'b', 'c']}) + pdf = pd.DataFrame({("x", "a"): [1, 2, 3], ("y", "b"): ["a", "b", "c"]}) kdf = ks.DataFrame(pdf) with self.assertRaises(ValueError): - kdf.to_csv(self.tmp_dir, num_files=1, sep='|', columns=[('x', 'a')]) + kdf.to_csv(self.tmp_dir, num_files=1, sep="|", columns=[("x", "a")]) - kdf.to_csv(self.tmp_dir, num_files=1, sep='|', header=['a'], columns=[('x', 'a')]) - pdf.columns = ['a', 'b'] - expected = pdf.to_csv(index=False, sep='|', columns=['a']) + kdf.to_csv(self.tmp_dir, num_files=1, sep="|", header=["a"], columns=[("x", "a")]) + pdf.columns = ["a", "b"] + expected = pdf.to_csv(index=False, sep="|", columns=["a"]) output_paths = [path for path in os.listdir(self.tmp_dir) if path.startswith("part-")] assert len(output_paths) > 0 @@ -302,11 +336,11 @@ def test_to_csv_with_path_and_basic_options_multiindex_columns(self): self.assertEqual(f.read(), expected) def test_to_csv_with_path_and_pyspark_options(self): - pdf = pd.DataFrame({'a': [1, 2, 3, None], 'b': ['a', 'b', 'c', None]}) + pdf = pd.DataFrame({"a": [1, 2, 3, None], "b": ["a", "b", "c", None]}) kdf = ks.DataFrame(pdf) kdf.to_csv(self.tmp_dir, nullValue="null", num_files=1) - expected = pdf.to_csv(index=False, na_rep='null') + expected = pdf.to_csv(index=False, na_rep="null") output_paths = [path for path in os.listdir(self.tmp_dir) if path.startswith("part-")] assert len(output_paths) > 0 diff --git a/databricks/koalas/tests/test_dataframe.py b/databricks/koalas/tests/test_dataframe.py index 80f9de8..d35c0e8 100644 --- a/databricks/koalas/tests/test_dataframe.py +++ b/databricks/koalas/tests/test_dataframe.py @@ -30,13 +30,12 @@ class DataFrameTest(ReusedSQLTestCase, SQLTestUtils): - @property def pdf(self): - return pd.DataFrame({ - 'a': [1, 2, 3, 4, 5, 6, 7, 8, 9], - 'b': [4, 5, 6, 3, 2, 1, 0, 0, 0], - }, index=np.random.rand(9)) + return pd.DataFrame( + {"a": [1, 2, 3, 4, 5, 6, 7, 8, 9], "b": [4, 5, 6, 3, 2, 1, 0, 0, 0],}, + index=np.random.rand(9), + ) @property def kdf(self): @@ -51,16 +50,16 @@ def df_pair(self): def test_dataframe(self): pdf, kdf = self.df_pair - expected = pd.Series([2, 3, 4, 5, 6, 7, 8, 9, 10], - index=pdf.index, - name='(a + 1)') # TODO: name='a' + expected = pd.Series( + [2, 3, 4, 5, 6, 7, 8, 9, 10], index=pdf.index, name="(a + 1)" + ) # TODO: name='a' - self.assert_eq(kdf['a'] + 1, expected) + self.assert_eq(kdf["a"] + 1, expected) - self.assert_eq(kdf.columns, pd.Index(['a', 'b'])) + self.assert_eq(kdf.columns, pd.Index(["a", "b"])) - self.assert_eq(kdf[kdf['b'] > 2], pdf[pdf['b'] > 2]) - self.assert_eq(kdf[['a', 'b']], pdf[['a', 'b']]) + self.assert_eq(kdf[kdf["b"] > 2], pdf[pdf["b"] > 2]) + self.assert_eq(kdf[["a", "b"]], pdf[["a", "b"]]) self.assert_eq(kdf.a, pdf.a) self.assert_eq(kdf.compute().b.mean(), pdf.b.mean()) self.assert_eq(np.allclose(kdf.compute().b.var(), pdf.b.var()), True) @@ -69,43 +68,46 @@ def test_dataframe(self): assert repr(kdf) pdf, kdf = self.df_pair - self.assert_eq(kdf[['a', 'b']], pdf[['a', 'b']]) + self.assert_eq(kdf[["a", "b"]], pdf[["a", "b"]]) self.assertEqual(kdf.a.notnull().alias("x").name, "x") # check ks.DataFrame(ks.Series) - pser = pd.Series([1, 2, 3], name='x', index=np.random.rand(3)) + pser = pd.Series([1, 2, 3], name="x", index=np.random.rand(3)) kser = ks.from_pandas(pser) self.assert_eq(pd.DataFrame(pser), ks.DataFrame(kser)) def test_dataframe_multiindex_columns(self): - pdf = pd.DataFrame({ - ('x', 'a', '1'): [1, 2, 3], 
- ('x', 'b', '2'): [4, 5, 6], - ('y.z', 'c.d', '3'): [7, 8, 9], - ('x', 'b', '4'): [10, 11, 12], - }, index=np.random.rand(3)) + pdf = pd.DataFrame( + { + ("x", "a", "1"): [1, 2, 3], + ("x", "b", "2"): [4, 5, 6], + ("y.z", "c.d", "3"): [7, 8, 9], + ("x", "b", "4"): [10, 11, 12], + }, + index=np.random.rand(3), + ) kdf = ks.from_pandas(pdf) self.assert_eq(kdf, pdf) - self.assert_eq(kdf['x'], pdf['x']) - self.assert_eq(kdf['y.z'], pdf['y.z']) - self.assert_eq(kdf['x']['b'], pdf['x']['b']) - self.assert_eq(kdf['x']['b']['2'], pdf['x']['b']['2']) + self.assert_eq(kdf["x"], pdf["x"]) + self.assert_eq(kdf["y.z"], pdf["y.z"]) + self.assert_eq(kdf["x"]["b"], pdf["x"]["b"]) + self.assert_eq(kdf["x"]["b"]["2"], pdf["x"]["b"]["2"]) self.assert_eq(kdf.x, pdf.x) self.assert_eq(kdf.x.b, pdf.x.b) - self.assert_eq(kdf.x.b['2'], pdf.x.b['2']) + self.assert_eq(kdf.x.b["2"], pdf.x.b["2"]) - self.assertRaises(KeyError, lambda: kdf['z']) + self.assertRaises(KeyError, lambda: kdf["z"]) self.assertRaises(AttributeError, lambda: kdf.z) - self.assert_eq(kdf[('x',)], pdf[('x',)]) - self.assert_eq(kdf[('x', 'a')], pdf[('x', 'a')]) - self.assert_eq(kdf[('x', 'a', '1')], pdf[('x', 'a', '1')]) + self.assert_eq(kdf[("x",)], pdf[("x",)]) + self.assert_eq(kdf[("x", "a")], pdf[("x", "a")]) + self.assert_eq(kdf[("x", "a", "1")], pdf[("x", "a", "1")]) def test_dataframe_column_level_name(self): - column = pd.Index(['A', 'B', 'C'], name='X') + column = pd.Index(["A", "B", "C"], name="X") pdf = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=column, index=np.random.rand(2)) kdf = ks.from_pandas(pdf) @@ -114,14 +116,15 @@ def test_dataframe_column_level_name(self): self.assert_eq(kdf.to_pandas().columns.names, pdf.columns.names) def test_dataframe_multiindex_names_level(self): - columns = pd.MultiIndex.from_tuples([('X', 'A', 'Z'), ('X', 'B', 'Z'), - ('Y', 'C', 'Z'), ('Y', 'D', 'Z')], - names=['lvl_1', 'lvl_2', 'lv_3']) - pdf = pd.DataFrame([[1, 2, 3, 4], - [5, 6, 7, 8], - [9, 10, 11, 12], - [13, 14, 15, 16], - [17, 18, 19, 20]], columns=columns, index=np.random.rand(5)) + columns = pd.MultiIndex.from_tuples( + [("X", "A", "Z"), ("X", "B", "Z"), ("Y", "C", "Z"), ("Y", "D", "Z")], + names=["lvl_1", "lvl_2", "lv_3"], + ) + pdf = pd.DataFrame( + [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16], [17, 18, 19, 20]], + columns=columns, + index=np.random.rand(5), + ) kdf = ks.from_pandas(pdf) self.assert_eq(kdf.columns.names, pdf.columns.names) @@ -130,29 +133,32 @@ def test_dataframe_multiindex_names_level(self): kdf1 = ks.from_pandas(pdf) self.assert_eq(kdf1.columns.names, pdf.columns.names) - with self.assertRaisesRegex(ValueError, 'Column_index_names should ' - 'be list-like or None for a MultiIndex'): - ks.DataFrame(kdf1._internal.copy(column_label_names='level')) - - self.assert_eq(kdf['X'], pdf['X']) - self.assert_eq(kdf['X'].columns.names, pdf['X'].columns.names) - self.assert_eq(kdf['X'].to_pandas().columns.names, pdf['X'].columns.names) - self.assert_eq(kdf['X']['A'], pdf['X']['A']) - self.assert_eq(kdf['X']['A'].columns.names, pdf['X']['A'].columns.names) - self.assert_eq(kdf['X']['A'].to_pandas().columns.names, pdf['X']['A'].columns.names) - self.assert_eq(kdf[('X', 'A')], pdf[('X', 'A')]) - self.assert_eq(kdf[('X', 'A')].columns.names, pdf[('X', 'A')].columns.names) - self.assert_eq(kdf[('X', 'A')].to_pandas().columns.names, pdf[('X', 'A')].columns.names) - self.assert_eq(kdf[('X', 'A', 'Z')], pdf[('X', 'A', 'Z')]) + with self.assertRaisesRegex( + ValueError, "Column_index_names should " "be list-like or None 
for a MultiIndex" + ): + ks.DataFrame(kdf1._internal.copy(column_label_names="level")) + + self.assert_eq(kdf["X"], pdf["X"]) + self.assert_eq(kdf["X"].columns.names, pdf["X"].columns.names) + self.assert_eq(kdf["X"].to_pandas().columns.names, pdf["X"].columns.names) + self.assert_eq(kdf["X"]["A"], pdf["X"]["A"]) + self.assert_eq(kdf["X"]["A"].columns.names, pdf["X"]["A"].columns.names) + self.assert_eq(kdf["X"]["A"].to_pandas().columns.names, pdf["X"]["A"].columns.names) + self.assert_eq(kdf[("X", "A")], pdf[("X", "A")]) + self.assert_eq(kdf[("X", "A")].columns.names, pdf[("X", "A")].columns.names) + self.assert_eq(kdf[("X", "A")].to_pandas().columns.names, pdf[("X", "A")].columns.names) + self.assert_eq(kdf[("X", "A", "Z")], pdf[("X", "A", "Z")]) def test_iterrows(self): - pdf = pd.DataFrame({ - ('x', 'a', '1'): [1, 2, 3], - ('x', 'b', '2'): [4, 5, 6], - ('y.z', 'c.d', '3'): [7, 8, 9], - ('x', 'b', '4'): [10, 11, 12], - }, - index=np.random.rand(3)) + pdf = pd.DataFrame( + { + ("x", "a", "1"): [1, 2, 3], + ("x", "b", "2"): [4, 5, 6], + ("y.z", "c.d", "3"): [7, 8, 9], + ("x", "b", "4"): [10, 11, 12], + }, + index=np.random.rand(3), + ) kdf = ks.from_pandas(pdf) for (pdf_k, pdf_v), (kdf_k, kdf_v) in zip(pdf.iterrows(), kdf.iterrows()): @@ -160,176 +166,190 @@ def test_iterrows(self): self.assert_eq(pdf_v, kdf_v) def test_reset_index_with_default_index_types(self): - pdf = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}, - index=np.random.rand(3)) + pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=np.random.rand(3)) kdf = ks.from_pandas(pdf) - with ks.option_context('compute.default_index_type', 'sequence'): + with ks.option_context("compute.default_index_type", "sequence"): self.assert_eq(kdf.reset_index(), pdf.reset_index()) - with ks.option_context('compute.default_index_type', 'distributed-sequence'): + with ks.option_context("compute.default_index_type", "distributed-sequence"): # the order might be changed. - self.assert_eq(kdf.reset_index().sort_index(), - pdf.reset_index()) + self.assert_eq(kdf.reset_index().sort_index(), pdf.reset_index()) - with ks.option_context('compute.default_index_type', 'distributed'): + with ks.option_context("compute.default_index_type", "distributed"): # the index is different. 
- self.assert_eq(kdf.reset_index().to_pandas().reset_index(drop=True), - pdf.reset_index()) + self.assert_eq(kdf.reset_index().to_pandas().reset_index(drop=True), pdf.reset_index()) def test_reset_index_with_multiindex_columns(self): - index = pd.MultiIndex.from_tuples([('bird', 'falcon'), - ('bird', 'parrot'), - ('mammal', 'lion'), - ('mammal', 'monkey')], - names=['class', 'name']) - columns = pd.MultiIndex.from_tuples([('speed', 'max'), - ('species', 'type')]) - pdf = pd.DataFrame([(389.0, 'fly'), - (24.0, 'fly'), - (80.5, 'run'), - (np.nan, 'jump')], - index=index, - columns=columns) + index = pd.MultiIndex.from_tuples( + [("bird", "falcon"), ("bird", "parrot"), ("mammal", "lion"), ("mammal", "monkey")], + names=["class", "name"], + ) + columns = pd.MultiIndex.from_tuples([("speed", "max"), ("species", "type")]) + pdf = pd.DataFrame( + [(389.0, "fly"), (24.0, "fly"), (80.5, "run"), (np.nan, "jump")], + index=index, + columns=columns, + ) kdf = ks.from_pandas(pdf) self.assert_eq(kdf, pdf) self.assert_eq(kdf.reset_index(), pdf.reset_index()) - self.assert_eq(kdf.reset_index(level='class'), pdf.reset_index(level='class')) - self.assert_eq(kdf.reset_index(level='class', col_level=1), - pdf.reset_index(level='class', col_level=1)) - self.assert_eq(kdf.reset_index(level='class', col_level=1, col_fill='species'), - pdf.reset_index(level='class', col_level=1, col_fill='species')) - self.assert_eq(kdf.reset_index(level='class', col_level=1, col_fill='genus'), - pdf.reset_index(level='class', col_level=1, col_fill='genus')) - - with self.assertRaisesRegex(IndexError, 'Index has only 2 levels, not 3'): + self.assert_eq(kdf.reset_index(level="class"), pdf.reset_index(level="class")) + self.assert_eq( + kdf.reset_index(level="class", col_level=1), pdf.reset_index(level="class", col_level=1) + ) + self.assert_eq( + kdf.reset_index(level="class", col_level=1, col_fill="species"), + pdf.reset_index(level="class", col_level=1, col_fill="species"), + ) + self.assert_eq( + kdf.reset_index(level="class", col_level=1, col_fill="genus"), + pdf.reset_index(level="class", col_level=1, col_fill="genus"), + ) + + with self.assertRaisesRegex(IndexError, "Index has only 2 levels, not 3"): kdf.reset_index(col_level=2) - pdf.index.names = [('x', 'class'), ('y', 'name')] - kdf.index.names = [('x', 'class'), ('y', 'name')] + pdf.index.names = [("x", "class"), ("y", "name")] + kdf.index.names = [("x", "class"), ("y", "name")] self.assert_eq(kdf.reset_index(), pdf.reset_index()) - with self.assertRaisesRegex(ValueError, 'Item must have length equal to number of levels.'): + with self.assertRaisesRegex(ValueError, "Item must have length equal to number of levels."): kdf.reset_index(col_level=1) def test_multiindex_column_access(self): - columns = pd.MultiIndex.from_tuples([('a', '', '', 'b'), - ('c', '', 'd', ''), - ('e', '', 'f', ''), - ('e', 'g', '', ''), - ('', '', '', 'h'), - ('i', '', '', '')]) + columns = pd.MultiIndex.from_tuples( + [ + ("a", "", "", "b"), + ("c", "", "d", ""), + ("e", "", "f", ""), + ("e", "g", "", ""), + ("", "", "", "h"), + ("i", "", "", ""), + ] + ) - pdf = pd.DataFrame([(1, 'a', 'x', 10, 100, 1000), - (2, 'b', 'y', 20, 200, 2000), - (3, 'c', 'z', 30, 300, 3000)], - columns=columns, - index=np.random.rand(3)) + pdf = pd.DataFrame( + [ + (1, "a", "x", 10, 100, 1000), + (2, "b", "y", 20, 200, 2000), + (3, "c", "z", 30, 300, 3000), + ], + columns=columns, + index=np.random.rand(3), + ) kdf = ks.from_pandas(pdf) self.assert_eq(kdf, pdf) - self.assert_eq(kdf['a'], pdf['a']) - 
self.assert_eq(kdf['a']['b'], pdf['a']['b']) - self.assert_eq(kdf['c'], pdf['c']) - self.assert_eq(kdf['c']['d'], pdf['c']['d']) - self.assert_eq(kdf['e'], pdf['e']) - self.assert_eq(kdf['e']['']['f'], pdf['e']['']['f']) - self.assert_eq(kdf['e']['g'], pdf['e']['g']) - self.assert_eq(kdf[''], pdf['']) - self.assert_eq(kdf['']['h'], pdf['']['h']) - self.assert_eq(kdf['i'], pdf['i']) - - self.assert_eq(kdf[['a', 'e']], pdf[['a', 'e']]) - self.assert_eq(kdf[['e', 'a']], pdf[['e', 'a']]) - - self.assert_eq(kdf[('a',)], pdf[('a',)]) - self.assert_eq(kdf[('e', 'g')], pdf[('e', 'g')]) - self.assert_eq(kdf[('i',)], pdf[('i',)]) - - self.assertRaises(KeyError, lambda: kdf[('a', 'b')]) + self.assert_eq(kdf["a"], pdf["a"]) + self.assert_eq(kdf["a"]["b"], pdf["a"]["b"]) + self.assert_eq(kdf["c"], pdf["c"]) + self.assert_eq(kdf["c"]["d"], pdf["c"]["d"]) + self.assert_eq(kdf["e"], pdf["e"]) + self.assert_eq(kdf["e"][""]["f"], pdf["e"][""]["f"]) + self.assert_eq(kdf["e"]["g"], pdf["e"]["g"]) + self.assert_eq(kdf[""], pdf[""]) + self.assert_eq(kdf[""]["h"], pdf[""]["h"]) + self.assert_eq(kdf["i"], pdf["i"]) + + self.assert_eq(kdf[["a", "e"]], pdf[["a", "e"]]) + self.assert_eq(kdf[["e", "a"]], pdf[["e", "a"]]) + + self.assert_eq(kdf[("a",)], pdf[("a",)]) + self.assert_eq(kdf[("e", "g")], pdf[("e", "g")]) + self.assert_eq(kdf[("i",)], pdf[("i",)]) + + self.assertRaises(KeyError, lambda: kdf[("a", "b")]) def test_repr_cache_invalidation(self): # If there is any cache, inplace operations should invalidate it. df = ks.range(10) df.__repr__() - df['a'] = df['id'] + df["a"] = df["id"] self.assertEqual(df.__repr__(), df.to_pandas().__repr__()) def test_repr_html_cache_invalidation(self): # If there is any cache, inplace operations should invalidate it. df = ks.range(10) df._repr_html_() - df['a'] = df['id'] + df["a"] = df["id"] self.assertEqual(df._repr_html_(), df.to_pandas()._repr_html_()) def test_empty_dataframe(self): - pdf = pd.DataFrame({'a': pd.Series([], dtype='i1'), - 'b': pd.Series([], dtype='str')}) + pdf = pd.DataFrame({"a": pd.Series([], dtype="i1"), "b": pd.Series([], dtype="str")}) self.assertRaises(ValueError, lambda: ks.from_pandas(pdf)) - with self.sql_conf({'spark.sql.execution.arrow.enabled': False}): + with self.sql_conf({"spark.sql.execution.arrow.enabled": False}): self.assertRaises(ValueError, lambda: ks.from_pandas(pdf)) def test_all_null_dataframe(self): - pdf = pd.DataFrame({'a': pd.Series([None, None, None], dtype='float64'), - 'b': pd.Series([None, None, None], dtype='str')}, - index=np.random.rand(3)) + pdf = pd.DataFrame( + { + "a": pd.Series([None, None, None], dtype="float64"), + "b": pd.Series([None, None, None], dtype="str"), + }, + index=np.random.rand(3), + ) self.assertRaises(ValueError, lambda: ks.from_pandas(pdf)) - with self.sql_conf({'spark.sql.execution.arrow.enabled': False}): + with self.sql_conf({"spark.sql.execution.arrow.enabled": False}): self.assertRaises(ValueError, lambda: ks.from_pandas(pdf)) def test_nullable_object(self): - pdf = pd.DataFrame({'a': list('abc') + [np.nan], - 'b': list(range(1, 4)) + [np.nan], - 'c': list(np.arange(3, 6).astype('i1')) + [np.nan], - 'd': list(np.arange(4.0, 7.0, dtype='float64')) + [np.nan], - 'e': [True, False, True, np.nan], - 'f': list(pd.date_range('20130101', periods=3)) + [np.nan]}, - index=np.random.rand(4)) + pdf = pd.DataFrame( + { + "a": list("abc") + [np.nan], + "b": list(range(1, 4)) + [np.nan], + "c": list(np.arange(3, 6).astype("i1")) + [np.nan], + "d": list(np.arange(4.0, 7.0, dtype="float64")) + [np.nan], + "e": 
[True, False, True, np.nan], + "f": list(pd.date_range("20130101", periods=3)) + [np.nan], + }, + index=np.random.rand(4), + ) kdf = ks.from_pandas(pdf) self.assert_eq(kdf, pdf) - with self.sql_conf({'spark.sql.execution.arrow.enabled': False}): + with self.sql_conf({"spark.sql.execution.arrow.enabled": False}): kdf = ks.from_pandas(pdf) self.assert_eq(kdf, pdf) def test_assign(self): pdf, kdf = self.df_pair - kdf['w'] = 1.0 - pdf['w'] = 1.0 + kdf["w"] = 1.0 + pdf["w"] = 1.0 self.assert_eq(kdf, pdf) - kdf = kdf.assign(a=kdf['a'] * 2) - pdf = pdf.assign(a=pdf['a'] * 2) + kdf = kdf.assign(a=kdf["a"] * 2) + pdf = pdf.assign(a=pdf["a"] * 2) self.assert_eq(kdf, pdf) # multi-index columns - columns = pd.MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'w')]) + columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "w")]) pdf.columns = columns kdf.columns = columns - kdf[('a', 'c')] = 'def' - pdf[('a', 'c')] = 'def' + kdf[("a", "c")] = "def" + pdf[("a", "c")] = "def" self.assert_eq(kdf, pdf) - kdf = kdf.assign(Z='ZZ') - pdf = pdf.assign(Z='ZZ') + kdf = kdf.assign(Z="ZZ") + pdf = pdf.assign(Z="ZZ") self.assert_eq(kdf, pdf) - kdf['x'] = 'ghi' - pdf['x'] = 'ghi' + kdf["x"] = "ghi" + pdf["x"] = "ghi" self.assert_eq(kdf, pdf) @@ -342,43 +362,43 @@ def test_head_tail(self): def test_attributes(self): kdf = self.kdf - self.assertIn('a', dir(kdf)) - self.assertNotIn('foo', dir(kdf)) + self.assertIn("a", dir(kdf)) + self.assertNotIn("foo", dir(kdf)) self.assertRaises(AttributeError, lambda: kdf.foo) - kdf = ks.DataFrame({'a b c': [1, 2, 3]}) - self.assertNotIn('a b c', dir(kdf)) - kdf = ks.DataFrame({'a': [1, 2], 5: [1, 2]}) - self.assertIn('a', dir(kdf)) + kdf = ks.DataFrame({"a b c": [1, 2, 3]}) + self.assertNotIn("a b c", dir(kdf)) + kdf = ks.DataFrame({"a": [1, 2], 5: [1, 2]}) + self.assertIn("a", dir(kdf)) self.assertNotIn(5, dir(kdf)) def test_column_names(self): kdf = self.kdf - self.assert_eq(kdf.columns, pd.Index(['a', 'b'])) - self.assert_eq(kdf[['b', 'a']].columns, pd.Index(['b', 'a'])) - self.assertEqual(kdf['a'].name, 'a') - self.assertEqual((kdf['a'] + 1).name, 'a') - self.assertEqual((kdf['a'] + kdf['b']).name, 'a') # TODO: None + self.assert_eq(kdf.columns, pd.Index(["a", "b"])) + self.assert_eq(kdf[["b", "a"]].columns, pd.Index(["b", "a"])) + self.assertEqual(kdf["a"].name, "a") + self.assertEqual((kdf["a"] + 1).name, "a") + self.assertEqual((kdf["a"] + kdf["b"]).name, "a") # TODO: None def test_rename_columns(self): - pdf = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7], - 'b': [7, 6, 5, 4, 3, 2, 1]}, - index=np.random.rand(7)) + pdf = pd.DataFrame( + {"a": [1, 2, 3, 4, 5, 6, 7], "b": [7, 6, 5, 4, 3, 2, 1]}, index=np.random.rand(7) + ) kdf = ks.from_pandas(pdf) - kdf.columns = ['x', 'y'] - pdf.columns = ['x', 'y'] - self.assert_eq(kdf.columns, pd.Index(['x', 'y'])) + kdf.columns = ["x", "y"] + pdf.columns = ["x", "y"] + self.assert_eq(kdf.columns, pd.Index(["x", "y"])) self.assert_eq(kdf, pdf) - self.assert_eq(kdf._internal.data_columns, ['x', 'y']) - self.assert_eq(kdf._internal.spark_df.columns, ['x', 'y']) + self.assert_eq(kdf._internal.data_columns, ["x", "y"]) + self.assert_eq(kdf._internal.spark_df.columns, ["x", "y"]) columns = pdf.columns - columns.name = 'lvl_1' + columns.name = "lvl_1" kdf.columns = columns - self.assert_eq(kdf.columns.names, ['lvl_1']) + self.assert_eq(kdf.columns.names, ["lvl_1"]) self.assert_eq(kdf, pdf) msg = "Length mismatch: Expected axis has 2 elements, new values have 4 elements" @@ -386,20 +406,21 @@ def test_rename_columns(self): 
kdf.columns = [1, 2, 3, 4] # Multi-index columns - pdf = pd.DataFrame({('A', '0'): [1, 2, 2, 3], ('B', '1'): [1, 2, 3, 4]}, - index=np.random.rand(4)) + pdf = pd.DataFrame( + {("A", "0"): [1, 2, 2, 3], ("B", "1"): [1, 2, 3, 4]}, index=np.random.rand(4) + ) kdf = ks.from_pandas(pdf) columns = pdf.columns self.assert_eq(kdf.columns, columns) self.assert_eq(kdf, pdf) - pdf.columns = ['x', 'y'] - kdf.columns = ['x', 'y'] - self.assert_eq(kdf.columns, pd.Index(['x', 'y'])) + pdf.columns = ["x", "y"] + kdf.columns = ["x", "y"] + self.assert_eq(kdf.columns, pd.Index(["x", "y"])) self.assert_eq(kdf, pdf) - self.assert_eq(kdf._internal.data_columns, ['x', 'y']) - self.assert_eq(kdf._internal.spark_df.columns, ['x', 'y']) + self.assert_eq(kdf._internal.data_columns, ["x", "y"]) + self.assert_eq(kdf._internal.spark_df.columns, ["x", "y"]) pdf.columns = columns kdf.columns = columns @@ -408,10 +429,10 @@ def test_rename_columns(self): self.assert_eq(kdf._internal.data_columns, ["(A, 0)", "(B, 1)"]) self.assert_eq(kdf._internal.spark_df.columns, ["(A, 0)", "(B, 1)"]) - columns.names = ['lvl_1', 'lvl_2'] + columns.names = ["lvl_1", "lvl_2"] kdf.columns = columns - self.assert_eq(kdf.columns.names, ['lvl_1', 'lvl_2']) + self.assert_eq(kdf.columns.names, ["lvl_1", "lvl_2"]) self.assert_eq(kdf, pdf) self.assert_eq(kdf._internal.data_columns, ["(A, 0)", "(B, 1)"]) self.assert_eq(kdf._internal.spark_df.columns, ["(A, 0)", "(B, 1)"]) @@ -419,73 +440,87 @@ def test_rename_columns(self): def test_rename_dataframe(self): kdf1 = ks.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) result_kdf = kdf1.rename(columns={"A": "a", "B": "b"}) - self.assert_eq(result_kdf.columns, pd.Index(['a', 'b'])) + self.assert_eq(result_kdf.columns, pd.Index(["a", "b"])) result_kdf = kdf1.rename(index={1: 10, 2: 20}) self.assert_eq(result_kdf.index, pd.Index([0, 10, 20])) - self.assertTrue(kdf1 is not result_kdf, - "expect return new dataframe when inplace argument is False") + self.assertTrue( + kdf1 is not result_kdf, "expect return new dataframe when inplace argument is False" + ) result_kdf2 = result_kdf.rename(index={1: 10, 2: 20}, inplace=True) - self.assertTrue(result_kdf2 is result_kdf, - "expect return the same dataframe when inplace argument is False") + self.assertTrue( + result_kdf2 is result_kdf, + "expect return the same dataframe when inplace argument is False", + ) def str_lower(s) -> str: return str.lower(s) - result_kdf = kdf1.rename(str_lower, axis='columns') - self.assert_eq(result_kdf.columns, pd.Index(['a', 'b'])) + result_kdf = kdf1.rename(str_lower, axis="columns") + self.assert_eq(result_kdf.columns, pd.Index(["a", "b"])) def mul10(x) -> int: return x * 10 - result_kdf = kdf1.rename(mul10, axis='index') + result_kdf = kdf1.rename(mul10, axis="index") self.assert_eq(result_kdf.index, pd.Index([0, 10, 20])) result_kdf = kdf1.rename(columns=str_lower, index={1: 10, 2: 20}) - self.assert_eq(result_kdf.columns, pd.Index(['a', 'b'])) + self.assert_eq(result_kdf.columns, pd.Index(["a", "b"])) self.assert_eq(result_kdf.index, pd.Index([0, 10, 20])) - idx = pd.MultiIndex.from_tuples([('X', 'A'), ('X', 'B'), ('Y', 'C'), ('Y', 'D')]) + idx = pd.MultiIndex.from_tuples([("X", "A"), ("X", "B"), ("Y", "C"), ("Y", "D")]) kdf2 = ks.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=idx) result_kdf = kdf2.rename(columns=str_lower) - self.assert_eq(result_kdf.columns, - pd.MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'c'), ('y', 'd')])) + self.assert_eq( + result_kdf.columns, + pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), 
("y", "c"), ("y", "d")]), + ) result_kdf = kdf2.rename(columns=str_lower, level=0) - self.assert_eq(result_kdf.columns, - pd.MultiIndex.from_tuples([('x', 'A'), ('x', 'B'), ('y', 'C'), ('y', 'D')])) + self.assert_eq( + result_kdf.columns, + pd.MultiIndex.from_tuples([("x", "A"), ("x", "B"), ("y", "C"), ("y", "D")]), + ) result_kdf = kdf2.rename(columns=str_lower, level=1) - self.assert_eq(result_kdf.columns, - pd.MultiIndex.from_tuples([('X', 'a'), ('X', 'b'), ('Y', 'c'), ('Y', 'd')])) + self.assert_eq( + result_kdf.columns, + pd.MultiIndex.from_tuples([("X", "a"), ("X", "b"), ("Y", "c"), ("Y", "d")]), + ) - kdf3 = ks.DataFrame([[1, 2], [3, 4], [5, 6], [7, 8]], index=idx, columns=list('ab')) + kdf3 = ks.DataFrame([[1, 2], [3, 4], [5, 6], [7, 8]], index=idx, columns=list("ab")) # for spark 2.3, disable arrow optimization. Because koalas multi-index do not support # arrow optimization in spark 2.3. result_kdf = kdf3.rename(index=str_lower) - self.assert_eq(result_kdf.index, - pd.MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'c'), ('y', 'd')])) + self.assert_eq( + result_kdf.index, + pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c"), ("y", "d")]), + ) result_kdf = kdf3.rename(index=str_lower, level=0) - self.assert_eq(result_kdf.index, - pd.MultiIndex.from_tuples([('x', 'A'), ('x', 'B'), ('y', 'C'), ('y', 'D')])) + self.assert_eq( + result_kdf.index, + pd.MultiIndex.from_tuples([("x", "A"), ("x", "B"), ("y", "C"), ("y", "D")]), + ) result_kdf = kdf3.rename(index=str_lower, level=1) - self.assert_eq(result_kdf.index, - pd.MultiIndex.from_tuples([('X', 'a'), ('X', 'b'), ('Y', 'c'), ('Y', 'd')])) + self.assert_eq( + result_kdf.index, + pd.MultiIndex.from_tuples([("X", "a"), ("X", "b"), ("Y", "c"), ("Y", "d")]), + ) def test_dot_in_column_name(self): self.assert_eq( - ks.DataFrame(ks.range(1)._sdf.selectExpr("1 as `a.b`"))['a.b'], - ks.Series([1])) + ks.DataFrame(ks.range(1)._sdf.selectExpr("1 as `a.b`"))["a.b"], ks.Series([1]) + ) def test_drop(self): - pdf = pd.DataFrame({'x': [1, 2], 'y': [3, 4], 'z': [5, 6]}, - index=np.random.rand(2)) + pdf = pd.DataFrame({"x": [1, 2], "y": [3, 4], "z": [5, 6]}, index=np.random.rand(2)) kdf = ks.from_pandas(pdf) # Assert 'labels' or 'columns' parameter is set @@ -494,51 +529,57 @@ def test_drop(self): kdf.drop() # Assert axis cannot be 0 with self.assertRaisesRegex(NotImplementedError, "Drop currently only works for axis=1"): - kdf.drop('x', axis=0) + kdf.drop("x", axis=0) # Assert using a str for 'labels' works - self.assert_eq(kdf.drop('x', axis=1), pdf.drop('x', axis=1)) + self.assert_eq(kdf.drop("x", axis=1), pdf.drop("x", axis=1)) # Assert axis is 1 by default - self.assert_eq(kdf.drop('x'), pdf.drop('x', axis=1)) + self.assert_eq(kdf.drop("x"), pdf.drop("x", axis=1)) # Assert using a list for 'labels' works - self.assert_eq(kdf.drop(['y', 'z'], axis=1), pdf.drop(['y', 'z'], axis=1)) + self.assert_eq(kdf.drop(["y", "z"], axis=1), pdf.drop(["y", "z"], axis=1)) # Assert using 'columns' instead of 'labels' produces the same results - self.assert_eq(kdf.drop(columns='x'), pdf.drop(columns='x')) - self.assert_eq(kdf.drop(columns=['y', 'z']), pdf.drop(columns=['y', 'z'])) + self.assert_eq(kdf.drop(columns="x"), pdf.drop(columns="x")) + self.assert_eq(kdf.drop(columns=["y", "z"]), pdf.drop(columns=["y", "z"])) # Assert 'labels' being used when both 'labels' and 'columns' are specified # TODO: should throw an error? 
- expected_output = pd.DataFrame({'y': [3, 4], 'z': [5, 6]}, index=kdf.index.to_pandas()) - self.assert_eq(kdf.drop(labels=['x'], columns=['y']), expected_output) + expected_output = pd.DataFrame({"y": [3, 4], "z": [5, 6]}, index=kdf.index.to_pandas()) + self.assert_eq(kdf.drop(labels=["x"], columns=["y"]), expected_output) - columns = pd.MultiIndex.from_tuples([('a', 'x'), ('a', 'y'), ('b', 'z')]) + columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")]) pdf.columns = columns kdf = ks.from_pandas(pdf) - self.assert_eq(kdf.drop(columns='a'), pdf.drop(columns='a')) - self.assert_eq(kdf.drop(columns=('a', 'x')), pdf.drop(columns=('a', 'x'))) - self.assert_eq(kdf.drop(columns=[('a', 'x'), 'b']), pdf.drop(columns=[('a', 'x'), 'b'])) + self.assert_eq(kdf.drop(columns="a"), pdf.drop(columns="a")) + self.assert_eq(kdf.drop(columns=("a", "x")), pdf.drop(columns=("a", "x"))) + self.assert_eq(kdf.drop(columns=[("a", "x"), "b"]), pdf.drop(columns=[("a", "x"), "b"])) - self.assertRaises(KeyError, lambda: kdf.drop(columns='c')) - self.assertRaises(KeyError, lambda: kdf.drop(columns=('a', 'z'))) + self.assertRaises(KeyError, lambda: kdf.drop(columns="c")) + self.assertRaises(KeyError, lambda: kdf.drop(columns=("a", "z"))) def test_dropna(self): - pdf = pd.DataFrame({'x': [np.nan, 2, 3, 4, np.nan, 6], - 'y': [1, 2, np.nan, 4, np.nan, np.nan], - 'z': [1, 2, 3, 4, np.nan, np.nan]}, - index=np.random.rand(6)) + pdf = pd.DataFrame( + { + "x": [np.nan, 2, 3, 4, np.nan, 6], + "y": [1, 2, np.nan, 4, np.nan, np.nan], + "z": [1, 2, 3, 4, np.nan, np.nan], + }, + index=np.random.rand(6), + ) kdf = ks.from_pandas(pdf) self.assert_eq(kdf.dropna(), pdf.dropna()) - self.assert_eq(kdf.dropna(how='all'), pdf.dropna(how='all')) - self.assert_eq(kdf.dropna(subset=['x']), pdf.dropna(subset=['x'])) - self.assert_eq(kdf.dropna(subset='x'), pdf.dropna(subset=['x'])) - self.assert_eq(kdf.dropna(subset=['y', 'z']), pdf.dropna(subset=['y', 'z'])) - self.assert_eq(kdf.dropna(subset=['y', 'z'], how='all'), - pdf.dropna(subset=['y', 'z'], how='all')) + self.assert_eq(kdf.dropna(how="all"), pdf.dropna(how="all")) + self.assert_eq(kdf.dropna(subset=["x"]), pdf.dropna(subset=["x"])) + self.assert_eq(kdf.dropna(subset="x"), pdf.dropna(subset=["x"])) + self.assert_eq(kdf.dropna(subset=["y", "z"]), pdf.dropna(subset=["y", "z"])) + self.assert_eq( + kdf.dropna(subset=["y", "z"], how="all"), pdf.dropna(subset=["y", "z"], how="all") + ) self.assert_eq(kdf.dropna(thresh=2), pdf.dropna(thresh=2)) - self.assert_eq(kdf.dropna(thresh=1, subset=['y', 'z']), - pdf.dropna(thresh=1, subset=['y', 'z'])) + self.assert_eq( + kdf.dropna(thresh=1, subset=["y", "z"]), pdf.dropna(thresh=1, subset=["y", "z"]) + ) ddf2 = kdf.copy() ddf2.dropna(inplace=True) @@ -548,167 +589,194 @@ def test_dropna(self): with self.assertRaisesRegex(NotImplementedError, msg): kdf.dropna(axis=1) with self.assertRaisesRegex(NotImplementedError, msg): - kdf.dropna(axis='columns') - with self.assertRaisesRegex(ValueError, 'No axis named foo'): - kdf.dropna(axis='foo') + kdf.dropna(axis="columns") + with self.assertRaisesRegex(ValueError, "No axis named foo"): + kdf.dropna(axis="foo") - self.assertRaises(KeyError, lambda: kdf.dropna(subset='1')) + self.assertRaises(KeyError, lambda: kdf.dropna(subset="1")) with self.assertRaisesRegex(ValueError, "invalid how option: 1"): kdf.dropna(how=1) with self.assertRaisesRegex(TypeError, "must specify how or thresh"): kdf.dropna(how=None) # multi-index columns - columns = pd.MultiIndex.from_tuples([('a', 'x'), ('a', 'y'), 
('b', 'z')]) + columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")]) pdf.columns = columns kdf.columns = columns self.assert_eq(kdf.dropna(), pdf.dropna()) - self.assert_eq(kdf.dropna(how='all'), pdf.dropna(how='all')) - self.assert_eq(kdf.dropna(subset=[('a', 'x')]), pdf.dropna(subset=[('a', 'x')])) - self.assert_eq(kdf.dropna(subset=('a', 'x')), pdf.dropna(subset=[('a', 'x')])) - self.assert_eq(kdf.dropna(subset=[('a', 'y'), ('b', 'z')]), - pdf.dropna(subset=[('a', 'y'), ('b', 'z')])) - self.assert_eq(kdf.dropna(subset=[('a', 'y'), ('b', 'z')], how='all'), - pdf.dropna(subset=[('a', 'y'), ('b', 'z')], how='all')) + self.assert_eq(kdf.dropna(how="all"), pdf.dropna(how="all")) + self.assert_eq(kdf.dropna(subset=[("a", "x")]), pdf.dropna(subset=[("a", "x")])) + self.assert_eq(kdf.dropna(subset=("a", "x")), pdf.dropna(subset=[("a", "x")])) + self.assert_eq( + kdf.dropna(subset=[("a", "y"), ("b", "z")]), pdf.dropna(subset=[("a", "y"), ("b", "z")]) + ) + self.assert_eq( + kdf.dropna(subset=[("a", "y"), ("b", "z")], how="all"), + pdf.dropna(subset=[("a", "y"), ("b", "z")], how="all"), + ) self.assert_eq(kdf.dropna(thresh=2), pdf.dropna(thresh=2)) - self.assert_eq(kdf.dropna(thresh=1, subset=[('a', 'y'), ('b', 'z')]), - pdf.dropna(thresh=1, subset=[('a', 'y'), ('b', 'z')])) + self.assert_eq( + kdf.dropna(thresh=1, subset=[("a", "y"), ("b", "z")]), + pdf.dropna(thresh=1, subset=[("a", "y"), ("b", "z")]), + ) def test_dtype(self): - pdf = pd.DataFrame({'a': list('abc'), - 'b': list(range(1, 4)), - 'c': np.arange(3, 6).astype('i1'), - 'd': np.arange(4.0, 7.0, dtype='float64'), - 'e': [True, False, True], - 'f': pd.date_range('20130101', periods=3)}, - index=np.random.rand(3)) + pdf = pd.DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6).astype("i1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.date_range("20130101", periods=3), + }, + index=np.random.rand(3), + ) kdf = ks.from_pandas(pdf) self.assert_eq(kdf, pdf) self.assertTrue((kdf.dtypes == pdf.dtypes).all()) # multi-index columns - columns = pd.MultiIndex.from_tuples(zip(list('xxxyyz'), list('abcdef'))) + columns = pd.MultiIndex.from_tuples(zip(list("xxxyyz"), list("abcdef"))) pdf.columns = columns kdf.columns = columns self.assertTrue((kdf.dtypes == pdf.dtypes).all()) def test_fillna(self): - pdf = pd.DataFrame({'x': [np.nan, 2, 3, 4, np.nan, 6], - 'y': [1, 2, np.nan, 4, np.nan, np.nan], - 'z': [1, 2, 3, 4, np.nan, np.nan]}, - index=np.random.rand(6)) + pdf = pd.DataFrame( + { + "x": [np.nan, 2, 3, 4, np.nan, 6], + "y": [1, 2, np.nan, 4, np.nan, np.nan], + "z": [1, 2, 3, 4, np.nan, np.nan], + }, + index=np.random.rand(6), + ) kdf = ks.from_pandas(pdf) self.assert_eq(kdf, pdf) self.assert_eq(kdf.fillna(-1), pdf.fillna(-1)) - self.assert_eq(kdf.fillna({'x': -1, 'y': -2, 'z': -5}), - pdf.fillna({'x': -1, 'y': -2, 'z': -5})) - self.assert_eq(pdf.fillna(method='ffill'), kdf.fillna(method='ffill')) - self.assert_eq(pdf.fillna(method='ffill', limit=2), kdf.fillna(method='ffill', limit=2)) - self.assert_eq(pdf.fillna(method='bfill'), kdf.fillna(method='bfill')) - self.assert_eq(pdf.fillna(method='bfill', limit=2), kdf.fillna(method='bfill', limit=2)) + self.assert_eq( + kdf.fillna({"x": -1, "y": -2, "z": -5}), pdf.fillna({"x": -1, "y": -2, "z": -5}) + ) + self.assert_eq(pdf.fillna(method="ffill"), kdf.fillna(method="ffill")) + self.assert_eq(pdf.fillna(method="ffill", limit=2), kdf.fillna(method="ffill", limit=2)) + self.assert_eq(pdf.fillna(method="bfill"), 
kdf.fillna(method="bfill")) + self.assert_eq(pdf.fillna(method="bfill", limit=2), kdf.fillna(method="bfill", limit=2)) - pdf = pdf.set_index(['x', 'y']) + pdf = pdf.set_index(["x", "y"]) kdf = ks.from_pandas(pdf) # check multi index self.assert_eq(kdf.fillna(-1), pdf.fillna(-1)) - self.assert_eq(pdf.fillna(method='bfill'), kdf.fillna(method='bfill')) - self.assert_eq(pdf.fillna(method='ffill'), kdf.fillna(method='ffill')) + self.assert_eq(pdf.fillna(method="bfill"), kdf.fillna(method="bfill")) + self.assert_eq(pdf.fillna(method="ffill"), kdf.fillna(method="ffill")) - pdf.fillna({'x': -1, 'y': -2, 'z': -5}, inplace=True) - kdf.fillna({'x': -1, 'y': -2, 'z': -5}, inplace=True) + pdf.fillna({"x": -1, "y": -2, "z": -5}, inplace=True) + kdf.fillna({"x": -1, "y": -2, "z": -5}, inplace=True) self.assert_eq(kdf, pdf) - s_nan = pd.Series([-1, -2, -5], index=['x', 'y', 'z'], dtype=int) - self.assert_eq(kdf.fillna(s_nan), - pdf.fillna(s_nan)) + s_nan = pd.Series([-1, -2, -5], index=["x", "y", "z"], dtype=int) + self.assert_eq(kdf.fillna(s_nan), pdf.fillna(s_nan)) with self.assertRaisesRegex(NotImplementedError, "fillna currently only"): kdf.fillna(-1, axis=1) with self.assertRaisesRegex(NotImplementedError, "fillna currently only"): - kdf.fillna(-1, axis='columns') + kdf.fillna(-1, axis="columns") with self.assertRaisesRegex(ValueError, "limit parameter for value is not support now"): kdf.fillna(-1, limit=1) with self.assertRaisesRegex(TypeError, "Unsupported.*DataFrame"): - kdf.fillna(pd.DataFrame({'x': [-1], 'y': [-1], 'z': [-1]})) + kdf.fillna(pd.DataFrame({"x": [-1], "y": [-1], "z": [-1]})) with self.assertRaisesRegex(TypeError, "Unsupported.*numpy.int64"): - kdf.fillna({'x': np.int64(-6), 'y': np.int64(-4), 'z': -5}) + kdf.fillna({"x": np.int64(-6), "y": np.int64(-4), "z": -5}) with self.assertRaisesRegex(ValueError, "Expecting 'pad', 'ffill', 'backfill' or 'bfill'."): - kdf.fillna(method='xxx') - with self.assertRaisesRegex(ValueError, - "Must specify a fillna 'value' or 'method' parameter."): + kdf.fillna(method="xxx") + with self.assertRaisesRegex( + ValueError, "Must specify a fillna 'value' or 'method' parameter." 
+ ): kdf.fillna() # multi-index columns - pdf = pd.DataFrame({('x', 'a'): [np.nan, 2, 3, 4, np.nan, 6], - ('x', 'b'): [1, 2, np.nan, 4, np.nan, np.nan], - ('y', 'c'): [1, 2, 3, 4, np.nan, np.nan]}, - index=np.random.rand(6)) + pdf = pd.DataFrame( + { + ("x", "a"): [np.nan, 2, 3, 4, np.nan, 6], + ("x", "b"): [1, 2, np.nan, 4, np.nan, np.nan], + ("y", "c"): [1, 2, 3, 4, np.nan, np.nan], + }, + index=np.random.rand(6), + ) kdf = ks.from_pandas(pdf) self.assert_eq(kdf.fillna(-1), pdf.fillna(-1)) - self.assert_eq(kdf.fillna({('x', 'a'): -1, ('x', 'b'): -2, ('y', 'c'): -5}), - pdf.fillna({('x', 'a'): -1, ('x', 'b'): -2, ('y', 'c'): -5})) - self.assert_eq(pdf.fillna(method='ffill'), kdf.fillna(method='ffill')) - self.assert_eq(pdf.fillna(method='ffill', limit=2), kdf.fillna(method='ffill', limit=2)) - self.assert_eq(pdf.fillna(method='bfill'), kdf.fillna(method='bfill')) - self.assert_eq(pdf.fillna(method='bfill', limit=2), kdf.fillna(method='bfill', limit=2)) + self.assert_eq( + kdf.fillna({("x", "a"): -1, ("x", "b"): -2, ("y", "c"): -5}), + pdf.fillna({("x", "a"): -1, ("x", "b"): -2, ("y", "c"): -5}), + ) + self.assert_eq(pdf.fillna(method="ffill"), kdf.fillna(method="ffill")) + self.assert_eq(pdf.fillna(method="ffill", limit=2), kdf.fillna(method="ffill", limit=2)) + self.assert_eq(pdf.fillna(method="bfill"), kdf.fillna(method="bfill")) + self.assert_eq(pdf.fillna(method="bfill", limit=2), kdf.fillna(method="bfill", limit=2)) - self.assert_eq(kdf.fillna({'x': -1}), pdf.fillna({'x': -1})) + self.assert_eq(kdf.fillna({"x": -1}), pdf.fillna({"x": -1})) if sys.version_info >= (3, 6): # flaky in Python 3.5. - self.assert_eq(kdf.fillna({'x': -1, ('x', 'b'): -2}), - pdf.fillna({'x': -1, ('x', 'b'): -2})) - self.assert_eq(kdf.fillna({('x', 'b'): -2, 'x': -1}), - pdf.fillna({('x', 'b'): -2, 'x': -1})) + self.assert_eq( + kdf.fillna({"x": -1, ("x", "b"): -2}), pdf.fillna({"x": -1, ("x", "b"): -2}) + ) + self.assert_eq( + kdf.fillna({("x", "b"): -2, "x": -1}), pdf.fillna({("x", "b"): -2, "x": -1}) + ) # check multi index - pdf = pdf.set_index([('x', 'a'), ('x', 'b')]) + pdf = pdf.set_index([("x", "a"), ("x", "b")]) kdf = ks.from_pandas(pdf) self.assert_eq(kdf.fillna(-1), pdf.fillna(-1)) - self.assert_eq(kdf.fillna({('x', 'a'): -1, ('x', 'b'): -2, ('y', 'c'): -5}), - pdf.fillna({('x', 'a'): -1, ('x', 'b'): -2, ('y', 'c'): -5})) + self.assert_eq( + kdf.fillna({("x", "a"): -1, ("x", "b"): -2, ("y", "c"): -5}), + pdf.fillna({("x", "a"): -1, ("x", "b"): -2, ("y", "c"): -5}), + ) def test_isnull(self): - pdf = pd.DataFrame({'x': [1, 2, 3, 4, None, 6], 'y': list('abdabd')}, - index=np.random.rand(6)) + pdf = pd.DataFrame( + {"x": [1, 2, 3, 4, None, 6], "y": list("abdabd")}, index=np.random.rand(6) + ) kdf = ks.from_pandas(pdf) self.assert_eq(kdf.notnull(), pdf.notnull()) self.assert_eq(kdf.isnull(), pdf.isnull()) def test_to_datetime(self): - pdf = pd.DataFrame({'year': [2015, 2016], - 'month': [2, 3], - 'day': [4, 5]}, - index=np.random.rand(2)) + pdf = pd.DataFrame( + {"year": [2015, 2016], "month": [2, 3], "day": [4, 5]}, index=np.random.rand(2) + ) kdf = ks.from_pandas(pdf) self.assert_eq(pd.to_datetime(pdf), ks.to_datetime(kdf)) def test_nunique(self): - pdf = pd.DataFrame({'A': [1, 2, 3], 'B': [np.nan, 3, np.nan]}, - index=np.random.rand(3)) + pdf = pd.DataFrame({"A": [1, 2, 3], "B": [np.nan, 3, np.nan]}, index=np.random.rand(3)) kdf = ks.from_pandas(pdf) # Assert NaNs are dropped by default nunique_result = kdf.nunique() - self.assert_eq(nunique_result, pd.Series([3, 1], index=['A', 'B'], name='0')) + 
self.assert_eq(nunique_result, pd.Series([3, 1], index=["A", "B"], name="0")) self.assert_eq(nunique_result, pdf.nunique()) # Assert including NaN values nunique_result = kdf.nunique(dropna=False) - self.assert_eq(nunique_result, pd.Series([3, 2], index=['A', 'B'], name='0')) + self.assert_eq(nunique_result, pd.Series([3, 2], index=["A", "B"], name="0")) self.assert_eq(nunique_result, pdf.nunique(dropna=False)) # Assert approximate counts - self.assert_eq(ks.DataFrame({'A': range(100)}).nunique(approx=True), - pd.Series([103], index=['A'], name='0')) - self.assert_eq(ks.DataFrame({'A': range(100)}).nunique(approx=True, rsd=0.01), - pd.Series([100], index=['A'], name='0')) + self.assert_eq( + ks.DataFrame({"A": range(100)}).nunique(approx=True), + pd.Series([103], index=["A"], name="0"), + ) + self.assert_eq( + ks.DataFrame({"A": range(100)}).nunique(approx=True, rsd=0.01), + pd.Series([100], index=["A"], name="0"), + ) # Assert unsupported axis value yet msg = 'axis should be either 0 or "index" currently.' @@ -716,7 +784,7 @@ def test_nunique(self): kdf.nunique(axis=1) # multi-index columns - columns = pd.MultiIndex.from_tuples([('X', 'A'), ('Y', 'B')], names=['1', '2']) + columns = pd.MultiIndex.from_tuples([("X", "A"), ("Y", "B")], names=["1", "2"]) pdf.columns = columns kdf.columns = columns @@ -724,158 +792,186 @@ def test_nunique(self): self.assert_eq(kdf.nunique(dropna=False), pdf.nunique(dropna=False)) def test_sort_values(self): - pdf = pd.DataFrame({'a': [1, 2, 3, 4, 5, None, 7], - 'b': [7, 6, 5, 4, 3, 2, 1]}, - index=np.random.rand(7)) + pdf = pd.DataFrame( + {"a": [1, 2, 3, 4, 5, None, 7], "b": [7, 6, 5, 4, 3, 2, 1]}, index=np.random.rand(7) + ) kdf = ks.from_pandas(pdf) - self.assert_eq(repr(kdf.sort_values('b')), repr(pdf.sort_values('b'))) - self.assert_eq(repr(kdf.sort_values(['b', 'a'])), repr(pdf.sort_values(['b', 'a']))) + self.assert_eq(repr(kdf.sort_values("b")), repr(pdf.sort_values("b"))) + self.assert_eq(repr(kdf.sort_values(["b", "a"])), repr(pdf.sort_values(["b", "a"]))) self.assert_eq( - repr(kdf.sort_values(['b', 'a'], ascending=[False, True])), - repr(pdf.sort_values(['b', 'a'], ascending=[False, True]))) + repr(kdf.sort_values(["b", "a"], ascending=[False, True])), + repr(pdf.sort_values(["b", "a"], ascending=[False, True])), + ) - self.assertRaises(ValueError, lambda: kdf.sort_values(['b', 'a'], ascending=[False])) + self.assertRaises(ValueError, lambda: kdf.sort_values(["b", "a"], ascending=[False])) self.assert_eq( - repr(kdf.sort_values(['b', 'a'], na_position='first')), - repr(pdf.sort_values(['b', 'a'], na_position='first'))) + repr(kdf.sort_values(["b", "a"], na_position="first")), + repr(pdf.sort_values(["b", "a"], na_position="first")), + ) - self.assertRaises(ValueError, lambda: kdf.sort_values(['b', 'a'], na_position='invalid')) + self.assertRaises(ValueError, lambda: kdf.sort_values(["b", "a"], na_position="invalid")) - self.assert_eq(kdf.sort_values('b', inplace=True), pdf.sort_values('b', inplace=True)) + self.assert_eq(kdf.sort_values("b", inplace=True), pdf.sort_values("b", inplace=True)) self.assert_eq(repr(kdf), repr(pdf)) - columns = pd.MultiIndex.from_tuples([('X', 'A'), ('X', 'B')]) + columns = pd.MultiIndex.from_tuples([("X", "A"), ("X", "B")]) kdf.columns = columns self.assertRaisesRegex( ValueError, "For a multi-index, the label must be a tuple with elements", - lambda: kdf.sort_values(['X'])) + lambda: kdf.sort_values(["X"]), + ) def test_sort_index(self): - pdf = pd.DataFrame({'A': [2, 1, np.nan], 'B': [np.nan, 0, np.nan]}, - 
index=['b', 'a', np.nan]) + pdf = pd.DataFrame( + {"A": [2, 1, np.nan], "B": [np.nan, 0, np.nan]}, index=["b", "a", np.nan] + ) kdf = ks.from_pandas(pdf) # Assert invalid parameters self.assertRaises(NotImplementedError, lambda: kdf.sort_index(axis=1)) - self.assertRaises(NotImplementedError, lambda: kdf.sort_index(kind='mergesort')) - self.assertRaises(ValueError, lambda: kdf.sort_index(na_position='invalid')) + self.assertRaises(NotImplementedError, lambda: kdf.sort_index(kind="mergesort")) + self.assertRaises(ValueError, lambda: kdf.sort_index(na_position="invalid")) # Assert default behavior without parameters self.assert_eq(kdf.sort_index(), pdf.sort_index()) # Assert sorting descending self.assert_eq(kdf.sort_index(ascending=False), pdf.sort_index(ascending=False)) # Assert sorting NA indices first - self.assert_eq(kdf.sort_index(na_position='first'), pdf.sort_index(na_position='first')) + self.assert_eq(kdf.sort_index(na_position="first"), pdf.sort_index(na_position="first")) # Assert sorting inplace self.assertEqual(kdf.sort_index(inplace=True), pdf.sort_index(inplace=True)) self.assert_eq(kdf, pdf) # Assert multi-indices - pdf = pd.DataFrame({'A': range(4), 'B': range(4)[::-1]}, - index=[['b', 'b', 'a', 'a'], [1, 0, 1, 0]]) + pdf = pd.DataFrame( + {"A": range(4), "B": range(4)[::-1]}, index=[["b", "b", "a", "a"], [1, 0, 1, 0]] + ) kdf = ks.from_pandas(pdf) self.assert_eq(kdf.sort_index(), pdf.sort_index()) self.assert_eq(kdf.sort_index(level=[1, 0]), pdf.sort_index(level=[1, 0])) self.assert_eq(kdf.reset_index().sort_index(), pdf.reset_index().sort_index()) # Assert with multi-index columns - columns = pd.MultiIndex.from_tuples([('X', 'A'), ('X', 'B')]) + columns = pd.MultiIndex.from_tuples([("X", "A"), ("X", "B")]) pdf.columns = columns kdf.columns = columns self.assert_eq(kdf.sort_index(), pdf.sort_index()) def test_nlargest(self): - pdf = pd.DataFrame({'a': [1, 2, 3, 4, 5, None, 7], - 'b': [7, 6, 5, 4, 3, 2, 1]}, - index=np.random.rand(7)) + pdf = pd.DataFrame( + {"a": [1, 2, 3, 4, 5, None, 7], "b": [7, 6, 5, 4, 3, 2, 1]}, index=np.random.rand(7) + ) kdf = ks.from_pandas(pdf) - self.assert_eq(kdf.nlargest(n=5, columns='a'), pdf.nlargest(5, columns='a')) - self.assert_eq(kdf.nlargest(n=5, columns=['a', 'b']), pdf.nlargest(5, columns=['a', 'b'])) + self.assert_eq(kdf.nlargest(n=5, columns="a"), pdf.nlargest(5, columns="a")) + self.assert_eq(kdf.nlargest(n=5, columns=["a", "b"]), pdf.nlargest(5, columns=["a", "b"])) def test_nsmallest(self): - pdf = pd.DataFrame({'a': [1, 2, 3, 4, 5, None, 7], - 'b': [7, 6, 5, 4, 3, 2, 1]}, - index=np.random.rand(7)) + pdf = pd.DataFrame( + {"a": [1, 2, 3, 4, 5, None, 7], "b": [7, 6, 5, 4, 3, 2, 1]}, index=np.random.rand(7) + ) kdf = ks.from_pandas(pdf) - self.assert_eq(kdf.nsmallest(n=5, columns='a'), pdf.nsmallest(5, columns='a')) - self.assert_eq(kdf.nsmallest(n=5, columns=['a', 'b']), pdf.nsmallest(5, columns=['a', 'b'])) + self.assert_eq(kdf.nsmallest(n=5, columns="a"), pdf.nsmallest(5, columns="a")) + self.assert_eq(kdf.nsmallest(n=5, columns=["a", "b"]), pdf.nsmallest(5, columns=["a", "b"])) def test_xs(self): - d = {'num_legs': [4, 4, 2, 2], - 'num_wings': [0, 0, 2, 2], - 'class': ['mammal', 'mammal', 'mammal', 'bird'], - 'animal': ['cat', 'dog', 'bat', 'penguin'], - 'locomotion': ['walks', 'walks', 'flies', 'walks']} + d = { + "num_legs": [4, 4, 2, 2], + "num_wings": [0, 0, 2, 2], + "class": ["mammal", "mammal", "mammal", "bird"], + "animal": ["cat", "dog", "bat", "penguin"], + "locomotion": ["walks", "walks", "flies", "walks"], + } kdf = 
ks.DataFrame(data=d) - kdf = kdf.set_index(['class', 'animal', 'locomotion']) + kdf = kdf.set_index(["class", "animal", "locomotion"]) pdf = kdf.to_pandas() - self.assert_eq(kdf.xs(('mammal', 'dog', 'walks')), pdf.xs(('mammal', 'dog', 'walks'))) + self.assert_eq(kdf.xs(("mammal", "dog", "walks")), pdf.xs(("mammal", "dog", "walks"))) msg = "'key' should be string or tuple that contains strings" with self.assertRaisesRegex(ValueError, msg): kdf.xs(1) - msg = ("'key' should have index names as only strings " - "or a tuple that contain index names as only strings") + msg = ( + "'key' should have index names as only strings " + "or a tuple that contain index names as only strings" + ) with self.assertRaisesRegex(ValueError, msg): - kdf.xs(('mammal', 1)) + kdf.xs(("mammal", 1)) msg = 'axis should be either 0 or "index" currently.' with self.assertRaisesRegex(NotImplementedError, msg): - kdf.xs('num_wings', axis=1) + kdf.xs("num_wings", axis=1) msg = r"'Key length \(4\) exceeds index depth \(3\)'" with self.assertRaisesRegex(KeyError, msg): - kdf.xs(('mammal', 'dog', 'walks', 'foo')) + kdf.xs(("mammal", "dog", "walks", "foo")) def test_missing(self): kdf = self.kdf missing_functions = inspect.getmembers(_MissingPandasLikeDataFrame, inspect.isfunction) - unsupported_functions = [name for (name, type_) in missing_functions - if type_.__name__ == 'unsupported_function'] + unsupported_functions = [ + name for (name, type_) in missing_functions if type_.__name__ == "unsupported_function" + ] for name in unsupported_functions: with self.assertRaisesRegex( - PandasNotImplementedError, - "method.*DataFrame.*{}.*not implemented( yet\\.|\\. .+)".format(name)): + PandasNotImplementedError, + "method.*DataFrame.*{}.*not implemented( yet\\.|\\. .+)".format(name), + ): getattr(kdf, name)() - deprecated_functions = [name for (name, type_) in missing_functions - if type_.__name__ == 'deprecated_function'] + deprecated_functions = [ + name for (name, type_) in missing_functions if type_.__name__ == "deprecated_function" + ] for name in deprecated_functions: - with self.assertRaisesRegex(PandasNotImplementedError, - "method.*DataFrame.*{}.*is deprecated".format(name)): + with self.assertRaisesRegex( + PandasNotImplementedError, "method.*DataFrame.*{}.*is deprecated".format(name) + ): getattr(kdf, name)() - missing_properties = inspect.getmembers(_MissingPandasLikeDataFrame, - lambda o: isinstance(o, property)) - unsupported_properties = [name for (name, type_) in missing_properties - if type_.fget.__name__ == 'unsupported_property'] + missing_properties = inspect.getmembers( + _MissingPandasLikeDataFrame, lambda o: isinstance(o, property) + ) + unsupported_properties = [ + name + for (name, type_) in missing_properties + if type_.fget.__name__ == "unsupported_property" + ] for name in unsupported_properties: with self.assertRaisesRegex( - PandasNotImplementedError, - "property.*DataFrame.*{}.*not implemented( yet\\.|\\. .+)".format(name)): + PandasNotImplementedError, + "property.*DataFrame.*{}.*not implemented( yet\\.|\\. 
.+)".format(name), + ): getattr(kdf, name) - deprecated_properties = [name for (name, type_) in missing_properties - if type_.fget.__name__ == 'deprecated_property'] + deprecated_properties = [ + name + for (name, type_) in missing_properties + if type_.fget.__name__ == "deprecated_property" + ] for name in deprecated_properties: - with self.assertRaisesRegex(PandasNotImplementedError, - "property.*DataFrame.*{}.*is deprecated".format(name)): + with self.assertRaisesRegex( + PandasNotImplementedError, "property.*DataFrame.*{}.*is deprecated".format(name) + ): getattr(kdf, name) def test_values_property(self): kdf = self.kdf - msg = ("Koalas does not support the 'values' property. If you want to collect your data " + - "as an NumPy array, use 'to_numpy()' instead.") + msg = ( + "Koalas does not support the 'values' property. If you want to collect your data " + + "as an NumPy array, use 'to_numpy()' instead." + ) with self.assertRaises(NotImplementedError, msg=msg): kdf.values def test_to_numpy(self): - pdf = pd.DataFrame({'a': [4, 2, 3, 4, 8, 6], - 'b': [1, 2, 9, 4, 2, 4], - 'c': ["one", "three", "six", "seven", "one", "5"]}, - index=np.random.rand(6)) + pdf = pd.DataFrame( + { + "a": [4, 2, 3, 4, 8, 6], + "b": [1, 2, 9, 4, 2, 4], + "c": ["one", "three", "six", "seven", "one", "5"], + }, + index=np.random.rand(6), + ) kdf = ks.from_pandas(pdf) @@ -887,15 +983,21 @@ def test_to_pandas(self): self.assert_eq(kdf.to_pandas(), pdf) def test_isin(self): - pdf = pd.DataFrame({'a': [4, 2, 3, 4, 8, 6], - 'b': [1, 2, 9, 4, 2, 4], - 'c': ["one", "three", "six", "seven", "one", "5"]}, - index=np.random.rand(6)) + pdf = pd.DataFrame( + { + "a": [4, 2, 3, 4, 8, 6], + "b": [1, 2, 9, 4, 2, 4], + "c": ["one", "three", "six", "seven", "one", "5"], + }, + index=np.random.rand(6), + ) kdf = ks.from_pandas(pdf) - self.assert_eq(kdf.isin([4, 'six']), pdf.isin([4, 'six'])) - self.assert_eq(kdf.isin({"a": [2, 8], "c": ['three', "one"]}), - pdf.isin({"a": [2, 8], "c": ['three', "one"]})) + self.assert_eq(kdf.isin([4, "six"]), pdf.isin([4, "six"])) + self.assert_eq( + kdf.isin({"a": [2, 8], "c": ["three", "one"]}), + pdf.isin({"a": [2, 8], "c": ["three", "one"]}), + ) msg = "'DataFrame' object has no attribute {'e'}" with self.assertRaisesRegex(AttributeError, msg): @@ -910,15 +1012,23 @@ def test_isin(self): kdf.isin(1) def test_merge(self): - left_pdf = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo', 'bar', 'l'], - 'value': [1, 2, 3, 5, 6, 7], - 'x': list('abcdef')}, - columns=['lkey', 'value', 'x']) - right_pdf = pd.DataFrame({'rkey': ['baz', 'foo', 'bar', 'baz', 'foo', 'r'], - 'value': [4, 5, 6, 7, 8, 9], - 'y': list('efghij')}, - columns=['rkey', 'value', 'y']) - right_ps = pd.Series(list('defghi'), name='x', index=[5, 6, 7, 8, 9, 10]) + left_pdf = pd.DataFrame( + { + "lkey": ["foo", "bar", "baz", "foo", "bar", "l"], + "value": [1, 2, 3, 5, 6, 7], + "x": list("abcdef"), + }, + columns=["lkey", "value", "x"], + ) + right_pdf = pd.DataFrame( + { + "rkey": ["baz", "foo", "bar", "baz", "foo", "r"], + "value": [4, 5, 6, 7, 8, 9], + "y": list("efghij"), + }, + columns=["rkey", "value", "y"], + ) + right_ps = pd.Series(list("defghi"), name="x", index=[5, 6, 7, 8, 9, 10]) left_kdf = ks.from_pandas(left_pdf) right_kdf = ks.from_pandas(right_pdf) @@ -935,72 +1045,117 @@ def check(op, right_kdf=right_kdf, right_pdf=right_pdf): self.assert_eq(k_res, p_res) check(lambda left, right: left.merge(right)) - check(lambda left, right: left.merge(right, on='value')) - check(lambda left, right: left.merge(right, left_on='lkey', 
right_on='rkey')) - check(lambda left, right: left.set_index('lkey').merge(right.set_index('rkey'))) - check(lambda left, right: left.set_index('lkey').merge(right, - left_index=True, right_on='rkey')) - check(lambda left, right: left.merge(right.set_index('rkey'), - left_on='lkey', right_index=True)) - check(lambda left, right: left.set_index('lkey').merge(right.set_index('rkey'), - left_index=True, right_index=True)) + check(lambda left, right: left.merge(right, on="value")) + check(lambda left, right: left.merge(right, left_on="lkey", right_on="rkey")) + check(lambda left, right: left.set_index("lkey").merge(right.set_index("rkey"))) + check( + lambda left, right: left.set_index("lkey").merge( + right, left_index=True, right_on="rkey" + ) + ) + check( + lambda left, right: left.merge( + right.set_index("rkey"), left_on="lkey", right_index=True + ) + ) + check( + lambda left, right: left.set_index("lkey").merge( + right.set_index("rkey"), left_index=True, right_index=True + ) + ) # MultiIndex - check(lambda left, right: left.merge(right, - left_on=['lkey', 'value'], right_on=['rkey', 'value'])) - check(lambda left, right: left.set_index(['lkey', 'value']) - .merge(right, left_index=True, right_on=['rkey', 'value'])) - check(lambda left, right: left.merge( - right.set_index(['rkey', 'value']), left_on=['lkey', 'value'], right_index=True)) + check( + lambda left, right: left.merge( + right, left_on=["lkey", "value"], right_on=["rkey", "value"] + ) + ) + check( + lambda left, right: left.set_index(["lkey", "value"]).merge( + right, left_index=True, right_on=["rkey", "value"] + ) + ) + check( + lambda left, right: left.merge( + right.set_index(["rkey", "value"]), left_on=["lkey", "value"], right_index=True + ) + ) # TODO: when both left_index=True and right_index=True with multi-index # check(lambda left, right: left.set_index(['lkey', 'value']).merge( # right.set_index(['rkey', 'value']), left_index=True, right_index=True)) # join types - for how in ['inner', 'left', 'right', 'outer']: - check(lambda left, right: left.merge(right, on='value', how=how)) - check(lambda left, right: left.merge(right, left_on='lkey', right_on='rkey', how=how)) + for how in ["inner", "left", "right", "outer"]: + check(lambda left, right: left.merge(right, on="value", how=how)) + check(lambda left, right: left.merge(right, left_on="lkey", right_on="rkey", how=how)) # suffix - check(lambda left, right: left.merge(right, left_on='lkey', right_on='rkey', - suffixes=['_left', '_right'])) + check( + lambda left, right: left.merge( + right, left_on="lkey", right_on="rkey", suffixes=["_left", "_right"] + ) + ) # Test Series on the right # pd.DataFrame.merge with Series is implemented since version 0.24.0 if LooseVersion(pd.__version__) >= LooseVersion("0.24.0"): check(lambda left, right: left.merge(right), right_kser, right_ps) - check(lambda left, right: left.merge(right, left_on='x', right_on='x'), - right_kser, right_ps) - check(lambda left, right: left.set_index('x').merge(right, left_index=True, - right_on='x'), right_kser, right_ps) + check( + lambda left, right: left.merge(right, left_on="x", right_on="x"), + right_kser, + right_ps, + ) + check( + lambda left, right: left.set_index("x").merge(right, left_index=True, right_on="x"), + right_kser, + right_ps, + ) # Test join types with Series - for how in ['inner', 'left', 'right', 'outer']: + for how in ["inner", "left", "right", "outer"]: check(lambda left, right: left.merge(right, how=how), right_kser, right_ps) - check(lambda left, right: left.merge(right, 
left_on='x', right_on='x', how=how), - right_kser, right_ps) + check( + lambda left, right: left.merge(right, left_on="x", right_on="x", how=how), + right_kser, + right_ps, + ) # suffix with Series - check(lambda left, right: left.merge(right, suffixes=['_left', '_right'], how='outer', - left_index=True, right_index=True), - right_kser, right_ps) + check( + lambda left, right: left.merge( + right, + suffixes=["_left", "_right"], + how="outer", + left_index=True, + right_index=True, + ), + right_kser, + right_ps, + ) # multi-index columns - left_columns = pd.MultiIndex.from_tuples([('a', 'lkey'), ('a', 'value'), ('b', 'x')]) + left_columns = pd.MultiIndex.from_tuples([("a", "lkey"), ("a", "value"), ("b", "x")]) left_pdf.columns = left_columns left_kdf.columns = left_columns - right_columns = pd.MultiIndex.from_tuples([('a', 'rkey'), ('a', 'value'), ('c', 'y')]) + right_columns = pd.MultiIndex.from_tuples([("a", "rkey"), ("a", "value"), ("c", "y")]) right_pdf.columns = right_columns right_kdf.columns = right_columns check(lambda left, right: left.merge(right)) - check(lambda left, right: left.merge(right, on=[('a', 'value')])) - check(lambda left, right: (left.set_index(('a', 'lkey')) - .merge(right.set_index(('a', 'rkey'))))) - check(lambda left, right: (left.set_index(('a', 'lkey')) - .merge(right.set_index(('a', 'rkey')), - left_index=True, right_index=True))) + check(lambda left, right: left.merge(right, on=[("a", "value")])) + check( + lambda left, right: ( + left.set_index(("a", "lkey")).merge(right.set_index(("a", "rkey"))) + ) + ) + check( + lambda left, right: ( + left.set_index(("a", "lkey")).merge( + right.set_index(("a", "rkey")), left_index=True, right_index=True + ) + ) + ) # TODO: when both left_index=True and right_index=True with multi-index columns # check(lambda left, right: left.merge(right, # left_on=[('a', 'lkey')], right_on=[('a', 'rkey')])) @@ -1008,23 +1163,31 @@ def check(op, right_kdf=right_kdf, right_pdf=right_pdf): # .merge(right, left_index=True, right_on=[('a', 'rkey')]))) def test_merge_retains_indices(self): - left_pdf = pd.DataFrame({'A': [0, 1]}) - right_pdf = pd.DataFrame({'B': [1, 2]}, index=[1, 2]) + left_pdf = pd.DataFrame({"A": [0, 1]}) + right_pdf = pd.DataFrame({"B": [1, 2]}, index=[1, 2]) left_kdf = ks.from_pandas(left_pdf) right_kdf = ks.from_pandas(right_pdf) - self.assert_eq(left_kdf.merge(right_kdf, left_index=True, right_index=True), - left_pdf.merge(right_pdf, left_index=True, right_index=True)) - self.assert_eq(left_kdf.merge(right_kdf, left_on='A', right_index=True), - left_pdf.merge(right_pdf, left_on='A', right_index=True)) - self.assert_eq(left_kdf.merge(right_kdf, left_index=True, right_on='B'), - left_pdf.merge(right_pdf, left_index=True, right_on='B')) - self.assert_eq(left_kdf.merge(right_kdf, left_on='A', right_on='B'), - left_pdf.merge(right_pdf, left_on='A', right_on='B')) + self.assert_eq( + left_kdf.merge(right_kdf, left_index=True, right_index=True), + left_pdf.merge(right_pdf, left_index=True, right_index=True), + ) + self.assert_eq( + left_kdf.merge(right_kdf, left_on="A", right_index=True), + left_pdf.merge(right_pdf, left_on="A", right_index=True), + ) + self.assert_eq( + left_kdf.merge(right_kdf, left_index=True, right_on="B"), + left_pdf.merge(right_pdf, left_index=True, right_on="B"), + ) + self.assert_eq( + left_kdf.merge(right_kdf, left_on="A", right_on="B"), + left_pdf.merge(right_pdf, left_on="A", right_on="B"), + ) def test_merge_how_parameter(self): - left_pdf = pd.DataFrame({'A': [1, 2]}) - right_pdf = 
pd.DataFrame({'B': ['x', 'y']}, index=[1, 2]) + left_pdf = pd.DataFrame({"A": [1, 2]}) + right_pdf = pd.DataFrame({"B": ["x", "y"]}, index=[1, 2]) left_kdf = ks.from_pandas(left_pdf) right_kdf = ks.from_pandas(right_pdf) @@ -1032,79 +1195,80 @@ def test_merge_how_parameter(self): pdf = left_pdf.merge(right_pdf, left_index=True, right_index=True) self.assert_eq( kdf.sort_values(by=list(kdf.columns)).reset_index(drop=True), - pdf.sort_values(by=list(pdf.columns)).reset_index(drop=True)) + pdf.sort_values(by=list(pdf.columns)).reset_index(drop=True), + ) - kdf = left_kdf.merge(right_kdf, left_index=True, right_index=True, how='left') - pdf = left_pdf.merge(right_pdf, left_index=True, right_index=True, how='left') + kdf = left_kdf.merge(right_kdf, left_index=True, right_index=True, how="left") + pdf = left_pdf.merge(right_pdf, left_index=True, right_index=True, how="left") self.assert_eq( kdf.sort_values(by=list(kdf.columns)).reset_index(drop=True), - pdf.sort_values(by=list(pdf.columns)).reset_index(drop=True)) + pdf.sort_values(by=list(pdf.columns)).reset_index(drop=True), + ) - kdf = left_kdf.merge(right_kdf, left_index=True, right_index=True, how='right') - pdf = left_pdf.merge(right_pdf, left_index=True, right_index=True, how='right') + kdf = left_kdf.merge(right_kdf, left_index=True, right_index=True, how="right") + pdf = left_pdf.merge(right_pdf, left_index=True, right_index=True, how="right") self.assert_eq( kdf.sort_values(by=list(kdf.columns)).reset_index(drop=True), - pdf.sort_values(by=list(pdf.columns)).reset_index(drop=True)) + pdf.sort_values(by=list(pdf.columns)).reset_index(drop=True), + ) - kdf = left_kdf.merge(right_kdf, left_index=True, right_index=True, how='outer') - pdf = left_pdf.merge(right_pdf, left_index=True, right_index=True, how='outer') + kdf = left_kdf.merge(right_kdf, left_index=True, right_index=True, how="outer") + pdf = left_pdf.merge(right_pdf, left_index=True, right_index=True, how="outer") self.assert_eq( kdf.sort_values(by=list(kdf.columns)).reset_index(drop=True), - pdf.sort_values(by=list(pdf.columns)).reset_index(drop=True)) + pdf.sort_values(by=list(pdf.columns)).reset_index(drop=True), + ) def test_merge_raises(self): - left = ks.DataFrame({'value': [1, 2, 3, 5, 6], - 'x': list('abcde')}, - columns=['value', 'x'], - index=['foo', 'bar', 'baz', 'foo', 'bar']) - right = ks.DataFrame({'value': [4, 5, 6, 7, 8], - 'y': list('fghij')}, - columns=['value', 'y'], - index=['baz', 'foo', 'bar', 'baz', 'foo']) - - with self.assertRaisesRegex(ValueError, - 'No common columns to perform merge on'): - left[['x']].merge(right[['y']]) - - with self.assertRaisesRegex(ValueError, - 'not a combination of both'): - left.merge(right, on='value', left_on='x') - - with self.assertRaisesRegex(ValueError, - 'Must pass right_on or right_index=True'): - left.merge(right, left_on='x') - - with self.assertRaisesRegex(ValueError, - 'Must pass right_on or right_index=True'): + left = ks.DataFrame( + {"value": [1, 2, 3, 5, 6], "x": list("abcde")}, + columns=["value", "x"], + index=["foo", "bar", "baz", "foo", "bar"], + ) + right = ks.DataFrame( + {"value": [4, 5, 6, 7, 8], "y": list("fghij")}, + columns=["value", "y"], + index=["baz", "foo", "bar", "baz", "foo"], + ) + + with self.assertRaisesRegex(ValueError, "No common columns to perform merge on"): + left[["x"]].merge(right[["y"]]) + + with self.assertRaisesRegex(ValueError, "not a combination of both"): + left.merge(right, on="value", left_on="x") + + with self.assertRaisesRegex(ValueError, "Must pass right_on or 
right_index=True"): + left.merge(right, left_on="x") + + with self.assertRaisesRegex(ValueError, "Must pass right_on or right_index=True"): left.merge(right, left_index=True) - with self.assertRaisesRegex(ValueError, - 'Must pass left_on or left_index=True'): - left.merge(right, right_on='y') + with self.assertRaisesRegex(ValueError, "Must pass left_on or left_index=True"): + left.merge(right, right_on="y") - with self.assertRaisesRegex(ValueError, - 'Must pass left_on or left_index=True'): + with self.assertRaisesRegex(ValueError, "Must pass left_on or left_index=True"): left.merge(right, right_index=True) - with self.assertRaisesRegex(ValueError, - 'len\\(left_keys\\) must equal len\\(right_keys\\)'): - left.merge(right, left_on='value', right_on=['value', 'y']) + with self.assertRaisesRegex( + ValueError, "len\\(left_keys\\) must equal len\\(right_keys\\)" + ): + left.merge(right, left_on="value", right_on=["value", "y"]) - with self.assertRaisesRegex(ValueError, - 'len\\(left_keys\\) must equal len\\(right_keys\\)'): - left.merge(right, left_on=['value', 'x'], right_on='value') + with self.assertRaisesRegex( + ValueError, "len\\(left_keys\\) must equal len\\(right_keys\\)" + ): + left.merge(right, left_on=["value", "x"], right_on="value") - with self.assertRaisesRegex(ValueError, - "['inner', 'left', 'right', 'full', 'outer']"): - left.merge(right, left_index=True, right_index=True, how='foo') + with self.assertRaisesRegex(ValueError, "['inner', 'left', 'right', 'full', 'outer']"): + left.merge(right, left_index=True, right_index=True, how="foo") - with self.assertRaisesRegex(KeyError, 'id'): - left.merge(right, on='id') + with self.assertRaisesRegex(KeyError, "id"): + left.merge(right, on="id") def test_append(self): - pdf = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB')) + pdf = pd.DataFrame([[1, 2], [3, 4]], columns=list("AB")) kdf = ks.from_pandas(pdf) - other_pdf = pd.DataFrame([[3, 4], [5, 6]], columns=list('BC'), index=[2, 3]) + other_pdf = pd.DataFrame([[3, 4], [5, 6]], columns=list("BC"), index=[2, 3]) other_kdf = ks.from_pandas(other_pdf) self.assert_eq(kdf.append(kdf), pdf.append(pdf)) @@ -1116,7 +1280,7 @@ def test_append(self): # Assert appending a Series fails msg = "DataFrames.append() does not support appending Series to DataFrames" with self.assertRaises(ValueError, msg=msg): - kdf.append(kdf['A']) + kdf.append(kdf["A"]) # Assert using the sort parameter raises an exception msg = "The 'sort' parameter is currently not supported" @@ -1124,42 +1288,51 @@ def test_append(self): kdf.append(kdf, sort=True) # Assert using 'verify_integrity' only raises an exception for overlapping indices - self.assert_eq(kdf.append(other_kdf, verify_integrity=True), - pdf.append(other_pdf, verify_integrity=True)) + self.assert_eq( + kdf.append(other_kdf, verify_integrity=True), + pdf.append(other_pdf, verify_integrity=True), + ) msg = "Indices have overlapping values" with self.assertRaises(ValueError, msg=msg): kdf.append(kdf, verify_integrity=True) # Skip integrity verification when ignore_index=True - self.assert_eq(kdf.append(kdf, ignore_index=True, verify_integrity=True), - pdf.append(pdf, ignore_index=True, verify_integrity=True)) + self.assert_eq( + kdf.append(kdf, ignore_index=True, verify_integrity=True), + pdf.append(pdf, ignore_index=True, verify_integrity=True), + ) # Assert appending multi-index DataFrames - multi_index_pdf = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB'), - index=[[2, 3], [4, 5]]) + multi_index_pdf = pd.DataFrame([[1, 2], [3, 4]], columns=list("AB"), 
index=[[2, 3], [4, 5]]) multi_index_kdf = ks.from_pandas(multi_index_pdf) - other_multi_index_pdf = pd.DataFrame([[5, 6], [7, 8]], columns=list('AB'), - index=[[2, 3], [6, 7]]) + other_multi_index_pdf = pd.DataFrame( + [[5, 6], [7, 8]], columns=list("AB"), index=[[2, 3], [6, 7]] + ) other_multi_index_kdf = ks.from_pandas(other_multi_index_pdf) - self.assert_eq(multi_index_kdf.append(multi_index_kdf), - multi_index_pdf.append(multi_index_pdf)) + self.assert_eq( + multi_index_kdf.append(multi_index_kdf), multi_index_pdf.append(multi_index_pdf) + ) # Assert DataFrames with non-matching columns - self.assert_eq(multi_index_kdf.append(other_multi_index_kdf), - multi_index_pdf.append(other_multi_index_pdf)) + self.assert_eq( + multi_index_kdf.append(other_multi_index_kdf), + multi_index_pdf.append(other_multi_index_pdf), + ) # Assert using 'verify_integrity' only raises an exception for overlapping indices - self.assert_eq(multi_index_kdf.append(other_multi_index_kdf, verify_integrity=True), - multi_index_pdf.append(other_multi_index_pdf, verify_integrity=True)) + self.assert_eq( + multi_index_kdf.append(other_multi_index_kdf, verify_integrity=True), + multi_index_pdf.append(other_multi_index_pdf, verify_integrity=True), + ) with self.assertRaises(ValueError, msg=msg): multi_index_kdf.append(multi_index_kdf, verify_integrity=True) # Skip integrity verification when ignore_index=True - self.assert_eq(multi_index_kdf.append(multi_index_kdf, - ignore_index=True, verify_integrity=True), - multi_index_pdf.append(multi_index_pdf, - ignore_index=True, verify_integrity=True)) + self.assert_eq( + multi_index_kdf.append(multi_index_kdf, ignore_index=True, verify_integrity=True), + multi_index_pdf.append(multi_index_pdf, ignore_index=True, verify_integrity=True), + ) # Assert trying to append DataFrames with different index levels msg = "Both DataFrames have to have the same number of index levels" @@ -1167,18 +1340,21 @@ def test_append(self): kdf.append(multi_index_kdf) # Skip index level check when ignore_index=True - self.assert_eq(kdf.append(multi_index_kdf, ignore_index=True), - pdf.append(multi_index_pdf, ignore_index=True)) + self.assert_eq( + kdf.append(multi_index_kdf, ignore_index=True), + pdf.append(multi_index_pdf, ignore_index=True), + ) - columns = pd.MultiIndex.from_tuples([('A', 'X'), ('A', 'Y')]) + columns = pd.MultiIndex.from_tuples([("A", "X"), ("A", "Y")]) pdf.columns = columns kdf.columns = columns self.assert_eq(kdf.append(kdf), pdf.append(pdf)) def test_clip(self): - pdf = pd.DataFrame({'A': [0, 2, 4], 'B': [4, 2, 0], 'X': [-1, 10, 0]}, - index=np.random.rand(3)) + pdf = pd.DataFrame( + {"A": [0, 2, 4], "B": [4, 2, 0], "X": [-1, 10, 0]}, index=np.random.rand(3) + ) kdf = ks.from_pandas(pdf) # Assert list-like values are not accepted for 'lower' and 'upper' @@ -1198,31 +1374,33 @@ def test_clip(self): self.assert_eq(kdf.clip(1, 3), pdf.clip(1, 3)) # Assert behavior on string values - str_kdf = ks.DataFrame({'A': ['a', 'b', 'c']}, index=np.random.rand(3)) + str_kdf = ks.DataFrame({"A": ["a", "b", "c"]}, index=np.random.rand(3)) self.assert_eq(str_kdf.clip(1, 3), str_kdf) def test_binary_operators(self): self.assertRaisesRegex( ValueError, - 'it comes from a different dataframe', - lambda: ks.range(10).add(ks.range(10))) + "it comes from a different dataframe", + lambda: ks.range(10).add(ks.range(10)), + ) self.assertRaisesRegex( ValueError, - 'add with a sequence is currently not supported', - lambda: ks.range(10).add(ks.range(10).id)) + "add with a sequence is currently not 
supported", + lambda: ks.range(10).add(ks.range(10).id), + ) def test_sample(self): - pdf = pd.DataFrame({'A': [0, 2, 4]}) + pdf = pd.DataFrame({"A": [0, 2, 4]}) kdf = ks.from_pandas(pdf) # Make sure the tests run, but we can't check the result because they are non-deterministic. kdf.sample(frac=0.1) kdf.sample(frac=0.2, replace=True) kdf.sample(frac=0.2, random_state=5) - kdf['A'].sample(frac=0.2) - kdf['A'].sample(frac=0.2, replace=True) - kdf['A'].sample(frac=0.2, random_state=5) + kdf["A"].sample(frac=0.2) + kdf["A"].sample(frac=0.2, replace=True) + kdf["A"].sample(frac=0.2, random_state=5) with self.assertRaises(ValueError): kdf.sample() @@ -1230,172 +1408,188 @@ def test_sample(self): kdf.sample(n=1) def test_add_prefix(self): - pdf = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]}, index=np.random.rand(4)) + pdf = pd.DataFrame({"A": [1, 2, 3, 4], "B": [3, 4, 5, 6]}, index=np.random.rand(4)) kdf = ks.from_pandas(pdf) - self.assert_eq(pdf.add_prefix('col_'), kdf.add_prefix('col_')) + self.assert_eq(pdf.add_prefix("col_"), kdf.add_prefix("col_")) - columns = pd.MultiIndex.from_tuples([('X', 'A'), ('X', 'B')]) + columns = pd.MultiIndex.from_tuples([("X", "A"), ("X", "B")]) pdf.columns = columns kdf.columns = columns - self.assert_eq(pdf.add_prefix('col_'), kdf.add_prefix('col_')) + self.assert_eq(pdf.add_prefix("col_"), kdf.add_prefix("col_")) def test_add_suffix(self): - pdf = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]}, index=np.random.rand(4)) + pdf = pd.DataFrame({"A": [1, 2, 3, 4], "B": [3, 4, 5, 6]}, index=np.random.rand(4)) kdf = ks.from_pandas(pdf) - self.assert_eq(pdf.add_suffix('_col'), kdf.add_suffix('_col')) + self.assert_eq(pdf.add_suffix("_col"), kdf.add_suffix("_col")) - columns = pd.MultiIndex.from_tuples([('X', 'A'), ('X', 'B')]) + columns = pd.MultiIndex.from_tuples([("X", "A"), ("X", "B")]) pdf.columns = columns kdf.columns = columns - self.assert_eq(pdf.add_suffix('_col'), kdf.add_suffix('_col')) + self.assert_eq(pdf.add_suffix("_col"), kdf.add_suffix("_col")) def test_join(self): # check basic function - pdf1 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'], - 'A': ['A0', 'A1', 'A2', 'A3']}, columns=['key', 'A']) - pdf2 = pd.DataFrame({'key': ['K0', 'K1', 'K2'], - 'B': ['B0', 'B1', 'B2']}, columns=['key', 'B']) - kdf1 = ks.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'], - 'A': ['A0', 'A1', 'A2', 'A3']}, columns=['key', 'A']) - kdf2 = ks.DataFrame({'key': ['K0', 'K1', 'K2'], - 'B': ['B0', 'B1', 'B2']}, columns=['key', 'B']) - ks1 = ks.Series(['A1', 'A5'], index=[1, 2], name='A') - join_pdf = pdf1.join(pdf2, lsuffix='_left', rsuffix='_right') + pdf1 = pd.DataFrame( + {"key": ["K0", "K1", "K2", "K3"], "A": ["A0", "A1", "A2", "A3"]}, columns=["key", "A"] + ) + pdf2 = pd.DataFrame( + {"key": ["K0", "K1", "K2"], "B": ["B0", "B1", "B2"]}, columns=["key", "B"] + ) + kdf1 = ks.DataFrame( + {"key": ["K0", "K1", "K2", "K3"], "A": ["A0", "A1", "A2", "A3"]}, columns=["key", "A"] + ) + kdf2 = ks.DataFrame( + {"key": ["K0", "K1", "K2"], "B": ["B0", "B1", "B2"]}, columns=["key", "B"] + ) + ks1 = ks.Series(["A1", "A5"], index=[1, 2], name="A") + join_pdf = pdf1.join(pdf2, lsuffix="_left", rsuffix="_right") join_pdf.sort_values(by=list(join_pdf.columns), inplace=True) - join_kdf = kdf1.join(kdf2, lsuffix='_left', rsuffix='_right') + join_kdf = kdf1.join(kdf2, lsuffix="_left", rsuffix="_right") join_kdf.sort_values(by=list(join_kdf.columns), inplace=True) self.assert_eq(join_pdf, join_kdf) # join with duplicated columns in Series - with self.assertRaisesRegex(ValueError, - 
"columns overlap but no suffix specified"): - kdf1.join(ks1, how='outer') + with self.assertRaisesRegex(ValueError, "columns overlap but no suffix specified"): + kdf1.join(ks1, how="outer") # join with duplicated columns in DataFrame - with self.assertRaisesRegex(ValueError, - "columns overlap but no suffix specified"): - kdf1.join(kdf2, how='outer') + with self.assertRaisesRegex(ValueError, "columns overlap but no suffix specified"): + kdf1.join(kdf2, how="outer") # check `on` parameter - join_pdf = pdf1.join(pdf2.set_index('key'), on='key', lsuffix='_left', rsuffix='_right') + join_pdf = pdf1.join(pdf2.set_index("key"), on="key", lsuffix="_left", rsuffix="_right") join_pdf.sort_values(by=list(join_pdf.columns), inplace=True) - join_kdf = kdf1.join(kdf2.set_index('key'), on='key', lsuffix='_left', rsuffix='_right') + join_kdf = kdf1.join(kdf2.set_index("key"), on="key", lsuffix="_left", rsuffix="_right") join_kdf.sort_values(by=list(join_kdf.columns), inplace=True) self.assert_eq(join_pdf.reset_index(drop=True), join_kdf.reset_index(drop=True)) # multi-index columns - columns1 = pd.MultiIndex.from_tuples([('x', 'key'), ('Y', 'A')]) - columns2 = pd.MultiIndex.from_tuples([('x', 'key'), ('Y', 'B')]) + columns1 = pd.MultiIndex.from_tuples([("x", "key"), ("Y", "A")]) + columns2 = pd.MultiIndex.from_tuples([("x", "key"), ("Y", "B")]) pdf1.columns = columns1 pdf2.columns = columns2 kdf1.columns = columns1 kdf2.columns = columns2 - join_pdf = pdf1.join(pdf2, lsuffix='_left', rsuffix='_right') + join_pdf = pdf1.join(pdf2, lsuffix="_left", rsuffix="_right") join_pdf.sort_values(by=list(join_pdf.columns), inplace=True) - join_kdf = kdf1.join(kdf2, lsuffix='_left', rsuffix='_right') + join_kdf = kdf1.join(kdf2, lsuffix="_left", rsuffix="_right") join_kdf.sort_values(by=list(join_kdf.columns), inplace=True) self.assert_eq(join_pdf, join_kdf) # check `on` parameter - join_pdf = pdf1.join(pdf2.set_index(('x', 'key')), on=[('x', 'key')], - lsuffix='_left', rsuffix='_right') + join_pdf = pdf1.join( + pdf2.set_index(("x", "key")), on=[("x", "key")], lsuffix="_left", rsuffix="_right" + ) join_pdf.sort_values(by=list(join_pdf.columns), inplace=True) - join_kdf = kdf1.join(kdf2.set_index(('x', 'key')), on=[('x', 'key')], - lsuffix='_left', rsuffix='_right') + join_kdf = kdf1.join( + kdf2.set_index(("x", "key")), on=[("x", "key")], lsuffix="_left", rsuffix="_right" + ) join_kdf.sort_values(by=list(join_kdf.columns), inplace=True) self.assert_eq(join_pdf.reset_index(drop=True), join_kdf.reset_index(drop=True)) def test_replace(self): - pdf = pd.DataFrame({"name": ['Ironman', 'Captain America', 'Thor', 'Hulk'], - "weapon": ['Mark-45', 'Shield', 'Mjolnir', 'Smash']}, - index=np.random.rand(4)) + pdf = pd.DataFrame( + { + "name": ["Ironman", "Captain America", "Thor", "Hulk"], + "weapon": ["Mark-45", "Shield", "Mjolnir", "Smash"], + }, + index=np.random.rand(4), + ) kdf = ks.from_pandas(pdf) - with self.assertRaisesRegex(NotImplementedError, - "replace currently works only for method='pad"): - kdf.replace(method='bfill') - with self.assertRaisesRegex(NotImplementedError, - "replace currently works only when limit=None"): + with self.assertRaisesRegex( + NotImplementedError, "replace currently works only for method='pad" + ): + kdf.replace(method="bfill") + with self.assertRaisesRegex( + NotImplementedError, "replace currently works only when limit=None" + ): kdf.replace(limit=10) - with self.assertRaisesRegex(NotImplementedError, - "replace currently doesn't supports regex"): - kdf.replace(regex='') + with 
self.assertRaisesRegex( + NotImplementedError, "replace currently doesn't supports regex" + ): + kdf.replace(regex="") with self.assertRaisesRegex(TypeError, "Unsupported type "): kdf.replace(value=(1, 2, 3)) with self.assertRaisesRegex(TypeError, "Unsupported type "): kdf.replace(to_replace=(1, 2, 3)) - with self.assertRaisesRegex(ValueError, 'Length of to_replace and value must be same'): - kdf.replace(to_replace=['Ironman'], value=['Spiderman', 'Doctor Strange']) + with self.assertRaisesRegex(ValueError, "Length of to_replace and value must be same"): + kdf.replace(to_replace=["Ironman"], value=["Spiderman", "Doctor Strange"]) - self.assert_eq(kdf.replace('Ironman', 'Spiderman'), pdf.replace('Ironman', 'Spiderman')) + self.assert_eq(kdf.replace("Ironman", "Spiderman"), pdf.replace("Ironman", "Spiderman")) self.assert_eq( - kdf.replace(['Ironman', 'Captain America'], ['Rescue', 'Hawkeye']), - pdf.replace(['Ironman', 'Captain America'], ['Rescue', 'Hawkeye']) + kdf.replace(["Ironman", "Captain America"], ["Rescue", "Hawkeye"]), + pdf.replace(["Ironman", "Captain America"], ["Rescue", "Hawkeye"]), ) - pdf = pd.DataFrame({'A': [0, 1, 2, 3, 4], - 'B': [5, 6, 7, 8, 9], - 'C': ['a', 'b', 'c', 'd', 'e']}, - index=np.random.rand(5)) + pdf = pd.DataFrame( + {"A": [0, 1, 2, 3, 4], "B": [5, 6, 7, 8, 9], "C": ["a", "b", "c", "d", "e"]}, + index=np.random.rand(5), + ) kdf = ks.from_pandas(pdf) - self.assert_eq(kdf.replace([0, 1, 2, 3, 5, 6], 4), - pdf.replace([0, 1, 2, 3, 5, 6], 4)) + self.assert_eq(kdf.replace([0, 1, 2, 3, 5, 6], 4), pdf.replace([0, 1, 2, 3, 5, 6], 4)) - self.assert_eq(kdf.replace([0, 1, 2, 3, 5, 6], [6, 5, 4, 3, 2, 1]), - pdf.replace([0, 1, 2, 3, 5, 6], [6, 5, 4, 3, 2, 1])) + self.assert_eq( + kdf.replace([0, 1, 2, 3, 5, 6], [6, 5, 4, 3, 2, 1]), + pdf.replace([0, 1, 2, 3, 5, 6], [6, 5, 4, 3, 2, 1]), + ) - self.assert_eq(kdf.replace({0: 10, 1: 100, 7: 200}), - pdf.replace({0: 10, 1: 100, 7: 200})) + self.assert_eq(kdf.replace({0: 10, 1: 100, 7: 200}), pdf.replace({0: 10, 1: 100, 7: 200})) - self.assert_eq(kdf.replace({'A': 0, 'B': 5}, 100), - pdf.replace({'A': 0, 'B': 5}, 100)) + self.assert_eq(kdf.replace({"A": 0, "B": 5}, 100), pdf.replace({"A": 0, "B": 5}, 100)) - self.assert_eq(kdf.replace({'A': {0: 100, 4: 400}}), - pdf.replace({'A': {0: 100, 4: 400}})) + self.assert_eq(kdf.replace({"A": {0: 100, 4: 400}}), pdf.replace({"A": {0: 100, 4: 400}})) # multi-index columns - columns = pd.MultiIndex.from_tuples([('X', 'A'), ('X', 'B'), ('Y', 'C')]) + columns = pd.MultiIndex.from_tuples([("X", "A"), ("X", "B"), ("Y", "C")]) pdf.columns = columns kdf.columns = columns - self.assert_eq(kdf.replace([0, 1, 2, 3, 5, 6], 4), - pdf.replace([0, 1, 2, 3, 5, 6], 4)) + self.assert_eq(kdf.replace([0, 1, 2, 3, 5, 6], 4), pdf.replace([0, 1, 2, 3, 5, 6], 4)) - self.assert_eq(kdf.replace([0, 1, 2, 3, 5, 6], [6, 5, 4, 3, 2, 1]), - pdf.replace([0, 1, 2, 3, 5, 6], [6, 5, 4, 3, 2, 1])) + self.assert_eq( + kdf.replace([0, 1, 2, 3, 5, 6], [6, 5, 4, 3, 2, 1]), + pdf.replace([0, 1, 2, 3, 5, 6], [6, 5, 4, 3, 2, 1]), + ) - self.assert_eq(kdf.replace({0: 10, 1: 100, 7: 200}), - pdf.replace({0: 10, 1: 100, 7: 200})) + self.assert_eq(kdf.replace({0: 10, 1: 100, 7: 200}), pdf.replace({0: 10, 1: 100, 7: 200})) - self.assert_eq(kdf.replace({('X', 'A'): 0, ('X', 'B'): 5}, 100), - pdf.replace({('X', 'A'): 0, ('X', 'B'): 5}, 100)) + self.assert_eq( + kdf.replace({("X", "A"): 0, ("X", "B"): 5}, 100), + pdf.replace({("X", "A"): 0, ("X", "B"): 5}, 100), + ) - self.assert_eq(kdf.replace({('X', 'A'): {0: 100, 4: 400}}), - 
pdf.replace({('X', 'A'): {0: 100, 4: 400}})) + self.assert_eq( + kdf.replace({("X", "A"): {0: 100, 4: 400}}), pdf.replace({("X", "A"): {0: 100, 4: 400}}) + ) def test_update(self): # check base function def get_data(left_columns=None, right_columns=None): - left_pdf = pd.DataFrame({'A': ['1', '2', '3', '4'], - 'B': ['100', '200', np.nan, np.nan]}, - columns=['A', 'B']) - right_pdf = pd.DataFrame({'B': ['x', np.nan, 'y', np.nan], - 'C': ['100', '200', '300', '400']}, columns=['B', 'C']) - - left_kdf = ks.DataFrame({'A': ['1', '2', '3', '4'], 'B': ['100', '200', None, None]}, - columns=['A', 'B']) - right_kdf = ks.DataFrame({'B': ['x', None, 'y', None], - 'C': ['100', '200', '300', '400']}, columns=['B', 'C']) + left_pdf = pd.DataFrame( + {"A": ["1", "2", "3", "4"], "B": ["100", "200", np.nan, np.nan]}, columns=["A", "B"] + ) + right_pdf = pd.DataFrame( + {"B": ["x", np.nan, "y", np.nan], "C": ["100", "200", "300", "400"]}, + columns=["B", "C"], + ) + + left_kdf = ks.DataFrame( + {"A": ["1", "2", "3", "4"], "B": ["100", "200", None, None]}, columns=["A", "B"] + ) + right_kdf = ks.DataFrame( + {"B": ["x", None, "y", None], "C": ["100", "200", "300", "400"]}, columns=["B", "C"] + ) if left_columns is not None: left_pdf.columns = left_columns left_kdf.columns = left_columns @@ -1407,57 +1601,70 @@ def get_data(left_columns=None, right_columns=None): left_kdf, left_pdf, right_kdf, right_pdf = get_data() left_pdf.update(right_pdf) left_kdf.update(right_kdf) - self.assert_eq(left_pdf.sort_values(by=['A', 'B']), left_kdf.sort_values(by=['A', 'B'])) + self.assert_eq(left_pdf.sort_values(by=["A", "B"]), left_kdf.sort_values(by=["A", "B"])) left_kdf, left_pdf, right_kdf, right_pdf = get_data() left_pdf.update(right_pdf, overwrite=False) left_kdf.update(right_kdf, overwrite=False) - self.assert_eq(left_pdf.sort_values(by=['A', 'B']), left_kdf.sort_values(by=['A', 'B'])) + self.assert_eq(left_pdf.sort_values(by=["A", "B"]), left_kdf.sort_values(by=["A", "B"])) with self.assertRaises(NotImplementedError): - left_kdf.update(right_kdf, join='right') + left_kdf.update(right_kdf, join="right") # multi-index columns - left_columns = pd.MultiIndex.from_tuples([('X', 'A'), ('X', 'B')]) - right_columns = pd.MultiIndex.from_tuples([('X', 'B'), ('Y', 'C')]) + left_columns = pd.MultiIndex.from_tuples([("X", "A"), ("X", "B")]) + right_columns = pd.MultiIndex.from_tuples([("X", "B"), ("Y", "C")]) - left_kdf, left_pdf, right_kdf, right_pdf = get_data(left_columns=left_columns, - right_columns=right_columns) + left_kdf, left_pdf, right_kdf, right_pdf = get_data( + left_columns=left_columns, right_columns=right_columns + ) left_pdf.update(right_pdf) left_kdf.update(right_kdf) - self.assert_eq(left_pdf.sort_values(by=[('X', 'A'), ('X', 'B')]), - left_kdf.sort_values(by=[('X', 'A'), ('X', 'B')])) + self.assert_eq( + left_pdf.sort_values(by=[("X", "A"), ("X", "B")]), + left_kdf.sort_values(by=[("X", "A"), ("X", "B")]), + ) - left_kdf, left_pdf, right_kdf, right_pdf = get_data(left_columns=left_columns, - right_columns=right_columns) + left_kdf, left_pdf, right_kdf, right_pdf = get_data( + left_columns=left_columns, right_columns=right_columns + ) left_pdf.update(right_pdf, overwrite=False) left_kdf.update(right_kdf, overwrite=False) - self.assert_eq(left_pdf.sort_values(by=[('X', 'A'), ('X', 'B')]), - left_kdf.sort_values(by=[('X', 'A'), ('X', 'B')])) + self.assert_eq( + left_pdf.sort_values(by=[("X", "A"), ("X", "B")]), + left_kdf.sort_values(by=[("X", "A"), ("X", "B")]), + ) - right_columns = 
pd.MultiIndex.from_tuples([('Y', 'B'), ('Y', 'C')]) - left_kdf, left_pdf, right_kdf, right_pdf = get_data(left_columns=left_columns, - right_columns=right_columns) + right_columns = pd.MultiIndex.from_tuples([("Y", "B"), ("Y", "C")]) + left_kdf, left_pdf, right_kdf, right_pdf = get_data( + left_columns=left_columns, right_columns=right_columns + ) left_pdf.update(right_pdf) left_kdf.update(right_kdf) - self.assert_eq(left_pdf.sort_values(by=[('X', 'A'), ('X', 'B')]), - left_kdf.sort_values(by=[('X', 'A'), ('X', 'B')])) + self.assert_eq( + left_pdf.sort_values(by=[("X", "A"), ("X", "B")]), + left_kdf.sort_values(by=[("X", "A"), ("X", "B")]), + ) def test_pivot_table_dtypes(self): - pdf = pd.DataFrame({'a': [4, 2, 3, 4, 8, 6], - 'b': [1, 2, 2, 4, 2, 4], - 'e': [1, 2, 2, 4, 2, 4], - 'c': [1, 2, 9, 4, 7, 4]}, - index=np.random.rand(6)) + pdf = pd.DataFrame( + { + "a": [4, 2, 3, 4, 8, 6], + "b": [1, 2, 2, 4, 2, 4], + "e": [1, 2, 2, 4, 2, 4], + "c": [1, 2, 9, 4, 7, 4], + }, + index=np.random.rand(6), + ) kdf = ks.from_pandas(pdf) # Skip columns comparison by reset_index - res_df = kdf.pivot_table(index=['c'], columns="a", values=['b'], - aggfunc={'b': 'mean'}) \ - .dtypes.reset_index(drop=True) - exp_df = pdf.pivot_table(index=['c'], columns="a", values=['b'], - aggfunc={'b': 'mean'}) \ - .dtypes.reset_index(drop=True) + res_df = kdf.pivot_table( + index=["c"], columns="a", values=["b"], aggfunc={"b": "mean"} + ).dtypes.reset_index(drop=True) + exp_df = pdf.pivot_table( + index=["c"], columns="a", values=["b"], aggfunc={"b": "mean"} + ).dtypes.reset_index(drop=True) self.assert_eq(res_df, exp_df) # Results don't have the same column's name @@ -1476,120 +1683,177 @@ def test_pivot_table_dtypes(self): # columns="a", values="b", fill_value=999).dtypes) def test_pivot_table(self): - pdf = pd.DataFrame({'a': [4, 2, 3, 4, 8, 6], - 'b': [1, 2, 2, 4, 2, 4], - 'e': [10, 20, 20, 40, 20, 40], - 'c': [1, 2, 9, 4, 7, 4], - 'd': [-1, -2, -3, -4, -5, -6]}, - index=np.random.rand(6)) + pdf = pd.DataFrame( + { + "a": [4, 2, 3, 4, 8, 6], + "b": [1, 2, 2, 4, 2, 4], + "e": [10, 20, 20, 40, 20, 40], + "c": [1, 2, 9, 4, 7, 4], + "d": [-1, -2, -3, -4, -5, -6], + }, + index=np.random.rand(6), + ) kdf = ks.from_pandas(pdf) # Checking if both DataFrames have the same results - self.assert_eq(kdf.pivot_table(columns="a", values="b").sort_index(), - pdf.pivot_table(columns="a", values="b").sort_index(), - almost=True) - - self.assert_eq(kdf.pivot_table(index=['c'], columns="a", values="b").sort_index(), - pdf.pivot_table(index=['c'], columns="a", values="b").sort_index(), - almost=True) - - self.assert_eq(kdf.pivot_table(index=['c'], columns="a", values="b", - aggfunc='sum').sort_index(), - pdf.pivot_table(index=['c'], columns="a", values="b", - aggfunc='sum').sort_index(), - almost=True) - - self.assert_eq(kdf.pivot_table(index=['c'], columns="a", values=["b"], - aggfunc='sum').sort_index(), - pdf.pivot_table(index=['c'], columns="a", values=["b"], - aggfunc='sum').sort_index(), - almost=True) - - self.assert_eq(kdf.pivot_table(index=['c'], columns="a", values=["b", "e"], - aggfunc='sum').sort_index(), - pdf.pivot_table(index=['c'], columns="a", values=["b", "e"], - aggfunc='sum').sort_index(), - almost=True) - - self.assert_eq(kdf.pivot_table(index=['c'], columns="a", values=["b", "e", "d"], - aggfunc='sum').sort_index(), - pdf.pivot_table(index=['c'], columns="a", values=["b", "e", "d"], - aggfunc='sum').sort_index(), - almost=True) - - self.assert_eq(kdf.pivot_table(index=['c'], columns="a", values=['b', 'e'], - 
aggfunc={'b': 'mean', 'e': 'sum'}).sort_index(), - pdf.pivot_table(index=['c'], columns="a", values=['b', 'e'], - aggfunc={'b': 'mean', 'e': 'sum'}).sort_index(), - almost=True) - - self.assert_eq(kdf.pivot_table(index=['e', 'c'], columns="a", values="b").sort_index(), - pdf.pivot_table(index=['e', 'c'], columns="a", values="b").sort_index(), - almost=True) - - self.assert_eq(kdf.pivot_table(index=['e', 'c'], columns="a", values="b", - fill_value=999).sort_index(), - pdf.pivot_table(index=['e', 'c'], columns="a", values="b", - fill_value=999).sort_index(), - almost=True) + self.assert_eq( + kdf.pivot_table(columns="a", values="b").sort_index(), + pdf.pivot_table(columns="a", values="b").sort_index(), + almost=True, + ) + + self.assert_eq( + kdf.pivot_table(index=["c"], columns="a", values="b").sort_index(), + pdf.pivot_table(index=["c"], columns="a", values="b").sort_index(), + almost=True, + ) + + self.assert_eq( + kdf.pivot_table(index=["c"], columns="a", values="b", aggfunc="sum").sort_index(), + pdf.pivot_table(index=["c"], columns="a", values="b", aggfunc="sum").sort_index(), + almost=True, + ) + + self.assert_eq( + kdf.pivot_table(index=["c"], columns="a", values=["b"], aggfunc="sum").sort_index(), + pdf.pivot_table(index=["c"], columns="a", values=["b"], aggfunc="sum").sort_index(), + almost=True, + ) + + self.assert_eq( + kdf.pivot_table( + index=["c"], columns="a", values=["b", "e"], aggfunc="sum" + ).sort_index(), + pdf.pivot_table( + index=["c"], columns="a", values=["b", "e"], aggfunc="sum" + ).sort_index(), + almost=True, + ) + + self.assert_eq( + kdf.pivot_table( + index=["c"], columns="a", values=["b", "e", "d"], aggfunc="sum" + ).sort_index(), + pdf.pivot_table( + index=["c"], columns="a", values=["b", "e", "d"], aggfunc="sum" + ).sort_index(), + almost=True, + ) + + self.assert_eq( + kdf.pivot_table( + index=["c"], columns="a", values=["b", "e"], aggfunc={"b": "mean", "e": "sum"} + ).sort_index(), + pdf.pivot_table( + index=["c"], columns="a", values=["b", "e"], aggfunc={"b": "mean", "e": "sum"} + ).sort_index(), + almost=True, + ) + + self.assert_eq( + kdf.pivot_table(index=["e", "c"], columns="a", values="b").sort_index(), + pdf.pivot_table(index=["e", "c"], columns="a", values="b").sort_index(), + almost=True, + ) + + self.assert_eq( + kdf.pivot_table(index=["e", "c"], columns="a", values="b", fill_value=999).sort_index(), + pdf.pivot_table(index=["e", "c"], columns="a", values="b", fill_value=999).sort_index(), + almost=True, + ) # multi-index columns - columns = pd.MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'e'), - ('z', 'c'), ('w', 'd')]) + columns = pd.MultiIndex.from_tuples( + [("x", "a"), ("x", "b"), ("y", "e"), ("z", "c"), ("w", "d")] + ) pdf.columns = columns kdf.columns = columns - self.assert_eq(kdf.pivot_table(columns=("x", "a"), values=("x", "b")).sort_index(), - pdf.pivot_table(columns=[("x", "a")], values=[("x", "b")]).sort_index(), - almost=True) - - self.assert_eq(kdf.pivot_table(index=[('z', 'c')], columns=("x", "a"), - values=[("x", "b")]).sort_index(), - pdf.pivot_table(index=[('z', 'c')], columns=[("x", "a")], - values=[("x", "b")]).sort_index(), - almost=True) - - self.assert_eq(kdf.pivot_table(index=[('z', 'c')], columns=("x", "a"), - values=[("x", "b"), ('y', 'e')]).sort_index(), - pdf.pivot_table(index=[('z', 'c')], columns=[("x", "a")], - values=[("x", "b"), ('y', 'e')]).sort_index(), - almost=True) - - self.assert_eq(kdf.pivot_table(index=[('z', 'c')], columns=("x", "a"), - values=[("x", "b"), ('y', 'e'), ('w', 'd')]).sort_index(), - 
pdf.pivot_table(index=[('z', 'c')], columns=[("x", "a")], - values=[("x", "b"), ('y', 'e'), ('w', 'd')]).sort_index(), - almost=True) - - self.assert_eq(kdf.pivot_table(index=[('z', 'c')], columns=("x", "a"), - values=[("x", "b"), ('y', 'e')], - aggfunc={("x", "b"): 'mean', - ('y', 'e'): 'sum'}).sort_index(), - pdf.pivot_table(index=[('z', 'c')], columns=[("x", "a")], - values=[("x", "b"), ('y', 'e')], - aggfunc={("x", "b"): 'mean', - ('y', 'e'): 'sum'}).sort_index(), - almost=True) + self.assert_eq( + kdf.pivot_table(columns=("x", "a"), values=("x", "b")).sort_index(), + pdf.pivot_table(columns=[("x", "a")], values=[("x", "b")]).sort_index(), + almost=True, + ) + + self.assert_eq( + kdf.pivot_table( + index=[("z", "c")], columns=("x", "a"), values=[("x", "b")] + ).sort_index(), + pdf.pivot_table( + index=[("z", "c")], columns=[("x", "a")], values=[("x", "b")] + ).sort_index(), + almost=True, + ) + + self.assert_eq( + kdf.pivot_table( + index=[("z", "c")], columns=("x", "a"), values=[("x", "b"), ("y", "e")] + ).sort_index(), + pdf.pivot_table( + index=[("z", "c")], columns=[("x", "a")], values=[("x", "b"), ("y", "e")] + ).sort_index(), + almost=True, + ) + + self.assert_eq( + kdf.pivot_table( + index=[("z", "c")], columns=("x", "a"), values=[("x", "b"), ("y", "e"), ("w", "d")] + ).sort_index(), + pdf.pivot_table( + index=[("z", "c")], + columns=[("x", "a")], + values=[("x", "b"), ("y", "e"), ("w", "d")], + ).sort_index(), + almost=True, + ) + + self.assert_eq( + kdf.pivot_table( + index=[("z", "c")], + columns=("x", "a"), + values=[("x", "b"), ("y", "e")], + aggfunc={("x", "b"): "mean", ("y", "e"): "sum"}, + ).sort_index(), + pdf.pivot_table( + index=[("z", "c")], + columns=[("x", "a")], + values=[("x", "b"), ("y", "e")], + aggfunc={("x", "b"): "mean", ("y", "e"): "sum"}, + ).sort_index(), + almost=True, + ) def test_pivot_table_and_index(self): # https://github.com/databricks/koalas/issues/805 - pdf = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo", - "bar", "bar", "bar", "bar"], - "B": ["one", "one", "one", "two", "two", - "one", "one", "two", "two"], - "C": ["small", "large", "large", "small", - "small", "large", "small", "small", - "large"], - "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], - "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]}, - columns=['A', 'B', 'C', 'D', 'E'], - index=np.random.rand(9)) - kdf = ks.from_pandas(pdf) - - ptable = pdf.pivot_table(values='D', index=['A', 'B'], - columns='C', aggfunc='sum', fill_value=0).sort_index() - ktable = kdf.pivot_table(values='D', index=['A', 'B'], - columns='C', aggfunc='sum', fill_value=0).sort_index() + pdf = pd.DataFrame( + { + "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"], + "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"], + "C": [ + "small", + "large", + "large", + "small", + "small", + "large", + "small", + "small", + "large", + ], + "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], + "E": [2, 4, 5, 5, 6, 6, 8, 9, 9], + }, + columns=["A", "B", "C", "D", "E"], + index=np.random.rand(9), + ) + kdf = ks.from_pandas(pdf) + + ptable = pdf.pivot_table( + values="D", index=["A", "B"], columns="C", aggfunc="sum", fill_value=0 + ).sort_index() + ktable = kdf.pivot_table( + values="D", index=["A", "B"], columns="C", aggfunc="sum", fill_value=0 + ).sort_index() self.assert_eq(ktable, ptable) self.assert_eq(ktable.index, ptable.index) @@ -1599,22 +1863,26 @@ def test_pivot_errors(self): kdf = ks.range(10) with self.assertRaisesRegex(ValueError, "columns should be set"): - kdf.pivot(index='id') + kdf.pivot(index="id") with 
self.assertRaisesRegex(ValueError, "values should be set"): - kdf.pivot(index='id', columns="id") + kdf.pivot(index="id", columns="id") def test_pivot_table_errors(self): - pdf = pd.DataFrame({'a': [4, 2, 3, 4, 8, 6], - 'b': [1, 2, 2, 4, 2, 4], - 'e': [1, 2, 2, 4, 2, 4], - 'c': [1, 2, 9, 4, 7, 4]}, - index=np.random.rand(6)) + pdf = pd.DataFrame( + { + "a": [4, 2, 3, 4, 8, 6], + "b": [1, 2, 2, 4, 2, 4], + "e": [1, 2, 2, 4, 2, 4], + "c": [1, 2, 9, 4, 7, 4], + }, + index=np.random.rand(6), + ) kdf = ks.from_pandas(pdf) msg = "values should be string or list of one column." with self.assertRaisesRegex(ValueError, msg): - kdf.pivot_table(index=['c'], columns="a", values=5) + kdf.pivot_table(index=["c"], columns="a", values=5) msg = "index should be a None or a list of columns." with self.assertRaisesRegex(ValueError, msg): @@ -1622,107 +1890,117 @@ def test_pivot_table_errors(self): msg = "pivot_table doesn't support aggfunc as dict and without index." with self.assertRaisesRegex(NotImplementedError, msg): - kdf.pivot_table(columns="a", values=['b', 'e'], aggfunc={'b': 'mean', 'e': 'sum'}) + kdf.pivot_table(columns="a", values=["b", "e"], aggfunc={"b": "mean", "e": "sum"}) msg = "columns should be string." with self.assertRaisesRegex(ValueError, msg): - kdf.pivot_table(columns=["a"], values=['b'], aggfunc={'b': 'mean', 'e': 'sum'}) + kdf.pivot_table(columns=["a"], values=["b"], aggfunc={"b": "mean", "e": "sum"}) msg = "Columns in aggfunc must be the same as values." with self.assertRaisesRegex(ValueError, msg): - kdf.pivot_table(index=['e', 'c'], columns="a", values='b', - aggfunc={'b': 'mean', 'e': 'sum'}) + kdf.pivot_table( + index=["e", "c"], columns="a", values="b", aggfunc={"b": "mean", "e": "sum"} + ) msg = "values can't be a list without index." with self.assertRaisesRegex(NotImplementedError, msg): - kdf.pivot_table(columns="a", values=['b', 'e']) + kdf.pivot_table(columns="a", values=["b", "e"]) msg = "Wrong columns A." with self.assertRaisesRegex(ValueError, msg): - kdf.pivot_table(index=['c'], columns="A", values=['b', 'e'], - aggfunc={'b': 'mean', 'e': 'sum'}) - - kdf = ks.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo", - "bar", "bar", "bar", "bar"], - "B": ["one", "one", "one", "two", "two", - "one", "one", "two", "two"], - "C": ["small", "large", "large", "small", - "small", "large", "small", "small", - "large"], - "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], - "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]}, - columns=['A', 'B', 'C', 'D', 'E'], - index=np.random.rand(9)) + kdf.pivot_table( + index=["c"], columns="A", values=["b", "e"], aggfunc={"b": "mean", "e": "sum"} + ) + + kdf = ks.DataFrame( + { + "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"], + "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"], + "C": [ + "small", + "large", + "large", + "small", + "small", + "large", + "small", + "small", + "large", + ], + "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], + "E": [2, 4, 5, 5, 6, 6, 8, 9, 9], + }, + columns=["A", "B", "C", "D", "E"], + index=np.random.rand(9), + ) msg = "values should be a numeric type." with self.assertRaisesRegex(TypeError, msg): - kdf.pivot_table(index=['C'], columns="A", values=['B', 'E'], - aggfunc={'B': 'mean', 'E': 'sum'}) + kdf.pivot_table( + index=["C"], columns="A", values=["B", "E"], aggfunc={"B": "mean", "E": "sum"} + ) msg = "values should be a numeric type." 
with self.assertRaisesRegex(TypeError, msg): - kdf.pivot_table(index=['C'], columns="A", values='B', - aggfunc={'B': 'mean'}) + kdf.pivot_table(index=["C"], columns="A", values="B", aggfunc={"B": "mean"}) def test_transpose(self): # TODO: what if with random index? - pdf1 = pd.DataFrame( - data={'col1': [1, 2], 'col2': [3, 4]}, - columns=['col1', 'col2']) + pdf1 = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]}, columns=["col1", "col2"]) kdf1 = ks.from_pandas(pdf1) pdf2 = pd.DataFrame( - data={'score': [9, 8], 'kids': [0, 0], 'age': [12, 22]}, - columns=['score', 'kids', 'age']) + data={"score": [9, 8], "kids": [0, 0], "age": [12, 22]}, + columns=["score", "kids", "age"], + ) kdf2 = ks.from_pandas(pdf2) - self.assertEqual( - repr(pdf1.transpose().sort_index()), - repr(kdf1.transpose().sort_index())) + self.assertEqual(repr(pdf1.transpose().sort_index()), repr(kdf1.transpose().sort_index())) - self.assert_eq( - repr(pdf2.transpose().sort_index()), - repr(kdf2.transpose().sort_index())) + self.assert_eq(repr(pdf2.transpose().sort_index()), repr(kdf2.transpose().sort_index())) with option_context("compute.max_rows", None): self.assertEqual( - repr(pdf1.transpose().sort_index()), - repr(kdf1.transpose().sort_index())) - - self.assert_eq( - repr(pdf2.transpose().sort_index()), - repr(kdf2.transpose().sort_index())) - - pdf3 = pd.DataFrame({('cg1', 'a'): [1, 2, 3], ('cg1', 'b'): [4, 5, 6], - ('cg2', 'c'): [7, 8, 9], ('cg3', 'd'): [9, 9, 9]}, - index=pd.MultiIndex.from_tuples([('rg1', 'x'), ('rg1', 'y'), - ('rg2', 'z')])) + repr(pdf1.transpose().sort_index()), repr(kdf1.transpose().sort_index()) + ) + + self.assert_eq(repr(pdf2.transpose().sort_index()), repr(kdf2.transpose().sort_index())) + + pdf3 = pd.DataFrame( + { + ("cg1", "a"): [1, 2, 3], + ("cg1", "b"): [4, 5, 6], + ("cg2", "c"): [7, 8, 9], + ("cg3", "d"): [9, 9, 9], + }, + index=pd.MultiIndex.from_tuples([("rg1", "x"), ("rg1", "y"), ("rg2", "z")]), + ) kdf3 = ks.from_pandas(pdf3) - self.assertEqual( - repr(pdf3.transpose().sort_index()), - repr(kdf3.transpose().sort_index())) + self.assertEqual(repr(pdf3.transpose().sort_index()), repr(kdf3.transpose().sort_index())) with option_context("compute.max_rows", None): self.assertEqual( - repr(pdf3.transpose().sort_index()), - repr(kdf3.transpose().sort_index())) + repr(pdf3.transpose().sort_index()), repr(kdf3.transpose().sort_index()) + ) def _test_cummin(self, pdf, kdf): self.assert_eq(pdf.cummin(), kdf.cummin()) self.assert_eq(pdf.cummin(skipna=False), kdf.cummin(skipna=False)) def test_cummin(self): - pdf = pd.DataFrame([[2.0, 1.0], [5, None], [1.0, 0.0], [2.0, 4.0], [4.0, 9.0]], - columns=list('AB'), index=np.random.rand(5)) + pdf = pd.DataFrame( + [[2.0, 1.0], [5, None], [1.0, 0.0], [2.0, 4.0], [4.0, 9.0]], + columns=list("AB"), + index=np.random.rand(5), + ) kdf = ks.from_pandas(pdf) self._test_cummin(pdf, kdf) def test_cummin_multiindex_columns(self): - arrays = [np.array(['A', 'A', 'B', 'B']), - np.array(['one', 'two', 'one', 'two'])] - pdf = pd.DataFrame(np.random.randn(3, 4), index=['A', 'C', 'B'], columns=arrays) - pdf.at['C', ('A', 'two')] = None + arrays = [np.array(["A", "A", "B", "B"]), np.array(["one", "two", "one", "two"])] + pdf = pd.DataFrame(np.random.randn(3, 4), index=["A", "C", "B"], columns=arrays) + pdf.at["C", ("A", "two")] = None kdf = ks.from_pandas(pdf) self._test_cummin(pdf, kdf) @@ -1731,16 +2009,18 @@ def _test_cummax(self, pdf, kdf): self.assert_eq(pdf.cummax(skipna=False), kdf.cummax(skipna=False)) def test_cummax(self): - pdf = pd.DataFrame([[2.0, 1.0], 
[5, None], [1.0, 0.0], [2.0, 4.0], [4.0, 9.0]], - columns=list('AB'), index=np.random.rand(5)) + pdf = pd.DataFrame( + [[2.0, 1.0], [5, None], [1.0, 0.0], [2.0, 4.0], [4.0, 9.0]], + columns=list("AB"), + index=np.random.rand(5), + ) kdf = ks.from_pandas(pdf) self._test_cummax(pdf, kdf) def test_cummax_multiindex_columns(self): - arrays = [np.array(['A', 'A', 'B', 'B']), - np.array(['one', 'two', 'one', 'two'])] - pdf = pd.DataFrame(np.random.randn(3, 4), index=['A', 'C', 'B'], columns=arrays) - pdf.at['C', ('A', 'two')] = None + arrays = [np.array(["A", "A", "B", "B"]), np.array(["one", "two", "one", "two"])] + pdf = pd.DataFrame(np.random.randn(3, 4), index=["A", "C", "B"], columns=arrays) + pdf.at["C", ("A", "two")] = None kdf = ks.from_pandas(pdf) self._test_cummax(pdf, kdf) @@ -1749,16 +2029,18 @@ def _test_cumsum(self, pdf, kdf): self.assert_eq(pdf.cumsum(skipna=False), kdf.cumsum(skipna=False)) def test_cumsum(self): - pdf = pd.DataFrame([[2.0, 1.0], [5, None], [1.0, 0.0], [2.0, 4.0], [4.0, 9.0]], - columns=list('AB'), index=np.random.rand(5)) + pdf = pd.DataFrame( + [[2.0, 1.0], [5, None], [1.0, 0.0], [2.0, 4.0], [4.0, 9.0]], + columns=list("AB"), + index=np.random.rand(5), + ) kdf = ks.from_pandas(pdf) self._test_cumsum(pdf, kdf) def test_cumsum_multiindex_columns(self): - arrays = [np.array(['A', 'A', 'B', 'B']), - np.array(['one', 'two', 'one', 'two'])] - pdf = pd.DataFrame(np.random.randn(3, 4), index=['A', 'C', 'B'], columns=arrays) - pdf.at['C', ('A', 'two')] = None + arrays = [np.array(["A", "A", "B", "B"]), np.array(["one", "two", "one", "two"])] + pdf = pd.DataFrame(np.random.randn(3, 4), index=["A", "C", "B"], columns=arrays) + pdf.at["C", ("A", "two")] = None kdf = ks.from_pandas(pdf) self._test_cumsum(pdf, kdf) @@ -1767,341 +2049,409 @@ def _test_cumprod(self, pdf, kdf): self.assertEqual(repr(pdf.cumprod(skipna=False)), repr(kdf.cumprod(skipna=False))) def test_cumprod(self): - pdf = pd.DataFrame([[2.0, 1.0], [5, None], [1.0, 1.0], [2.0, 4.0], [4.0, 9.0]], - columns=list('AB'), index=np.random.rand(5)) + pdf = pd.DataFrame( + [[2.0, 1.0], [5, None], [1.0, 1.0], [2.0, 4.0], [4.0, 9.0]], + columns=list("AB"), + index=np.random.rand(5), + ) kdf = ks.from_pandas(pdf) self._test_cumprod(pdf, kdf) def test_cumprod_multiindex_columns(self): - arrays = [np.array(['A', 'A', 'B', 'B']), - np.array(['one', 'two', 'one', 'two'])] - pdf = pd.DataFrame(np.random.rand(3, 4), index=['A', 'C', 'B'], columns=arrays) - pdf.at['C', ('A', 'two')] = None + arrays = [np.array(["A", "A", "B", "B"]), np.array(["one", "two", "one", "two"])] + pdf = pd.DataFrame(np.random.rand(3, 4), index=["A", "C", "B"], columns=arrays) + pdf.at["C", ("A", "two")] = None kdf = ks.from_pandas(pdf) self._test_cumprod(pdf, kdf) def test_drop_duplicates(self): - pdf = pd.DataFrame({'a': [1, 2, 2, 2, 3], 'b': ['a', 'a', 'a', 'c', 'd']}, - index=np.random.rand(5)) + pdf = pd.DataFrame( + {"a": [1, 2, 2, 2, 3], "b": ["a", "a", "a", "c", "d"]}, index=np.random.rand(5) + ) kdf = ks.from_pandas(pdf) # inplace is False - self.assert_eq(pdf.drop_duplicates().sort_index(), - kdf.drop_duplicates().sort_index()) - self.assert_eq(pdf.drop_duplicates('a').sort_index(), - kdf.drop_duplicates('a').sort_index()) - self.assert_eq(pdf.drop_duplicates(['a', 'b']).sort_index(), - kdf.drop_duplicates(['a', 'b']).sort_index()) + self.assert_eq(pdf.drop_duplicates().sort_index(), kdf.drop_duplicates().sort_index()) + self.assert_eq(pdf.drop_duplicates("a").sort_index(), kdf.drop_duplicates("a").sort_index()) + self.assert_eq( + 
pdf.drop_duplicates(["a", "b"]).sort_index(), + kdf.drop_duplicates(["a", "b"]).sort_index(), + ) # multi-index columns, inplace is False - columns = pd.MultiIndex.from_tuples([('x', 'a'), ('y', 'b')]) + columns = pd.MultiIndex.from_tuples([("x", "a"), ("y", "b")]) pdf.columns = columns kdf.columns = columns - self.assert_eq(pdf.drop_duplicates().sort_index(), - kdf.drop_duplicates().sort_index()) - self.assert_eq(pdf.drop_duplicates(('x', 'a')).sort_index(), - kdf.drop_duplicates(('x', 'a')).sort_index()) - self.assert_eq(pdf.drop_duplicates([('x', 'a'), ('y', 'b')]).sort_index(), - kdf.drop_duplicates([('x', 'a'), ('y', 'b')]).sort_index()) + self.assert_eq(pdf.drop_duplicates().sort_index(), kdf.drop_duplicates().sort_index()) + self.assert_eq( + pdf.drop_duplicates(("x", "a")).sort_index(), + kdf.drop_duplicates(("x", "a")).sort_index(), + ) + self.assert_eq( + pdf.drop_duplicates([("x", "a"), ("y", "b")]).sort_index(), + kdf.drop_duplicates([("x", "a"), ("y", "b")]).sort_index(), + ) # inplace is True - subset_list = [None, 'a', ['a', 'b']] + subset_list = [None, "a", ["a", "b"]] for subset in subset_list: - pdf = pd.DataFrame({'a': [1, 2, 2, 2, 3], 'b': ['a', 'a', 'a', 'c', 'd']}, - index=np.random.rand(5)) + pdf = pd.DataFrame( + {"a": [1, 2, 2, 2, 3], "b": ["a", "a", "a", "c", "d"]}, index=np.random.rand(5) + ) kdf = ks.from_pandas(pdf) pdf.drop_duplicates(subset=subset, inplace=True) kdf.drop_duplicates(subset=subset, inplace=True) - self.assert_eq(pdf.sort_index(), - kdf.sort_index()) + self.assert_eq(pdf.sort_index(), kdf.sort_index()) # multi-index columns, inplace is True - subset_list = [None, ('x', 'a'), [('x', 'a'), ('y', 'b')]] + subset_list = [None, ("x", "a"), [("x", "a"), ("y", "b")]] for subset in subset_list: - pdf = pd.DataFrame({'a': [1, 2, 2, 2, 3], 'b': ['a', 'a', 'a', 'c', 'd']}, - index=np.random.rand(5)) + pdf = pd.DataFrame( + {"a": [1, 2, 2, 2, 3], "b": ["a", "a", "a", "c", "d"]}, index=np.random.rand(5) + ) kdf = ks.from_pandas(pdf) - columns = pd.MultiIndex.from_tuples([('x', 'a'), ('y', 'b')]) + columns = pd.MultiIndex.from_tuples([("x", "a"), ("y", "b")]) pdf.columns = columns kdf.columns = columns pdf.drop_duplicates(subset=subset, inplace=True) kdf.drop_duplicates(subset=subset, inplace=True) - self.assert_eq(pdf.sort_index(), - kdf.sort_index()) + self.assert_eq(pdf.sort_index(), kdf.sort_index()) def test_reindex(self): - index = ['A', 'B', 'C', 'D', 'E'] - pdf = pd.DataFrame({'numbers': [1., 2., 3., 4., 5.]}, index=index) - kdf = ks.DataFrame({'numbers': [1., 2., 3., 4., 5.]}, index=index) + index = ["A", "B", "C", "D", "E"] + pdf = pd.DataFrame({"numbers": [1.0, 2.0, 3.0, 4.0, 5.0]}, index=index) + kdf = ks.DataFrame({"numbers": [1.0, 2.0, 3.0, 4.0, 5.0]}, index=index) self.assert_eq( - pdf.reindex(['A', 'B', 'C'], columns=['numbers', '2', '3']).sort_index(), - kdf.reindex(['A', 'B', 'C'], columns=['numbers', '2', '3']).sort_index()) + pdf.reindex(["A", "B", "C"], columns=["numbers", "2", "3"]).sort_index(), + kdf.reindex(["A", "B", "C"], columns=["numbers", "2", "3"]).sort_index(), + ) self.assert_eq( - pdf.reindex(['A', 'B', 'C'], index=['numbers', '2', '3']).sort_index(), - kdf.reindex(['A', 'B', 'C'], index=['numbers', '2', '3']).sort_index()) + pdf.reindex(["A", "B", "C"], index=["numbers", "2", "3"]).sort_index(), + kdf.reindex(["A", "B", "C"], index=["numbers", "2", "3"]).sort_index(), + ) self.assert_eq( - pdf.reindex(index=['A', 'B']).sort_index(), - kdf.reindex(index=['A', 'B']).sort_index()) + pdf.reindex(index=["A", "B"]).sort_index(), 
kdf.reindex(index=["A", "B"]).sort_index() + ) self.assert_eq( - pdf.reindex(index=['A', 'B', '2', '3']).sort_index(), - kdf.reindex(index=['A', 'B', '2', '3']).sort_index()) + pdf.reindex(index=["A", "B", "2", "3"]).sort_index(), + kdf.reindex(index=["A", "B", "2", "3"]).sort_index(), + ) self.assert_eq( - pdf.reindex(columns=['numbers']).sort_index(), - kdf.reindex(columns=['numbers']).sort_index()) + pdf.reindex(columns=["numbers"]).sort_index(), + kdf.reindex(columns=["numbers"]).sort_index(), + ) self.assert_eq( - pdf.reindex(columns=['numbers', '2', '3']).sort_index(), - kdf.reindex(columns=['numbers', '2', '3']).sort_index()) + pdf.reindex(columns=["numbers", "2", "3"]).sort_index(), + kdf.reindex(columns=["numbers", "2", "3"]).sort_index(), + ) - self.assertRaises(TypeError, lambda: kdf.reindex(columns=['numbers', '2', '3'], axis=1)) - self.assertRaises(TypeError, lambda: kdf.reindex(columns=['numbers', '2', '3'], axis=2)) - self.assertRaises(TypeError, lambda: kdf.reindex(index=['A', 'B', 'C'], axis=1)) + self.assertRaises(TypeError, lambda: kdf.reindex(columns=["numbers", "2", "3"], axis=1)) + self.assertRaises(TypeError, lambda: kdf.reindex(columns=["numbers", "2", "3"], axis=2)) + self.assertRaises(TypeError, lambda: kdf.reindex(index=["A", "B", "C"], axis=1)) self.assertRaises(TypeError, lambda: kdf.reindex(index=123)) - columns = pd.MultiIndex.from_tuples([('X', 'numbers')]) + columns = pd.MultiIndex.from_tuples([("X", "numbers")]) pdf.columns = columns kdf.columns = columns self.assert_eq( - pdf.reindex(columns=[('X', 'numbers'), ('Y', '2'), ('Y', '3')]).sort_index(), - kdf.reindex(columns=[('X', 'numbers'), ('Y', '2'), ('Y', '3')]).sort_index()) + pdf.reindex(columns=[("X", "numbers"), ("Y", "2"), ("Y", "3")]).sort_index(), + kdf.reindex(columns=[("X", "numbers"), ("Y", "2"), ("Y", "3")]).sort_index(), + ) - self.assertRaises(TypeError, lambda: kdf.reindex(columns=['X'])) - self.assertRaises(ValueError, lambda: kdf.reindex(columns=[('X',)])) + self.assertRaises(TypeError, lambda: kdf.reindex(columns=["X"])) + self.assertRaises(ValueError, lambda: kdf.reindex(columns=[("X",)])) def test_melt(self): - pdf = pd.DataFrame({'A': [1, 3, 5], - 'B': [2, 4, 6], - 'C': [7, 8, 9]}, - index=np.random.rand(3)) - kdf = ks.from_pandas(pdf) - - self.assert_eq(kdf.melt().sort_values(['variable', 'value']) - .reset_index(drop=True), - pdf.melt().sort_values(['variable', 'value'])) - self.assert_eq(kdf.melt(id_vars='A').sort_values(['variable', 'value']) - .reset_index(drop=True), - pdf.melt(id_vars='A').sort_values(['variable', 'value'])) - self.assert_eq(kdf.melt(id_vars=['A', 'B']).sort_values(['variable', 'value']) - .reset_index(drop=True), - pdf.melt(id_vars=['A', 'B']).sort_values(['variable', 'value'])) - self.assert_eq(kdf.melt(id_vars=('A', 'B')).sort_values(['variable', 'value']) - .reset_index(drop=True), - pdf.melt(id_vars=('A', 'B')).sort_values(['variable', 'value'])) - self.assert_eq(kdf.melt(id_vars=['A'], value_vars=['C']).sort_values(['variable', 'value']) - .reset_index(drop=True), - pdf.melt(id_vars=['A'], value_vars=['C']).sort_values(['variable', 'value'])) - self.assert_eq(kdf.melt(id_vars=['A'], value_vars=['B'], - var_name='myVarname', value_name='myValname') - .sort_values(['myVarname', 'myValname']).reset_index(drop=True), - pdf.melt(id_vars=['A'], value_vars=['B'], - var_name='myVarname', value_name='myValname') - .sort_values(['myVarname', 'myValname'])) - self.assert_eq(kdf.melt(value_vars=('A', 'B')).sort_values(['variable', 'value']) - .reset_index(drop=True), 
- pdf.melt(value_vars=('A', 'B')).sort_values(['variable', 'value'])) - - self.assertRaises(KeyError, lambda: kdf.melt(id_vars='Z')) - self.assertRaises(KeyError, lambda: kdf.melt(value_vars='Z')) + pdf = pd.DataFrame( + {"A": [1, 3, 5], "B": [2, 4, 6], "C": [7, 8, 9]}, index=np.random.rand(3) + ) + kdf = ks.from_pandas(pdf) + + self.assert_eq( + kdf.melt().sort_values(["variable", "value"]).reset_index(drop=True), + pdf.melt().sort_values(["variable", "value"]), + ) + self.assert_eq( + kdf.melt(id_vars="A").sort_values(["variable", "value"]).reset_index(drop=True), + pdf.melt(id_vars="A").sort_values(["variable", "value"]), + ) + self.assert_eq( + kdf.melt(id_vars=["A", "B"]).sort_values(["variable", "value"]).reset_index(drop=True), + pdf.melt(id_vars=["A", "B"]).sort_values(["variable", "value"]), + ) + self.assert_eq( + kdf.melt(id_vars=("A", "B")).sort_values(["variable", "value"]).reset_index(drop=True), + pdf.melt(id_vars=("A", "B")).sort_values(["variable", "value"]), + ) + self.assert_eq( + kdf.melt(id_vars=["A"], value_vars=["C"]) + .sort_values(["variable", "value"]) + .reset_index(drop=True), + pdf.melt(id_vars=["A"], value_vars=["C"]).sort_values(["variable", "value"]), + ) + self.assert_eq( + kdf.melt(id_vars=["A"], value_vars=["B"], var_name="myVarname", value_name="myValname") + .sort_values(["myVarname", "myValname"]) + .reset_index(drop=True), + pdf.melt( + id_vars=["A"], value_vars=["B"], var_name="myVarname", value_name="myValname" + ).sort_values(["myVarname", "myValname"]), + ) + self.assert_eq( + kdf.melt(value_vars=("A", "B")) + .sort_values(["variable", "value"]) + .reset_index(drop=True), + pdf.melt(value_vars=("A", "B")).sort_values(["variable", "value"]), + ) + + self.assertRaises(KeyError, lambda: kdf.melt(id_vars="Z")) + self.assertRaises(KeyError, lambda: kdf.melt(value_vars="Z")) # multi-index columns - columns = pd.MultiIndex.from_tuples([('X', 'A'), ('X', 'B'), ('Y', 'C')]) + columns = pd.MultiIndex.from_tuples([("X", "A"), ("X", "B"), ("Y", "C")]) pdf.columns = columns kdf.columns = columns - self.assert_eq(kdf.melt().sort_values(['variable_0', 'variable_1', 'value']) - .reset_index(drop=True), - pdf.melt().sort_values(['variable_0', 'variable_1', 'value'])) - self.assert_eq(kdf.melt(id_vars=[('X', 'A')]) - .sort_values(['variable_0', 'variable_1', 'value']).reset_index(drop=True), - pdf.melt(id_vars=[('X', 'A')]) - .sort_values(['variable_0', 'variable_1', 'value']), almost=True) - self.assert_eq(kdf.melt(id_vars=[('X', 'A')], value_vars=[('Y', 'C')]) - .sort_values(['variable_0', 'variable_1', 'value']).reset_index(drop=True), - pdf.melt(id_vars=[('X', 'A')], value_vars=[('Y', 'C')]) - .sort_values(['variable_0', 'variable_1', 'value']), almost=True) - self.assert_eq(kdf.melt(id_vars=[('X', 'A')], value_vars=[('X', 'B')], - var_name=['myV1', 'myV2'], value_name='myValname') - .sort_values(['myV1', 'myV2', 'myValname']).reset_index(drop=True), - pdf.melt(id_vars=[('X', 'A')], value_vars=[('X', 'B')], - var_name=['myV1', 'myV2'], value_name='myValname') - .sort_values(['myV1', 'myV2', 'myValname']), almost=True) - - columns.names = ['v0', 'v1'] + self.assert_eq( + kdf.melt().sort_values(["variable_0", "variable_1", "value"]).reset_index(drop=True), + pdf.melt().sort_values(["variable_0", "variable_1", "value"]), + ) + self.assert_eq( + kdf.melt(id_vars=[("X", "A")]) + .sort_values(["variable_0", "variable_1", "value"]) + .reset_index(drop=True), + pdf.melt(id_vars=[("X", "A")]).sort_values(["variable_0", "variable_1", "value"]), + almost=True, + ) + 
self.assert_eq( + kdf.melt(id_vars=[("X", "A")], value_vars=[("Y", "C")]) + .sort_values(["variable_0", "variable_1", "value"]) + .reset_index(drop=True), + pdf.melt(id_vars=[("X", "A")], value_vars=[("Y", "C")]).sort_values( + ["variable_0", "variable_1", "value"] + ), + almost=True, + ) + self.assert_eq( + kdf.melt( + id_vars=[("X", "A")], + value_vars=[("X", "B")], + var_name=["myV1", "myV2"], + value_name="myValname", + ) + .sort_values(["myV1", "myV2", "myValname"]) + .reset_index(drop=True), + pdf.melt( + id_vars=[("X", "A")], + value_vars=[("X", "B")], + var_name=["myV1", "myV2"], + value_name="myValname", + ).sort_values(["myV1", "myV2", "myValname"]), + almost=True, + ) + + columns.names = ["v0", "v1"] pdf.columns = columns kdf.columns = columns - self.assert_eq(kdf.melt().sort_values(['v0', 'v1', 'value']) - .reset_index(drop=True), - pdf.melt().sort_values(['v0', 'v1', 'value'])) + self.assert_eq( + kdf.melt().sort_values(["v0", "v1", "value"]).reset_index(drop=True), + pdf.melt().sort_values(["v0", "v1", "value"]), + ) - self.assertRaises(ValueError, lambda: kdf.melt(id_vars=('X', 'A'))) - self.assertRaises(ValueError, lambda: kdf.melt(value_vars=('X', 'A'))) - self.assertRaises(KeyError, lambda: kdf.melt(id_vars=[('Y', 'A')])) - self.assertRaises(KeyError, lambda: kdf.melt(value_vars=[('Y', 'A')])) + self.assertRaises(ValueError, lambda: kdf.melt(id_vars=("X", "A"))) + self.assertRaises(ValueError, lambda: kdf.melt(value_vars=("X", "A"))) + self.assertRaises(KeyError, lambda: kdf.melt(id_vars=[("Y", "A")])) + self.assertRaises(KeyError, lambda: kdf.melt(value_vars=[("Y", "A")])) def test_all(self): - pdf = pd.DataFrame({ - 'col1': [False, False, False], - 'col2': [True, False, False], - 'col3': [0, 0, 1], - 'col4': [0, 1, 2], - 'col5': [False, False, None], - 'col6': [True, False, None]}, - index=np.random.rand(3)) + pdf = pd.DataFrame( + { + "col1": [False, False, False], + "col2": [True, False, False], + "col3": [0, 0, 1], + "col4": [0, 1, 2], + "col5": [False, False, None], + "col6": [True, False, None], + }, + index=np.random.rand(3), + ) kdf = ks.from_pandas(pdf) self.assert_eq(kdf.all(), pdf.all()) - columns = pd.MultiIndex.from_tuples([('a', 'col1'), ('a', 'col2'), ('a', 'col3'), - ('b', 'col4'), ('b', 'col5'), ('c', 'col6')]) + columns = pd.MultiIndex.from_tuples( + [ + ("a", "col1"), + ("a", "col2"), + ("a", "col3"), + ("b", "col4"), + ("b", "col5"), + ("c", "col6"), + ] + ) pdf.columns = columns kdf.columns = columns self.assert_eq(kdf.all(), pdf.all()) - columns.names = ['X', 'Y'] + columns.names = ["X", "Y"] pdf.columns = columns kdf.columns = columns self.assert_eq(kdf.all(), pdf.all()) with self.assertRaisesRegex( - NotImplementedError, 'axis should be either 0 or "index" currently.'): + NotImplementedError, 'axis should be either 0 or "index" currently.' 
+ ): kdf.all(axis=1) def test_any(self): - pdf = pd.DataFrame({ - 'col1': [False, False, False], - 'col2': [True, False, False], - 'col3': [0, 0, 1], - 'col4': [0, 1, 2], - 'col5': [False, False, None], - 'col6': [True, False, None]}, - index=np.random.rand(3)) + pdf = pd.DataFrame( + { + "col1": [False, False, False], + "col2": [True, False, False], + "col3": [0, 0, 1], + "col4": [0, 1, 2], + "col5": [False, False, None], + "col6": [True, False, None], + }, + index=np.random.rand(3), + ) kdf = ks.from_pandas(pdf) self.assert_eq(kdf.any(), pdf.any()) - columns = pd.MultiIndex.from_tuples([('a', 'col1'), ('a', 'col2'), ('a', 'col3'), - ('b', 'col4'), ('b', 'col5'), ('c', 'col6')]) + columns = pd.MultiIndex.from_tuples( + [ + ("a", "col1"), + ("a", "col2"), + ("a", "col3"), + ("b", "col4"), + ("b", "col5"), + ("c", "col6"), + ] + ) pdf.columns = columns kdf.columns = columns self.assert_eq(kdf.any(), pdf.any()) - columns.names = ['X', 'Y'] + columns.names = ["X", "Y"] pdf.columns = columns kdf.columns = columns self.assert_eq(kdf.any(), pdf.any()) with self.assertRaisesRegex( - NotImplementedError, 'axis should be either 0 or "index" currently.'): + NotImplementedError, 'axis should be either 0 or "index" currently.' + ): kdf.any(axis=1) def test_rank(self): - pdf = pd.DataFrame(data={'col1': [1, 2, 3, 1], 'col2': [3, 4, 3, 1]}, - columns=['col1', 'col2'], index=np.random.rand(4)) - kdf = ks.from_pandas(pdf) - - self.assert_eq(pdf.rank().sort_index(), - kdf.rank().sort_index()) - self.assert_eq(pdf.rank(ascending=False).sort_index(), - kdf.rank(ascending=False).sort_index()) - self.assert_eq(pdf.rank(method='min').sort_index(), - kdf.rank(method='min').sort_index()) - self.assert_eq(pdf.rank(method='max').sort_index(), - kdf.rank(method='max').sort_index()) - self.assert_eq(pdf.rank(method='first').sort_index(), - kdf.rank(method='first').sort_index()) - self.assert_eq(pdf.rank(method='dense').sort_index(), - kdf.rank(method='dense').sort_index()) + pdf = pd.DataFrame( + data={"col1": [1, 2, 3, 1], "col2": [3, 4, 3, 1]}, + columns=["col1", "col2"], + index=np.random.rand(4), + ) + kdf = ks.from_pandas(pdf) + + self.assert_eq(pdf.rank().sort_index(), kdf.rank().sort_index()) + self.assert_eq( + pdf.rank(ascending=False).sort_index(), kdf.rank(ascending=False).sort_index() + ) + self.assert_eq(pdf.rank(method="min").sort_index(), kdf.rank(method="min").sort_index()) + self.assert_eq(pdf.rank(method="max").sort_index(), kdf.rank(method="max").sort_index()) + self.assert_eq(pdf.rank(method="first").sort_index(), kdf.rank(method="first").sort_index()) + self.assert_eq(pdf.rank(method="dense").sort_index(), kdf.rank(method="dense").sort_index()) msg = "method must be one of 'average', 'min', 'max', 'first', 'dense'" with self.assertRaisesRegex(ValueError, msg): - kdf.rank(method='nothing') + kdf.rank(method="nothing") # multi-index columns - columns = pd.MultiIndex.from_tuples([('x', 'col1'), ('y', 'col2')]) + columns = pd.MultiIndex.from_tuples([("x", "col1"), ("y", "col2")]) pdf.columns = columns kdf.columns = columns - self.assert_eq(pdf.rank().sort_index(), - kdf.rank().sort_index()) + self.assert_eq(pdf.rank().sort_index(), kdf.rank().sort_index()) def test_round(self): - pdf = pd.DataFrame({'A': [0.028208, 0.038683, 0.877076], - 'B': [0.992815, 0.645646, 0.149370], - 'C': [0.173891, 0.577595, 0.491027]}, - columns=['A', 'B', 'C'], index=np.random.rand(3)) - kdf = ks.from_pandas(pdf) - - pser = pd.Series([1, 0, 2], index=['A', 'B', 'C']) - kser = ks.Series([1, 0, 2], index=['A', 'B', 'C']) - 
self.assert_eq(pdf.round(2), - kdf.round(2)) - self.assert_eq(pdf.round({'A': 1, 'C': 2}), - kdf.round({'A': 1, 'C': 2})) - self.assert_eq(pdf.round({'A': 1, 'D': 2}), - kdf.round({'A': 1, 'D': 2})) - self.assert_eq(pdf.round(pser), - kdf.round(kser)) + pdf = pd.DataFrame( + { + "A": [0.028208, 0.038683, 0.877076], + "B": [0.992815, 0.645646, 0.149370], + "C": [0.173891, 0.577595, 0.491027], + }, + columns=["A", "B", "C"], + index=np.random.rand(3), + ) + kdf = ks.from_pandas(pdf) + + pser = pd.Series([1, 0, 2], index=["A", "B", "C"]) + kser = ks.Series([1, 0, 2], index=["A", "B", "C"]) + self.assert_eq(pdf.round(2), kdf.round(2)) + self.assert_eq(pdf.round({"A": 1, "C": 2}), kdf.round({"A": 1, "C": 2})) + self.assert_eq(pdf.round({"A": 1, "D": 2}), kdf.round({"A": 1, "D": 2})) + self.assert_eq(pdf.round(pser), kdf.round(kser)) msg = "decimals must be an integer, a dict-like or a Series" with self.assertRaisesRegex(ValueError, msg): kdf.round(1.5) # multi-index columns - columns = pd.MultiIndex.from_tuples([('X', 'A'), ('X', 'B'), ('Y', 'C')]) + columns = pd.MultiIndex.from_tuples([("X", "A"), ("X", "B"), ("Y", "C")]) pdf.columns = columns kdf.columns = columns pser = pd.Series([1, 0, 2], index=columns) kser = ks.Series([1, 0, 2], index=columns) - self.assert_eq(pdf.round(2), - kdf.round(2)) - self.assert_eq(pdf.round({('X', 'A'): 1, ('Y', 'C'): 2}), - kdf.round({('X', 'A'): 1, ('Y', 'C'): 2})) - self.assert_eq(pdf.round({('X', 'A'): 1, 'Y': 2}), - kdf.round({('X', 'A'): 1, 'Y': 2})) - self.assert_eq(pdf.round(pser), - kdf.round(kser)) + self.assert_eq(pdf.round(2), kdf.round(2)) + self.assert_eq( + pdf.round({("X", "A"): 1, ("Y", "C"): 2}), kdf.round({("X", "A"): 1, ("Y", "C"): 2}) + ) + self.assert_eq(pdf.round({("X", "A"): 1, "Y": 2}), kdf.round({("X", "A"): 1, "Y": 2})) + self.assert_eq(pdf.round(pser), kdf.round(kser)) def test_shift(self): - pdf = pd.DataFrame({'Col1': [10, 20, 15, 30, 45], - 'Col2': [13, 23, 18, 33, 48], - 'Col3': [17, 27, 22, 37, 52]}, - index=np.random.rand(5)) + pdf = pd.DataFrame( + { + "Col1": [10, 20, 15, 30, 45], + "Col2": [13, 23, 18, 33, 48], + "Col3": [17, 27, 22, 37, 52], + }, + index=np.random.rand(5), + ) kdf = ks.from_pandas(pdf) self.assert_eq(pdf.shift(3), kdf.shift(3)) # Need the expected result since pandas 0.23 does not support `fill_value` argument. 
- pdf1 = pd.DataFrame({'Col1': [0, 0, 0, 10, 20], - 'Col2': [0, 0, 0, 13, 23], - 'Col3': [0, 0, 0, 17, 27]}, - index=pdf.index) - self.assert_eq(pdf1, - kdf.shift(periods=3, fill_value=0)) + pdf1 = pd.DataFrame( + {"Col1": [0, 0, 0, 10, 20], "Col2": [0, 0, 0, 13, 23], "Col3": [0, 0, 0, 17, 27]}, + index=pdf.index, + ) + self.assert_eq(pdf1, kdf.shift(periods=3, fill_value=0)) msg = "should be an int" with self.assertRaisesRegex(ValueError, msg): kdf.shift(1.5) # multi-index columns - columns = pd.MultiIndex.from_tuples([('x', 'Col1'), ('x', 'Col2'), ('y', 'Col3')]) + columns = pd.MultiIndex.from_tuples([("x", "Col1"), ("x", "Col2"), ("y", "Col3")]) pdf.columns = columns kdf.columns = columns self.assert_eq(pdf.shift(3), kdf.shift(3)) def test_diff(self): - pdf = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6], - 'b': [1, 1, 2, 3, 5, 8], - 'c': [1, 4, 9, 16, 25, 36]}, - index=np.random.rand(6)) + pdf = pd.DataFrame( + {"a": [1, 2, 3, 4, 5, 6], "b": [1, 1, 2, 3, 5, 8], "c": [1, 4, 9, 16, 25, 36]}, + index=np.random.rand(6), + ) kdf = ks.from_pandas(pdf) self.assert_eq(pdf.diff(), kdf.diff()) @@ -2114,279 +2464,342 @@ def test_diff(self): kdf.diff(axis=1) # multi-index columns - columns = pd.MultiIndex.from_tuples([('x', 'Col1'), ('x', 'Col2'), ('y', 'Col3')]) + columns = pd.MultiIndex.from_tuples([("x", "Col1"), ("x", "Col2"), ("y", "Col3")]) pdf.columns = columns kdf.columns = columns self.assert_eq(pdf.diff(), kdf.diff()) def test_duplicated(self): - pdf = pd.DataFrame({'a': [1, 1, 1, 3], 'b': [1, 1, 1, 4], 'c': [1, 1, 1, 5]}, - index=np.random.rand(4)) - kdf = ks.from_pandas(pdf) - - self.assert_eq(pd.Series(pdf.duplicated(), name='0').sort_index(), - kdf.duplicated().sort_index()) - self.assert_eq(pd.Series(pdf.duplicated(), name='0').sort_index(), - kdf.duplicated().sort_index()) - self.assert_eq(pd.Series(pdf.duplicated(keep='last'), name='0').sort_index(), - kdf.duplicated(keep='last').sort_index()) - self.assert_eq(pd.Series(pdf.duplicated(keep=False), name='0').sort_index(), - kdf.duplicated(keep=False).sort_index()) - self.assert_eq(pd.Series(pdf.duplicated(subset=['b']), name='0').sort_index(), - kdf.duplicated(subset=['b']).sort_index()) + pdf = pd.DataFrame( + {"a": [1, 1, 1, 3], "b": [1, 1, 1, 4], "c": [1, 1, 1, 5]}, index=np.random.rand(4) + ) + kdf = ks.from_pandas(pdf) + + self.assert_eq( + pd.Series(pdf.duplicated(), name="0").sort_index(), kdf.duplicated().sort_index() + ) + self.assert_eq( + pd.Series(pdf.duplicated(), name="0").sort_index(), kdf.duplicated().sort_index() + ) + self.assert_eq( + pd.Series(pdf.duplicated(keep="last"), name="0").sort_index(), + kdf.duplicated(keep="last").sort_index(), + ) + self.assert_eq( + pd.Series(pdf.duplicated(keep=False), name="0").sort_index(), + kdf.duplicated(keep=False).sort_index(), + ) + self.assert_eq( + pd.Series(pdf.duplicated(subset=["b"]), name="0").sort_index(), + kdf.duplicated(subset=["b"]).sort_index(), + ) with self.assertRaisesRegex(ValueError, "'keep' only support 'first', 'last' and False"): - kdf.duplicated(keep='false') + kdf.duplicated(keep="false") with self.assertRaisesRegex(KeyError, "'d'"): - kdf.duplicated(subset=['d']) + kdf.duplicated(subset=["d"]) - pdf.index.name = 'x' - kdf.index.name = 'x' - self.assert_eq(pd.Series(pdf.duplicated(), name='x').sort_index(), - kdf.duplicated().sort_index()) + pdf.index.name = "x" + kdf.index.name = "x" + self.assert_eq( + pd.Series(pdf.duplicated(), name="x").sort_index(), kdf.duplicated().sort_index() + ) # mutli-index columns - columns = pd.MultiIndex.from_tuples([('x', 'a'), 
('x', 'b'), ('y', 'c')]) + columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")]) pdf.columns = columns kdf.columns = columns - self.assert_eq(pd.Series(pdf.duplicated(), name='x').sort_index(), - kdf.duplicated().sort_index()) - self.assert_eq(pd.Series(pdf.duplicated(subset=[('x', 'b')]), name='x').sort_index(), - kdf.duplicated(subset=[('x', 'b')]).sort_index()) + self.assert_eq( + pd.Series(pdf.duplicated(), name="x").sort_index(), kdf.duplicated().sort_index() + ) + self.assert_eq( + pd.Series(pdf.duplicated(subset=[("x", "b")]), name="x").sort_index(), + kdf.duplicated(subset=[("x", "b")]).sort_index(), + ) def test_ffill(self): - pdf = pd.DataFrame({'x': [np.nan, 2, 3, 4, np.nan, 6], - 'y': [1, 2, np.nan, 4, np.nan, np.nan], - 'z': [1, 2, 3, 4, np.nan, np.nan]}, - index=np.random.rand(6)) + pdf = pd.DataFrame( + { + "x": [np.nan, 2, 3, 4, np.nan, 6], + "y": [1, 2, np.nan, 4, np.nan, np.nan], + "z": [1, 2, 3, 4, np.nan, np.nan], + }, + index=np.random.rand(6), + ) kdf = ks.from_pandas(pdf) self.assert_eq(kdf.ffill(), pdf.ffill()) self.assert_eq(kdf.ffill(limit=1), pdf.ffill(limit=1)) def test_bfill(self): - pdf = pd.DataFrame({'x': [np.nan, 2, 3, 4, np.nan, 6], - 'y': [1, 2, np.nan, 4, np.nan, np.nan], - 'z': [1, 2, 3, 4, np.nan, np.nan]}, - index=np.random.rand(6)) + pdf = pd.DataFrame( + { + "x": [np.nan, 2, 3, 4, np.nan, 6], + "y": [1, 2, np.nan, 4, np.nan, np.nan], + "z": [1, 2, 3, 4, np.nan, np.nan], + }, + index=np.random.rand(6), + ) kdf = ks.from_pandas(pdf) self.assert_eq(kdf.bfill(), pdf.bfill()) self.assert_eq(kdf.bfill(limit=1), pdf.bfill(limit=1)) def test_filter(self): - pdf = pd.DataFrame({ - 'aa': ['aa', 'bd', 'bc', 'ab', 'ce'], - 'ba': [1, 2, 3, 4, 5], - 'cb': [1., 2., 3., 4., 5.], - 'db': [1., np.nan, 3., np.nan, 5.], - }) - pdf = pdf.set_index('aa') + pdf = pd.DataFrame( + { + "aa": ["aa", "bd", "bc", "ab", "ce"], + "ba": [1, 2, 3, 4, 5], + "cb": [1.0, 2.0, 3.0, 4.0, 5.0], + "db": [1.0, np.nan, 3.0, np.nan, 5.0], + } + ) + pdf = pdf.set_index("aa") kdf = ks.from_pandas(pdf) self.assert_eq( - kdf.filter(items=['ab', 'aa'], axis=0).sort_index(), - pdf.filter(items=['ab', 'aa'], axis=0).sort_index()) + kdf.filter(items=["ab", "aa"], axis=0).sort_index(), + pdf.filter(items=["ab", "aa"], axis=0).sort_index(), + ) self.assert_eq( - kdf.filter(items=['ba', 'db'], axis=1).sort_index(), - pdf.filter(items=['ba', 'db'], axis=1).sort_index()) + kdf.filter(items=["ba", "db"], axis=1).sort_index(), + pdf.filter(items=["ba", "db"], axis=1).sort_index(), + ) - self.assert_eq(kdf.filter(like='b', axis='index'), pdf.filter(like='b', axis='index')) - self.assert_eq(kdf.filter(like='c', axis='columns'), pdf.filter(like='c', axis='columns')) + self.assert_eq(kdf.filter(like="b", axis="index"), pdf.filter(like="b", axis="index")) + self.assert_eq(kdf.filter(like="c", axis="columns"), pdf.filter(like="c", axis="columns")) - self.assert_eq(kdf.filter(regex='b.*', axis='index'), - pdf.filter(regex='b.*', axis='index')) - self.assert_eq(kdf.filter(regex='b.*', axis='columns'), - pdf.filter(regex='b.*', axis='columns')) + self.assert_eq(kdf.filter(regex="b.*", axis="index"), pdf.filter(regex="b.*", axis="index")) + self.assert_eq( + kdf.filter(regex="b.*", axis="columns"), pdf.filter(regex="b.*", axis="columns") + ) - pdf = pdf.set_index('ba', append=True) + pdf = pdf.set_index("ba", append=True) kdf = ks.from_pandas(pdf) with self.assertRaisesRegex(ValueError, "items should be a list-like object"): - kdf.filter(items='b') + kdf.filter(items="b") with 
self.assertRaisesRegex(ValueError, "Single index must be specified."): - kdf.filter(items=['b'], axis=0) + kdf.filter(items=["b"], axis=0) with self.assertRaisesRegex(ValueError, "Single index must be specified."): - kdf.filter(like='b', axis='index') + kdf.filter(like="b", axis="index") with self.assertRaisesRegex(ValueError, "Single index must be specified."): - kdf.filter(regex='b.*', axis='index') + kdf.filter(regex="b.*", axis="index") with self.assertRaisesRegex(ValueError, "No axis named"): - kdf.filter(regex='b.*', axis=123) + kdf.filter(regex="b.*", axis=123) with self.assertRaisesRegex(TypeError, "Must pass either `items`, `like`"): kdf.filter() with self.assertRaisesRegex(TypeError, "mutually exclusive"): - kdf.filter(regex='b.*', like="aaa") + kdf.filter(regex="b.*", like="aaa") # multi-index columns - pdf = pd.DataFrame({ - ('x', 'aa'): ['aa', 'ab', 'bc', 'bd', 'ce'], - ('x', 'ba'): [1, 2, 3, 4, 5], - ('y', 'cb'): [1., 2., 3., 4., 5.], - ('z', 'db'): [1., np.nan, 3., np.nan, 5.], - }) - pdf = pdf.set_index(('x', 'aa')) + pdf = pd.DataFrame( + { + ("x", "aa"): ["aa", "ab", "bc", "bd", "ce"], + ("x", "ba"): [1, 2, 3, 4, 5], + ("y", "cb"): [1.0, 2.0, 3.0, 4.0, 5.0], + ("z", "db"): [1.0, np.nan, 3.0, np.nan, 5.0], + } + ) + pdf = pdf.set_index(("x", "aa")) kdf = ks.from_pandas(pdf) self.assert_eq( - kdf.filter(items=['ab', 'aa'], axis=0).sort_index(), - pdf.filter(items=['ab', 'aa'], axis=0).sort_index()) + kdf.filter(items=["ab", "aa"], axis=0).sort_index(), + pdf.filter(items=["ab", "aa"], axis=0).sort_index(), + ) self.assert_eq( - kdf.filter(items=[('x', 'ba'), ('z', 'db')], axis=1).sort_index(), - pdf.filter(items=[('x', 'ba'), ('z', 'db')], axis=1).sort_index()) + kdf.filter(items=[("x", "ba"), ("z", "db")], axis=1).sort_index(), + pdf.filter(items=[("x", "ba"), ("z", "db")], axis=1).sort_index(), + ) - self.assert_eq(kdf.filter(like='b', axis='index'), - pdf.filter(like='b', axis='index')) - self.assert_eq(kdf.filter(like='c', axis='columns'), - pdf.filter(like='c', axis='columns')) + self.assert_eq(kdf.filter(like="b", axis="index"), pdf.filter(like="b", axis="index")) + self.assert_eq(kdf.filter(like="c", axis="columns"), pdf.filter(like="c", axis="columns")) - self.assert_eq(kdf.filter(regex='b.*', axis='index'), - pdf.filter(regex='b.*', axis='index')) - self.assert_eq(kdf.filter(regex='b.*', axis='columns'), - pdf.filter(regex='b.*', axis='columns')) + self.assert_eq(kdf.filter(regex="b.*", axis="index"), pdf.filter(regex="b.*", axis="index")) + self.assert_eq( + kdf.filter(regex="b.*", axis="columns"), pdf.filter(regex="b.*", axis="columns") + ) def test_pipe(self): - kdf = ks.DataFrame({'category': ['A', 'A', 'B'], - 'col1': [1, 2, 3], - 'col2': [4, 5, 6]}, - columns=['category', 'col1', 'col2']) + kdf = ks.DataFrame( + {"category": ["A", "A", "B"], "col1": [1, 2, 3], "col2": [4, 5, 6]}, + columns=["category", "col1", "col2"], + ) self.assertRaisesRegex( ValueError, "arg is both the pipe target and a keyword argument", - lambda: kdf.pipe((lambda x: x, 'arg'), arg='1') + lambda: kdf.pipe((lambda x: x, "arg"), arg="1"), ) def test_transform(self): - pdf = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6] * 100, - 'b': [1., 1., 2., 3., 5., 8.] 
* 100, - 'c': [1, 4, 9, 16, 25, 36] * 100}, - columns=['a', 'b', 'c'], - index=np.random.rand(600)) + pdf = pd.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6] * 100, + "b": [1.0, 1.0, 2.0, 3.0, 5.0, 8.0] * 100, + "c": [1, 4, 9, 16, 25, 36] * 100, + }, + columns=["a", "b", "c"], + index=np.random.rand(600), + ) kdf = ks.DataFrame(pdf) - self.assert_eq(kdf.transform(lambda x: x + 1).sort_index(), - pdf.transform(lambda x: x + 1).sort_index()) + self.assert_eq( + kdf.transform(lambda x: x + 1).sort_index(), pdf.transform(lambda x: x + 1).sort_index() + ) with option_context("compute.shortcut_limit", 500): - self.assert_eq(kdf.transform(lambda x: x + 1).sort_index(), - pdf.transform(lambda x: x + 1).sort_index()) + self.assert_eq( + kdf.transform(lambda x: x + 1).sort_index(), + pdf.transform(lambda x: x + 1).sort_index(), + ) with self.assertRaisesRegex(AssertionError, "the first argument should be a callable"): kdf.transform(1) # multi-index columns - columns = pd.MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'c')]) + columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")]) pdf.columns = columns kdf.columns = columns - self.assert_eq(kdf.transform(lambda x: x + 1).sort_index(), - pdf.transform(lambda x: x + 1).sort_index()) + self.assert_eq( + kdf.transform(lambda x: x + 1).sort_index(), pdf.transform(lambda x: x + 1).sort_index() + ) with option_context("compute.shortcut_limit", 500): - self.assert_eq(kdf.transform(lambda x: x + 1).sort_index(), - pdf.transform(lambda x: x + 1).sort_index()) + self.assert_eq( + kdf.transform(lambda x: x + 1).sort_index(), + pdf.transform(lambda x: x + 1).sort_index(), + ) def test_apply(self): - pdf = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6] * 100, - 'b': [1., 1., 2., 3., 5., 8.] * 100, - 'c': [1, 4, 9, 16, 25, 36] * 100}, - columns=['a', 'b', 'c'], - index=np.random.rand(600)) + pdf = pd.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6] * 100, + "b": [1.0, 1.0, 2.0, 3.0, 5.0, 8.0] * 100, + "c": [1, 4, 9, 16, 25, 36] * 100, + }, + columns=["a", "b", "c"], + index=np.random.rand(600), + ) kdf = ks.DataFrame(pdf) - self.assert_eq(kdf.apply(lambda x: x + 1).sort_index(), - pdf.apply(lambda x: x + 1).sort_index()) + self.assert_eq( + kdf.apply(lambda x: x + 1).sort_index(), pdf.apply(lambda x: x + 1).sort_index() + ) with option_context("compute.shortcut_limit", 500): - self.assert_eq(kdf.apply(lambda x: x + 1).sort_index(), - pdf.apply(lambda x: x + 1).sort_index()) + self.assert_eq( + kdf.apply(lambda x: x + 1).sort_index(), pdf.apply(lambda x: x + 1).sort_index() + ) # returning a Series - self.assert_eq(kdf.apply(lambda x: len(x), axis=1).sort_index(), - pdf.apply(lambda x: len(x), axis=1).sort_index()) + self.assert_eq( + kdf.apply(lambda x: len(x), axis=1).sort_index(), + pdf.apply(lambda x: len(x), axis=1).sort_index(), + ) with option_context("compute.shortcut_limit", 500): - self.assert_eq(kdf.apply(lambda x: len(x), axis=1).sort_index(), - pdf.apply(lambda x: len(x), axis=1).sort_index()) + self.assert_eq( + kdf.apply(lambda x: len(x), axis=1).sort_index(), + pdf.apply(lambda x: len(x), axis=1).sort_index(), + ) with self.assertRaisesRegex(AssertionError, "the first argument should be a callable"): kdf.apply(1) with self.assertRaisesRegex(TypeError, "The given function.*1 or 'column'; however"): + def f1(_) -> ks.DataFrame[int]: pass + kdf.apply(f1, axis=0) with self.assertRaisesRegex(TypeError, "The given function.*0 or 'index'; however"): + def f2(_) -> ks.Series[int]: pass + kdf.apply(f2, axis=1) # multi-index columns - columns = 
pd.MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'c')]) + columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")]) pdf.columns = columns kdf.columns = columns - self.assert_eq(kdf.apply(lambda x: x + 1).sort_index(), - pdf.apply(lambda x: x + 1).sort_index()) + self.assert_eq( + kdf.apply(lambda x: x + 1).sort_index(), pdf.apply(lambda x: x + 1).sort_index() + ) with option_context("compute.shortcut_limit", 500): - self.assert_eq(kdf.apply(lambda x: x + 1).sort_index(), - pdf.apply(lambda x: x + 1).sort_index()) + self.assert_eq( + kdf.apply(lambda x: x + 1).sort_index(), pdf.apply(lambda x: x + 1).sort_index() + ) # returning a Series - self.assert_eq(kdf.apply(lambda x: len(x), axis=1).sort_index(), - pdf.apply(lambda x: len(x), axis=1).sort_index()) + self.assert_eq( + kdf.apply(lambda x: len(x), axis=1).sort_index(), + pdf.apply(lambda x: len(x), axis=1).sort_index(), + ) with option_context("compute.shortcut_limit", 500): - self.assert_eq(kdf.apply(lambda x: len(x), axis=1).sort_index(), - pdf.apply(lambda x: len(x), axis=1).sort_index()) + self.assert_eq( + kdf.apply(lambda x: len(x), axis=1).sort_index(), + pdf.apply(lambda x: len(x), axis=1).sort_index(), + ) def test_map_in_pandas(self): - pdf = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6] * 100, - 'b': [1., 1., 2., 3., 5., 8.] * 100, - 'c': [1, 4, 9, 16, 25, 36] * 100}, - columns=['a', 'b', 'c'], - index=np.random.rand(600)) + pdf = pd.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6] * 100, + "b": [1.0, 1.0, 2.0, 3.0, 5.0, 8.0] * 100, + "c": [1, 4, 9, 16, 25, 36] * 100, + }, + columns=["a", "b", "c"], + index=np.random.rand(600), + ) kdf = ks.DataFrame(pdf) - self.assert_eq( - kdf.map_in_pandas(lambda pdf: pdf + 1).sort_index(), - (pdf + 1).sort_index()) + self.assert_eq(kdf.map_in_pandas(lambda pdf: pdf + 1).sort_index(), (pdf + 1).sort_index()) with option_context("compute.shortcut_limit", 500): self.assert_eq( - kdf.map_in_pandas(lambda pdf: pdf + 1).sort_index(), - (pdf + 1).sort_index()) + kdf.map_in_pandas(lambda pdf: pdf + 1).sort_index(), (pdf + 1).sort_index() + ) with self.assertRaisesRegex(AssertionError, "the first argument should be a callable"): kdf.map_in_pandas(1) with self.assertRaisesRegex(TypeError, "The given function.*frame as its type hints"): + def f2(_) -> ks.Series[int]: pass + kdf.map_in_pandas(f2) with self.assertRaisesRegex(ValueError, "The given function should return a frame"): kdf.map_in_pandas(lambda pdf: 1) # multi-index columns - columns = pd.MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'c')]) + columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")]) pdf.columns = columns kdf.columns = columns - self.assert_eq( - kdf.map_in_pandas(lambda x: x + 1).sort_index(), - (pdf + 1).sort_index()) + self.assert_eq(kdf.map_in_pandas(lambda x: x + 1).sort_index(), (pdf + 1).sort_index()) with option_context("compute.shortcut_limit", 500): - self.assert_eq( - kdf.map_in_pandas(lambda x: x + 1).sort_index(), - (pdf + 1).sort_index()) + self.assert_eq(kdf.map_in_pandas(lambda x: x + 1).sort_index(), (pdf + 1).sort_index()) def test_empty_timestamp(self): - pdf = pd.DataFrame({'t': [datetime(2019, 1, 1, 0, 0, 0), - datetime(2019, 1, 2, 0, 0, 0), - datetime(2019, 1, 3, 0, 0, 0)]}, - index=np.random.rand(3)) + pdf = pd.DataFrame( + { + "t": [ + datetime(2019, 1, 1, 0, 0, 0), + datetime(2019, 1, 2, 0, 0, 0), + datetime(2019, 1, 3, 0, 0, 0), + ] + }, + index=np.random.rand(3), + ) kdf = ks.from_pandas(pdf) - self.assert_eq(kdf[kdf['t'] != kdf['t']], pdf[pdf['t'] != pdf['t']]) - 
self.assert_eq(kdf[kdf['t'] != kdf['t']].dtypes, pdf[pdf['t'] != pdf['t']].dtypes) + self.assert_eq(kdf[kdf["t"] != kdf["t"]], pdf[pdf["t"] != pdf["t"]]) + self.assert_eq(kdf[kdf["t"] != kdf["t"]].dtypes, pdf[pdf["t"] != pdf["t"]].dtypes) def test_to_spark(self): kdf = ks.from_pandas(self.pdf) @@ -2398,9 +2811,11 @@ def test_to_spark(self): kdf.to_spark(index_col=["x", "y", "z"]) def test_keys(self): - kdf = ks.DataFrame([[1, 2], [4, 5], [7, 8]], - index=['cobra', 'viper', 'sidewinder'], - columns=['max_speed', 'shield']) + kdf = ks.DataFrame( + [[1, 2], [4, 5], [7, 8]], + index=["cobra", "viper", "sidewinder"], + columns=["max_speed", "shield"], + ) pdf = kdf.to_pandas() self.assert_eq(kdf.keys(), pdf.keys()) @@ -2409,19 +2824,21 @@ def test_quantile(self): kdf = ks.from_pandas(self.pdf) with self.assertRaisesRegex( - NotImplementedError, 'axis should be either 0 or "index" currently.'): - kdf.quantile(.5, axis=1) + NotImplementedError, 'axis should be either 0 or "index" currently.' + ): + kdf.quantile(0.5, axis=1) with self.assertRaisesRegex( - NotImplementedError, "quantile currently doesn't supports numeric_only"): - kdf.quantile(.5, numeric_only=False) + NotImplementedError, "quantile currently doesn't supports numeric_only" + ): + kdf.quantile(0.5, numeric_only=False) def test_pct_change(self): - kdf = ks.DataFrame({'a': [1, 2, 3, 2], - 'b': [4.0, 2.0, 3.0, 1.0], - 'c': [300, 200, 400, 200]}, - index=np.random.rand(4)) - kdf.columns = pd.MultiIndex.from_tuples([('a', 'x'), ('b', 'y'), ('c', 'z')]) + kdf = ks.DataFrame( + {"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0], "c": [300, 200, 400, 200]}, + index=np.random.rand(4), + ) + kdf.columns = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")]) pdf = kdf.to_pandas() self.assert_eq(repr(kdf.pct_change(2)), repr(pdf.pct_change(2))) @@ -2429,25 +2846,20 @@ def test_pct_change(self): def test_where(self): kdf = ks.from_pandas(self.pdf) - with self.assertRaisesRegex(ValueError, 'type of cond must be a DataFrame or Series'): + with self.assertRaisesRegex(ValueError, "type of cond must be a DataFrame or Series"): kdf.where(1) def test_mask(self): kdf = ks.from_pandas(self.pdf) - with self.assertRaisesRegex(ValueError, 'type of cond must be a DataFrame or Series'): + with self.assertRaisesRegex(ValueError, "type of cond must be a DataFrame or Series"): kdf.mask(1) def test_query(self): - kdf = ks.DataFrame( - {'A': range(1, 6), - 'B': range(10, 0, -2), - 'C': range(10, 5, -1)}) + kdf = ks.DataFrame({"A": range(1, 6), "B": range(10, 0, -2), "C": range(10, 5, -1)}) pdf = kdf.to_pandas() - exprs = ('A > B', - 'A < C', - 'C == B') + exprs = ("A > B", "A < C", "C == B") for expr in exprs: self.assert_eq(kdf.query(expr), pdf.query(expr)) @@ -2465,22 +2877,23 @@ def test_query(self): invalid_exprs = (1, 1.0, (exprs[0],), [exprs[0]]) for expr in invalid_exprs: with self.assertRaisesRegex( - ValueError, - 'expr must be a string to be evaluated, {} given' - .format(type(expr))): + ValueError, "expr must be a string to be evaluated, {} given".format(type(expr)) + ): kdf.query(expr) # invalid values for `inplace` - invalid_inplaces = (1, 0, 'True', 'False') + invalid_inplaces = (1, 0, "True", "False") for inplace in invalid_inplaces: with self.assertRaisesRegex( - ValueError, - 'For argument "inplace" expected type bool, received type {}.' 
- .format(type(inplace).__name__)): - kdf.query('a < b', inplace=inplace) + ValueError, + 'For argument "inplace" expected type bool, received type {}.'.format( + type(inplace).__name__ + ), + ): + kdf.query("a < b", inplace=inplace) # doesn't support for MultiIndex columns - columns = pd.MultiIndex.from_tuples([('A', 'Z'), ('B', 'X'), ('C', 'C')]) + columns = pd.MultiIndex.from_tuples([("A", "Z"), ("B", "X"), ("C", "C")]) kdf.columns = columns with self.assertRaisesRegex(ValueError, "Doesn't support for MultiIndex columns"): kdf.query("('A', 'Z') > ('B', 'X')") @@ -2491,7 +2904,7 @@ def test_axes(self): self.assert_list_eq(pdf.axes, kdf.axes) # multi-index columns - columns = pd.MultiIndex.from_tuples([('x', 'a'), ('y', 'b')]) + columns = pd.MultiIndex.from_tuples([("x", "a"), ("y", "b")]) pdf.columns = columns kdf.columns = columns self.assert_list_eq(pdf.axes, kdf.axes) diff --git a/databricks/koalas/tests/test_dataframe_conversion.py b/databricks/koalas/tests/test_dataframe_conversion.py index 9d2a374..7a841a5 100644 --- a/databricks/koalas/tests/test_dataframe_conversion.py +++ b/databricks/koalas/tests/test_dataframe_conversion.py @@ -37,10 +37,7 @@ def tearDown(self): @property def pdf(self): - return pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6], - }, index=[0, 1, 3]) + return pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6],}, index=[0, 1, 3]) @property def kdf(self): @@ -52,7 +49,8 @@ def strip_all_whitespace(str): return str.translate({ord(c): None for c in string.whitespace}) def test_to_html(self): - expected = self.strip_all_whitespace(""" + expected = self.strip_all_whitespace( + """ @@ -63,12 +61,14 @@ def test_to_html(self):
              [... expected HTML table context lines (header: a, b; row 3: 3, 6) ...]
- """) + """ + ) got = self.strip_all_whitespace(self.kdf.to_html()) self.assert_eq(got, expected) # with max_rows set - expected = self.strip_all_whitespace(""" + expected = self.strip_all_whitespace( + """ @@ -78,15 +78,16 @@ def test_to_html(self):
              [... expected HTML table context lines (header: a, b; row 1: 2, 5) ...]
- """) + """ + ) got = self.strip_all_whitespace(self.kdf.to_html(max_rows=2)) self.assert_eq(got, expected) @staticmethod def get_excel_dfs(koalas_location, pandas_location): return { - 'got': pd.read_excel(koalas_location, index_col=0), - 'expected': pd.read_excel(pandas_location, index_col=0) + "got": pd.read_excel(koalas_location, index_col=0), + "expected": pd.read_excel(pandas_location, index_col=0), } def test_to_excel(self): @@ -99,59 +100,53 @@ def test_to_excel(self): kdf.to_excel(koalas_location) pdf.to_excel(pandas_location) dataframes = self.get_excel_dfs(koalas_location, pandas_location) - self.assert_eq(dataframes['got'], dataframes['expected']) + self.assert_eq(dataframes["got"], dataframes["expected"]) kdf.a.to_excel(koalas_location) pdf.a.to_excel(pandas_location) dataframes = self.get_excel_dfs(koalas_location, pandas_location) - self.assert_eq(dataframes['got'], dataframes['expected']) + self.assert_eq(dataframes["got"], dataframes["expected"]) - pdf = pd.DataFrame({ - 'a': [1, None, 3], - 'b': ["one", "two", None], - }, index=[0, 1, 3]) + pdf = pd.DataFrame({"a": [1, None, 3], "b": ["one", "two", None],}, index=[0, 1, 3]) kdf = ks.from_pandas(pdf) - kdf.to_excel(koalas_location, na_rep='null') - pdf.to_excel(pandas_location, na_rep='null') + kdf.to_excel(koalas_location, na_rep="null") + pdf.to_excel(pandas_location, na_rep="null") dataframes = self.get_excel_dfs(koalas_location, pandas_location) - self.assert_eq(dataframes['got'], dataframes['expected']) + self.assert_eq(dataframes["got"], dataframes["expected"]) - pdf = pd.DataFrame({ - 'a': [1.0, 2.0, 3.0], - 'b': [4.0, 5.0, 6.0], - }, index=[0, 1, 3]) + pdf = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0],}, index=[0, 1, 3]) kdf = ks.from_pandas(pdf) - kdf.to_excel(koalas_location, float_format='%.1f') - pdf.to_excel(pandas_location, float_format='%.1f') + kdf.to_excel(koalas_location, float_format="%.1f") + pdf.to_excel(pandas_location, float_format="%.1f") dataframes = self.get_excel_dfs(koalas_location, pandas_location) - self.assert_eq(dataframes['got'], dataframes['expected']) + self.assert_eq(dataframes["got"], dataframes["expected"]) kdf.to_excel(koalas_location, header=False) pdf.to_excel(pandas_location, header=False) dataframes = self.get_excel_dfs(koalas_location, pandas_location) - self.assert_eq(dataframes['got'], dataframes['expected']) + self.assert_eq(dataframes["got"], dataframes["expected"]) kdf.to_excel(koalas_location, index=False) pdf.to_excel(pandas_location, index=False) dataframes = self.get_excel_dfs(koalas_location, pandas_location) - self.assert_eq(dataframes['got'], dataframes['expected']) + self.assert_eq(dataframes["got"], dataframes["expected"]) def test_to_json(self): pdf = self.pdf kdf = ks.from_pandas(pdf) - self.assert_eq(kdf.to_json(), pdf.to_json(orient='records')) + self.assert_eq(kdf.to_json(), pdf.to_json(orient="records")) def test_to_json_with_path(self): - pdf = pd.DataFrame({'a': [1], 'b': ['a']}) + pdf = pd.DataFrame({"a": [1], "b": ["a"]}) kdf = ks.DataFrame(pdf) kdf.to_json(self.tmp_dir, num_files=1) - expected = pdf.to_json(orient='records') + expected = pdf.to_json(orient="records") output_paths = [path for path in os.listdir(self.tmp_dir) if path.startswith("part-")] assert len(output_paths) > 0 @@ -163,10 +158,10 @@ def test_to_clipboard(self): kdf = self.kdf self.assert_eq(kdf.to_clipboard(), pdf.to_clipboard()) - self.assert_eq(kdf.to_clipboard(excel=False), - pdf.to_clipboard(excel=False)) - self.assert_eq(kdf.to_clipboard(sep=";", index=False), - 
pdf.to_clipboard(sep=";", index=False)) + self.assert_eq(kdf.to_clipboard(excel=False), pdf.to_clipboard(excel=False)) + self.assert_eq( + kdf.to_clipboard(sep=";", index=False), pdf.to_clipboard(sep=";", index=False) + ) def test_to_latex(self): pdf = self.pdf @@ -176,49 +171,58 @@ def test_to_latex(self): self.assert_eq(kdf.to_latex(col_space=2), pdf.to_latex(col_space=2)) self.assert_eq(kdf.to_latex(header=True), pdf.to_latex(header=True)) self.assert_eq(kdf.to_latex(index=False), pdf.to_latex(index=False)) - self.assert_eq(kdf.to_latex(na_rep='-'), pdf.to_latex(na_rep='-')) - self.assert_eq(kdf.to_latex(float_format='%.1f'), pdf.to_latex(float_format='%.1f')) + self.assert_eq(kdf.to_latex(na_rep="-"), pdf.to_latex(na_rep="-")) + self.assert_eq(kdf.to_latex(float_format="%.1f"), pdf.to_latex(float_format="%.1f")) self.assert_eq(kdf.to_latex(sparsify=False), pdf.to_latex(sparsify=False)) self.assert_eq(kdf.to_latex(index_names=False), pdf.to_latex(index_names=False)) self.assert_eq(kdf.to_latex(bold_rows=True), pdf.to_latex(bold_rows=True)) - self.assert_eq(kdf.to_latex(decimal=','), pdf.to_latex(decimal=',')) + self.assert_eq(kdf.to_latex(decimal=","), pdf.to_latex(decimal=",")) if LooseVersion(pd.__version__) < LooseVersion("1.0.0"): - self.assert_eq(kdf.to_latex(encoding='ascii'), pdf.to_latex(encoding='ascii')) + self.assert_eq(kdf.to_latex(encoding="ascii"), pdf.to_latex(encoding="ascii")) def test_to_records(self): if LooseVersion(pd.__version__) >= LooseVersion("0.24.0"): - pdf = pd.DataFrame({ - 'A': [1, 2], - 'B': [0.5, 0.75] - }, index=['a', 'b']) + pdf = pd.DataFrame({"A": [1, 2], "B": [0.5, 0.75]}, index=["a", "b"]) kdf = ks.from_pandas(pdf) self.assert_array_eq(kdf.to_records(), pdf.to_records()) - self.assert_array_eq(kdf.to_records(index=False), - pdf.to_records(index=False)) - self.assert_array_eq(kdf.to_records(index_dtypes=" i).a, func)().sort_index(), - getattr(pdf.groupby(pdf.b > i).a, func)().sort_index(), - almost=almost) - self.assert_eq(getattr(kdf.groupby(kdf.b > i), func)().sort_index(), - getattr(pdf.groupby(pdf.b > i), func)().sort_index(), - almost=almost) + self.assert_eq( + getattr(kdf.groupby(kdf.b > i).a, func)().sort_index(), + getattr(pdf.groupby(pdf.b > i).a, func)().sort_index(), + almost=almost, + ) + self.assert_eq( + getattr(kdf.groupby(kdf.b > i), func)().sort_index(), + getattr(pdf.groupby(pdf.b > i), func)().sort_index(), + almost=almost, + ) def test_aggregate(self): - pdf = pd.DataFrame({'A': [1, 1, 2, 2], - 'B': [1, 2, 3, 4], - 'C': [0.362, 0.227, 1.267, -0.562]}) + pdf = pd.DataFrame( + {"A": [1, 1, 2, 2], "B": [1, 2, 3, 4], "C": [0.362, 0.227, 1.267, -0.562]} + ) kdf = ks.from_pandas(pdf) for as_index in [True, False]: - stats_kdf = kdf.groupby('A', as_index=as_index).agg({'B': 'min', 'C': 'sum'}) - stats_pdf = pdf.groupby('A', as_index=as_index).agg({'B': 'min', 'C': 'sum'}) - self.assert_eq(stats_kdf.sort_values(by=['B', 'C']).reset_index(drop=True), - stats_pdf.sort_values(by=['B', 'C']).reset_index(drop=True)) + stats_kdf = kdf.groupby("A", as_index=as_index).agg({"B": "min", "C": "sum"}) + stats_pdf = pdf.groupby("A", as_index=as_index).agg({"B": "min", "C": "sum"}) + self.assert_eq( + stats_kdf.sort_values(by=["B", "C"]).reset_index(drop=True), + stats_pdf.sort_values(by=["B", "C"]).reset_index(drop=True), + ) - stats_kdf = kdf.groupby('A', as_index=as_index).agg({'B': ['min', 'max'], 'C': 'sum'}) - stats_pdf = pdf.groupby('A', as_index=as_index).agg({'B': ['min', 'max'], 'C': 'sum'}) + stats_kdf = kdf.groupby("A", 
as_index=as_index).agg({"B": ["min", "max"], "C": "sum"}) + stats_pdf = pdf.groupby("A", as_index=as_index).agg({"B": ["min", "max"], "C": "sum"}) self.assert_eq( - stats_kdf.sort_values( - by=[('B', 'min'), ('B', 'max'), ('C', 'sum')] - ).reset_index(drop=True), - stats_pdf.sort_values( - by=[('B', 'min'), ('B', 'max'), ('C', 'sum')] - ).reset_index(drop=True)) - - expected_error_message = (r"aggs must be a dict mapping from column name \(string or " - r"tuple\) to aggregate functions \(string or list of strings\).") + stats_kdf.sort_values(by=[("B", "min"), ("B", "max"), ("C", "sum")]).reset_index( + drop=True + ), + stats_pdf.sort_values(by=[("B", "min"), ("B", "max"), ("C", "sum")]).reset_index( + drop=True + ), + ) + + expected_error_message = ( + r"aggs must be a dict mapping from column name \(string or " + r"tuple\) to aggregate functions \(string or list of strings\)." + ) with self.assertRaisesRegex(ValueError, expected_error_message): - kdf.groupby('A', as_index=as_index).agg(0) + kdf.groupby("A", as_index=as_index).agg(0) # multi-index columns - columns = pd.MultiIndex.from_tuples([('X', 'A'), ('X', 'B'), ('Y', 'C')]) + columns = pd.MultiIndex.from_tuples([("X", "A"), ("X", "B"), ("Y", "C")]) pdf.columns = columns kdf.columns = columns for as_index in [True, False]: - stats_kdf = kdf.groupby( - ('X', 'A'), as_index=as_index).agg({('X', 'B'): 'min', ('Y', 'C'): 'sum'}) - stats_pdf = pdf.groupby( - ('X', 'A'), as_index=as_index).agg({('X', 'B'): 'min', ('Y', 'C'): 'sum'}) + stats_kdf = kdf.groupby(("X", "A"), as_index=as_index).agg( + {("X", "B"): "min", ("Y", "C"): "sum"} + ) + stats_pdf = pdf.groupby(("X", "A"), as_index=as_index).agg( + {("X", "B"): "min", ("Y", "C"): "sum"} + ) self.assert_eq( - stats_kdf.sort_values(by=[('X', 'B'), ('Y', 'C')]).reset_index(drop=True), - stats_pdf.sort_values(by=[('X', 'B'), ('Y', 'C')]).reset_index(drop=True)) + stats_kdf.sort_values(by=[("X", "B"), ("Y", "C")]).reset_index(drop=True), + stats_pdf.sort_values(by=[("X", "B"), ("Y", "C")]).reset_index(drop=True), + ) - stats_kdf = kdf.groupby( - ('X', 'A')).agg({('X', 'B'): ['min', 'max'], ('Y', 'C'): 'sum'}) - stats_pdf = pdf.groupby( - ('X', 'A')).agg({('X', 'B'): ['min', 'max'], ('Y', 'C'): 'sum'}) + stats_kdf = kdf.groupby(("X", "A")).agg({("X", "B"): ["min", "max"], ("Y", "C"): "sum"}) + stats_pdf = pdf.groupby(("X", "A")).agg({("X", "B"): ["min", "max"], ("Y", "C"): "sum"}) self.assert_eq( stats_kdf.sort_values( - by=[('X', 'B', 'min'), ('X', 'B', 'max'), ('Y', 'C', 'sum')] + by=[("X", "B", "min"), ("X", "B", "max"), ("Y", "C", "sum")] ).reset_index(drop=True), stats_pdf.sort_values( - by=[('X', 'B', 'min'), ('X', 'B', 'max'), ('Y', 'C', 'sum')] - ).reset_index(drop=True)) + by=[("X", "B", "min"), ("X", "B", "max"), ("Y", "C", "sum")] + ).reset_index(drop=True), + ) def test_aggregate_func_str_list(self): # this is test for cases where only string or list is assigned - pdf = pd.DataFrame({'kind': ['cat', 'dog', 'cat', 'dog'], - 'height': [9.1, 6.0, 9.5, 34.0], - 'weight': [7.9, 7.5, 9.9, 198.0]} - ) + pdf = pd.DataFrame( + { + "kind": ["cat", "dog", "cat", "dog"], + "height": [9.1, 6.0, 9.5, 34.0], + "weight": [7.9, 7.5, 9.9, 198.0], + } + ) kdf = ks.from_pandas(pdf) - agg_funcs = ['max', 'min', ['min', 'max']] + agg_funcs = ["max", "min", ["min", "max"]] for aggfunc in agg_funcs: # Since in koalas groupby, the order of rows might be different # so sort on index to ensure they have same output - sorted_agg_kdf = kdf.groupby('kind').agg(aggfunc).sort_index() - sorted_agg_pdf = 
pdf.groupby('kind').agg(aggfunc).sort_index() + sorted_agg_kdf = kdf.groupby("kind").agg(aggfunc).sort_index() + sorted_agg_pdf = pdf.groupby("kind").agg(aggfunc).sort_index() self.assert_eq(sorted_agg_kdf, sorted_agg_pdf) # test on multi index column case - pdf = pd.DataFrame({'A': [1, 1, 2, 2], - 'B': [1, 2, 3, 4], - 'C': [0.362, 0.227, 1.267, -0.562]}) + pdf = pd.DataFrame( + {"A": [1, 1, 2, 2], "B": [1, 2, 3, 4], "C": [0.362, 0.227, 1.267, -0.562]} + ) kdf = ks.from_pandas(pdf) - columns = pd.MultiIndex.from_tuples([('X', 'A'), ('X', 'B'), ('Y', 'C')]) + columns = pd.MultiIndex.from_tuples([("X", "A"), ("X", "B"), ("Y", "C")]) pdf.columns = columns kdf.columns = columns for aggfunc in agg_funcs: - sorted_agg_kdf = kdf.groupby(('X', 'A')).agg(aggfunc).sort_index() - sorted_agg_pdf = pdf.groupby(('X', 'A')).agg(aggfunc).sort_index() + sorted_agg_kdf = kdf.groupby(("X", "A")).agg(aggfunc).sort_index() + sorted_agg_pdf = pdf.groupby(("X", "A")).agg(aggfunc).sort_index() self.assert_eq(sorted_agg_kdf, sorted_agg_pdf) @unittest.skipIf(pd.__version__ < "0.25.0", "not supported before pandas 0.25.0") def test_aggregate_relabel(self): # this is to test named aggregation in groupby - pdf = pd.DataFrame({"group": ['a', 'a', 'b', 'b'], - "A": [0, 1, 2, 3], - "B": [5, 6, 7, 8]}) + pdf = pd.DataFrame({"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]}) kdf = ks.from_pandas(pdf) # different agg column, same function @@ -271,29 +346,29 @@ def test_aggregate_relabel(self): # test on NamedAgg agg_pdf = ( - pdf.groupby("group") - .agg(b_max=pd.NamedAgg(column="B", aggfunc="max")) - .sort_index() + pdf.groupby("group").agg(b_max=pd.NamedAgg(column="B", aggfunc="max")).sort_index() ) agg_kdf = ( - kdf.groupby("group") - .agg(b_max=ks.NamedAgg(column="B", aggfunc="max")) - .sort_index() + kdf.groupby("group").agg(b_max=ks.NamedAgg(column="B", aggfunc="max")).sort_index() ) self.assert_eq(agg_kdf, agg_pdf) # test on NamedAgg multi columns aggregation agg_pdf = ( pdf.groupby("group") - .agg(b_max=pd.NamedAgg(column="B", aggfunc="max"), - b_min=pd.NamedAgg(column="B", aggfunc="min")) - .sort_index() + .agg( + b_max=pd.NamedAgg(column="B", aggfunc="max"), + b_min=pd.NamedAgg(column="B", aggfunc="min"), + ) + .sort_index() ) agg_kdf = ( kdf.groupby("group") - .agg(b_max=ks.NamedAgg(column="B", aggfunc="max"), - b_min=ks.NamedAgg(column="B", aggfunc="min")) - .sort_index() + .agg( + b_max=ks.NamedAgg(column="B", aggfunc="max"), + b_min=ks.NamedAgg(column="B", aggfunc="min"), + ) + .sort_index() ) self.assert_eq(agg_kdf, agg_pdf) @@ -304,7 +379,7 @@ def test_describe(self): datas.append({"a": [-1, -1, -3], "b": [-4, -5, -6], "c": [-7, -8, -9]}) datas.append({"a": [0, 0, 0], "b": [0, 0, 0], "c": [0, 8, 0]}) # it is okay if string type column as a group key - datas.append({"a": ['a', 'a', 'c'], "b": [4, 5, 6], "c": [7, 8, 9]}) + datas.append({"a": ["a", "a", "c"], "b": [4, 5, 6], "c": [7, 8, 9]}) for data in datas: pdf = pd.DataFrame(data) @@ -319,8 +394,10 @@ def test_describe(self): # 1. Check that non-percentile columns are equal. agg_cols = [col.name for col in kdf.groupby("a")._agg_columns] formatted_percentiles = ["25%", "50%", "75%"] - self.assert_eq(repr(describe_kdf.drop(list(product(agg_cols, formatted_percentiles)))), - repr(describe_pdf.drop(columns=formatted_percentiles, level=1))) + self.assert_eq( + repr(describe_kdf.drop(list(product(agg_cols, formatted_percentiles)))), + repr(describe_pdf.drop(columns=formatted_percentiles, level=1)), + ) # 2. 
Check that percentile columns are equal. percentiles = [0.25, 0.5, 0.75] @@ -328,13 +405,15 @@ def test_describe(self): quantile_pdf = pdf.groupby("a").quantile(percentiles, interpolation="nearest") quantile_pdf = quantile_pdf.unstack(level=1).astype(float) non_percentile_stats = ["count", "mean", "std", "min", "max"] - self.assert_eq(repr(describe_kdf.drop(list(product(agg_cols, non_percentile_stats)))), - repr(quantile_pdf.rename(columns="{:.0%}".format, level=1))) + self.assert_eq( + repr(describe_kdf.drop(list(product(agg_cols, non_percentile_stats)))), + repr(quantile_pdf.rename(columns="{:.0%}".format, level=1)), + ) # not support for string type yet datas = [] - datas.append({"a": ['a', 'a', 'c'], "b": ['d', 'e', 'f'], "c": ['g', 'h', 'i']}) - datas.append({"a": ['a', 'a', 'c'], "b": [4, 0, 1], "c": ['g', 'h', 'i']}) + datas.append({"a": ["a", "a", "c"], "b": ["d", "e", "f"], "c": ["g", "h", "i"]}) + datas.append({"a": ["a", "a", "c"], "b": [4, 0, 1], "c": ["g", "h", "i"]}) for data in datas: pdf = pd.DataFrame(data) kdf = ks.from_pandas(pdf) @@ -343,32 +422,46 @@ def test_describe(self): self.assertRaises(NotImplementedError, lambda: kdf.groupby("a").describe().sort_index()) def test_all_any(self): - pdf = pd.DataFrame({'A': [1, 1, 2, 2, 3, 3, 4, 4, 5, 5], - 'B': [True, True, True, False, False, False, None, True, None, False]}) + pdf = pd.DataFrame( + { + "A": [1, 1, 2, 2, 3, 3, 4, 4, 5, 5], + "B": [True, True, True, False, False, False, None, True, None, False], + } + ) kdf = ks.from_pandas(pdf) for as_index in [True, False]: if as_index: sort = lambda df: df.sort_index() else: - sort = lambda df: df.sort_values('A').reset_index(drop=True) - self.assert_eq(sort(kdf.groupby('A', as_index=as_index).all()), - sort(pdf.groupby('A', as_index=as_index).all())) - self.assert_eq(sort(kdf.groupby('A', as_index=as_index).any()), - sort(pdf.groupby('A', as_index=as_index).any())) - - self.assert_eq(sort(kdf.groupby('A', as_index=as_index).all()).B, - sort(pdf.groupby('A', as_index=as_index).all()).B) - self.assert_eq(sort(kdf.groupby('A', as_index=as_index).any()).B, - sort(pdf.groupby('A', as_index=as_index).any()).B) - - self.assert_eq(kdf.B.groupby(kdf.A).all().sort_index(), - pdf.B.groupby(pdf.A).all().sort_index()) - self.assert_eq(kdf.B.groupby(kdf.A).any().sort_index(), - pdf.B.groupby(pdf.A).any().sort_index()) + sort = lambda df: df.sort_values("A").reset_index(drop=True) + self.assert_eq( + sort(kdf.groupby("A", as_index=as_index).all()), + sort(pdf.groupby("A", as_index=as_index).all()), + ) + self.assert_eq( + sort(kdf.groupby("A", as_index=as_index).any()), + sort(pdf.groupby("A", as_index=as_index).any()), + ) + + self.assert_eq( + sort(kdf.groupby("A", as_index=as_index).all()).B, + sort(pdf.groupby("A", as_index=as_index).all()).B, + ) + self.assert_eq( + sort(kdf.groupby("A", as_index=as_index).any()).B, + sort(pdf.groupby("A", as_index=as_index).any()).B, + ) + + self.assert_eq( + kdf.B.groupby(kdf.A).all().sort_index(), pdf.B.groupby(pdf.A).all().sort_index() + ) + self.assert_eq( + kdf.B.groupby(kdf.A).any().sort_index(), pdf.B.groupby(pdf.A).any().sort_index() + ) # multi-index columns - columns = pd.MultiIndex.from_tuples([('X', 'A'), ('Y', 'B')]) + columns = pd.MultiIndex.from_tuples([("X", "A"), ("Y", "B")]) pdf.columns = columns kdf.columns = columns @@ -376,391 +469,583 @@ def test_all_any(self): if as_index: sort = lambda df: df.sort_index() else: - sort = lambda df: df.sort_values(('X', 'A')).reset_index(drop=True) - self.assert_eq(sort(kdf.groupby(('X', 'A'), 
as_index=as_index).all()), - sort(pdf.groupby(('X', 'A'), as_index=as_index).all())) - self.assert_eq(sort(kdf.groupby(('X', 'A'), as_index=as_index).any()), - sort(pdf.groupby(('X', 'A'), as_index=as_index).any())) + sort = lambda df: df.sort_values(("X", "A")).reset_index(drop=True) + self.assert_eq( + sort(kdf.groupby(("X", "A"), as_index=as_index).all()), + sort(pdf.groupby(("X", "A"), as_index=as_index).all()), + ) + self.assert_eq( + sort(kdf.groupby(("X", "A"), as_index=as_index).any()), + sort(pdf.groupby(("X", "A"), as_index=as_index).any()), + ) def test_raises(self): - kdf = ks.DataFrame({'a': [1, 2, 6, 4, 4, 6, 4, 3, 7], - 'b': [4, 2, 7, 3, 3, 1, 1, 1, 2]}, - index=[0, 1, 3, 5, 6, 8, 9, 9, 9]) + kdf = ks.DataFrame( + {"a": [1, 2, 6, 4, 4, 6, 4, 3, 7], "b": [4, 2, 7, 3, 3, 1, 1, 1, 2]}, + index=[0, 1, 3, 5, 6, 8, 9, 9, 9], + ) # test raises with incorrect key self.assertRaises(ValueError, lambda: kdf.groupby([])) - self.assertRaises(KeyError, lambda: kdf.groupby('x')) - self.assertRaises(KeyError, lambda: kdf.groupby(['a', 'x'])) - self.assertRaises(KeyError, lambda: kdf.groupby('a')['x']) - self.assertRaises(KeyError, lambda: kdf.groupby('a')['b', 'x']) - self.assertRaises(KeyError, lambda: kdf.groupby('a')[['b', 'x']]) + self.assertRaises(KeyError, lambda: kdf.groupby("x")) + self.assertRaises(KeyError, lambda: kdf.groupby(["a", "x"])) + self.assertRaises(KeyError, lambda: kdf.groupby("a")["x"]) + self.assertRaises(KeyError, lambda: kdf.groupby("a")["b", "x"]) + self.assertRaises(KeyError, lambda: kdf.groupby("a")[["b", "x"]]) def test_nunique(self): - pdf = pd.DataFrame({'a': [1, 1, 1, 1, 1, 0, 0, 0, 0, 0], - 'b': [2, 2, 2, 3, 3, 4, 4, 5, 5, 5]}) + pdf = pd.DataFrame( + {"a": [1, 1, 1, 1, 1, 0, 0, 0, 0, 0], "b": [2, 2, 2, 3, 3, 4, 4, 5, 5, 5]} + ) kdf = ks.from_pandas(pdf) - self.assert_eq(kdf.groupby("a").agg({"b": "nunique"}).sort_index(), - pdf.groupby("a").agg({"b": "nunique"}).sort_index()) - self.assert_eq(kdf.groupby("a").nunique().sort_index(), - pdf.groupby("a").nunique().sort_index()) - self.assert_eq(kdf.groupby("a").nunique(dropna=False).sort_index(), - pdf.groupby("a").nunique(dropna=False).sort_index()) - self.assert_eq(kdf.groupby("a")['b'].nunique().sort_index(), - pdf.groupby("a")['b'].nunique().sort_index()) - self.assert_eq(kdf.groupby("a")['b'].nunique(dropna=False).sort_index(), - pdf.groupby("a")['b'].nunique(dropna=False).sort_index()) + self.assert_eq( + kdf.groupby("a").agg({"b": "nunique"}).sort_index(), + pdf.groupby("a").agg({"b": "nunique"}).sort_index(), + ) + self.assert_eq( + kdf.groupby("a").nunique().sort_index(), pdf.groupby("a").nunique().sort_index() + ) + self.assert_eq( + kdf.groupby("a").nunique(dropna=False).sort_index(), + pdf.groupby("a").nunique(dropna=False).sort_index(), + ) + self.assert_eq( + kdf.groupby("a")["b"].nunique().sort_index(), + pdf.groupby("a")["b"].nunique().sort_index(), + ) + self.assert_eq( + kdf.groupby("a")["b"].nunique(dropna=False).sort_index(), + pdf.groupby("a")["b"].nunique(dropna=False).sort_index(), + ) nunique_kdf = kdf.groupby("a", as_index=False).agg({"b": "nunique"}) nunique_pdf = pdf.groupby("a", as_index=False).agg({"b": "nunique"}) self.assert_eq( - nunique_kdf.sort_values(['a', 'b']).reset_index(drop=True), - nunique_pdf.sort_values(['a', 'b']).reset_index(drop=True)) + nunique_kdf.sort_values(["a", "b"]).reset_index(drop=True), + nunique_pdf.sort_values(["a", "b"]).reset_index(drop=True), + ) # multi-index columns - columns = pd.MultiIndex.from_tuples([('x', 'a'), ('y', 'b')]) + columns = 
pd.MultiIndex.from_tuples([("x", "a"), ("y", "b")]) pdf.columns = columns kdf.columns = columns - self.assert_eq(kdf.groupby(("x", "a")).nunique().sort_index(), - pdf.groupby(("x", "a")).nunique().sort_index()) - self.assert_eq(kdf.groupby(("x", "a")).nunique(dropna=False).sort_index(), - pdf.groupby(("x", "a")).nunique(dropna=False).sort_index()) + self.assert_eq( + kdf.groupby(("x", "a")).nunique().sort_index(), + pdf.groupby(("x", "a")).nunique().sort_index(), + ) + self.assert_eq( + kdf.groupby(("x", "a")).nunique(dropna=False).sort_index(), + pdf.groupby(("x", "a")).nunique(dropna=False).sort_index(), + ) def test_value_counts(self): - pdf = pd.DataFrame({'A': [1, 2, 2, 3, 3, 3], - 'B': [1, 1, 2, 3, 3, 3]}, columns=['A', 'B']) + pdf = pd.DataFrame({"A": [1, 2, 2, 3, 3, 3], "B": [1, 1, 2, 3, 3, 3]}, columns=["A", "B"]) kdf = ks.from_pandas(pdf) - self.assert_eq(repr(kdf.groupby("A")['B'].value_counts().sort_index()), - repr(pdf.groupby("A")['B'].value_counts().sort_index())) - self.assert_eq(repr(kdf.groupby("A")['B'] - .value_counts(sort=True, ascending=False).sort_index()), - repr(pdf.groupby("A")['B'] - .value_counts(sort=True, ascending=False).sort_index())) - self.assert_eq(repr(kdf.groupby("A")['B'] - .value_counts(sort=True, ascending=True).sort_index()), - repr(pdf.groupby("A")['B'] - .value_counts(sort=True, ascending=True).sort_index())) + self.assert_eq( + repr(kdf.groupby("A")["B"].value_counts().sort_index()), + repr(pdf.groupby("A")["B"].value_counts().sort_index()), + ) + self.assert_eq( + repr(kdf.groupby("A")["B"].value_counts(sort=True, ascending=False).sort_index()), + repr(pdf.groupby("A")["B"].value_counts(sort=True, ascending=False).sort_index()), + ) + self.assert_eq( + repr(kdf.groupby("A")["B"].value_counts(sort=True, ascending=True).sort_index()), + repr(pdf.groupby("A")["B"].value_counts(sort=True, ascending=True).sort_index()), + ) def test_size(self): - pdf = pd.DataFrame({'A': [1, 2, 2, 3, 3, 3], - 'B': [1, 1, 2, 3, 3, 3]}) + pdf = pd.DataFrame({"A": [1, 2, 2, 3, 3, 3], "B": [1, 1, 2, 3, 3, 3]}) kdf = ks.from_pandas(pdf) - self.assert_eq(kdf.groupby("A").size().sort_index(), - pdf.groupby("A").size().sort_index()) - self.assert_eq(kdf.groupby("A")['B'].size().sort_index(), - pdf.groupby("A")['B'].size().sort_index()) - self.assert_eq(kdf.groupby(['A', 'B']).size().sort_index(), - pdf.groupby(['A', 'B']).size().sort_index()) + self.assert_eq(kdf.groupby("A").size().sort_index(), pdf.groupby("A").size().sort_index()) + self.assert_eq( + kdf.groupby("A")["B"].size().sort_index(), pdf.groupby("A")["B"].size().sort_index() + ) + self.assert_eq( + kdf.groupby(["A", "B"]).size().sort_index(), pdf.groupby(["A", "B"]).size().sort_index() + ) # multi-index columns - columns = pd.MultiIndex.from_tuples([('X', 'A'), ('Y', 'B')]) + columns = pd.MultiIndex.from_tuples([("X", "A"), ("Y", "B")]) pdf.columns = columns kdf.columns = columns - self.assert_eq(kdf.groupby(("X", "A")).size().sort_index(), - pdf.groupby(("X", "A")).size().sort_index()) - self.assert_eq(kdf.groupby([('X', 'A'), ('Y', 'B')]).size().sort_index(), - pdf.groupby([('X', 'A'), ('Y', 'B')]).size().sort_index()) + self.assert_eq( + kdf.groupby(("X", "A")).size().sort_index(), pdf.groupby(("X", "A")).size().sort_index() + ) + self.assert_eq( + kdf.groupby([("X", "A"), ("Y", "B")]).size().sort_index(), + pdf.groupby([("X", "A"), ("Y", "B")]).size().sort_index(), + ) def test_diff(self): - pdf = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6] * 3, - 'b': [1, 1, 2, 3, 5, 8] * 3, - 'c': [1, 4, 9, 16, 25, 36] * 3}) + 
pdf = pd.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6] * 3, + "b": [1, 1, 2, 3, 5, 8] * 3, + "c": [1, 4, 9, 16, 25, 36] * 3, + } + ) kdf = ks.from_pandas(pdf) - self.assert_eq(kdf.groupby("b").diff().sort_index(), - pdf.groupby("b").diff().sort_index()) - self.assert_eq(kdf.groupby(['a', 'b']).diff().sort_index(), - pdf.groupby(['a', 'b']).diff().sort_index()) - self.assert_eq(kdf.groupby(['b'])['a'].diff().sort_index(), - pdf.groupby(['b'])['a'].diff().sort_index(), almost=True) - self.assert_eq(kdf.groupby(['b'])[['a', 'b']].diff().sort_index(), - pdf.groupby(['b'])[['a', 'b']].diff().sort_index(), almost=True) + self.assert_eq(kdf.groupby("b").diff().sort_index(), pdf.groupby("b").diff().sort_index()) + self.assert_eq( + kdf.groupby(["a", "b"]).diff().sort_index(), pdf.groupby(["a", "b"]).diff().sort_index() + ) + self.assert_eq( + kdf.groupby(["b"])["a"].diff().sort_index(), + pdf.groupby(["b"])["a"].diff().sort_index(), + almost=True, + ) + self.assert_eq( + kdf.groupby(["b"])[["a", "b"]].diff().sort_index(), + pdf.groupby(["b"])[["a", "b"]].diff().sort_index(), + almost=True, + ) # multi-index columns - columns = pd.MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'c')]) + columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")]) pdf.columns = columns kdf.columns = columns - self.assert_eq(kdf.groupby(("x", "b")).diff().sort_index(), - pdf.groupby(("x", "b")).diff().sort_index()) - self.assert_eq(kdf.groupby([('x', 'a'), ('x', 'b')]).diff().sort_index(), - pdf.groupby([('x', 'a'), ('x', 'b')]).diff().sort_index()) + self.assert_eq( + kdf.groupby(("x", "b")).diff().sort_index(), pdf.groupby(("x", "b")).diff().sort_index() + ) + self.assert_eq( + kdf.groupby([("x", "a"), ("x", "b")]).diff().sort_index(), + pdf.groupby([("x", "a"), ("x", "b")]).diff().sort_index(), + ) def test_rank(self): - pdf = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6] * 3, - 'b': [1, 1, 2, 3, 5, 8] * 3, - 'c': [1, 4, 9, 16, 25, 36] * 3}, - index=np.random.rand(6 * 3)) + pdf = pd.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6] * 3, + "b": [1, 1, 2, 3, 5, 8] * 3, + "c": [1, 4, 9, 16, 25, 36] * 3, + }, + index=np.random.rand(6 * 3), + ) kdf = ks.from_pandas(pdf) - self.assert_eq(kdf.groupby("b").rank().sort_index(), - pdf.groupby("b").rank().sort_index()) - self.assert_eq(kdf.groupby(['a', 'b']).rank().sort_index(), - pdf.groupby(['a', 'b']).rank().sort_index()) - self.assert_eq(kdf.groupby(['b'])['a'].rank().sort_index(), - pdf.groupby(['b'])['a'].rank().sort_index(), almost=True) - self.assert_eq(kdf.groupby(['b'])[['a', 'c']].rank().sort_index(), - pdf.groupby(['b'])[['a', 'c']].rank().sort_index(), almost=True) + self.assert_eq(kdf.groupby("b").rank().sort_index(), pdf.groupby("b").rank().sort_index()) + self.assert_eq( + kdf.groupby(["a", "b"]).rank().sort_index(), pdf.groupby(["a", "b"]).rank().sort_index() + ) + self.assert_eq( + kdf.groupby(["b"])["a"].rank().sort_index(), + pdf.groupby(["b"])["a"].rank().sort_index(), + almost=True, + ) + self.assert_eq( + kdf.groupby(["b"])[["a", "c"]].rank().sort_index(), + pdf.groupby(["b"])[["a", "c"]].rank().sort_index(), + almost=True, + ) # multi-index columns - columns = pd.MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'c')]) + columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")]) pdf.columns = columns kdf.columns = columns - self.assert_eq(kdf.groupby(("x", "b")).rank().sort_index(), - pdf.groupby(("x", "b")).rank().sort_index()) - self.assert_eq(kdf.groupby([('x', 'a'), ('x', 'b')]).rank().sort_index(), - pdf.groupby([('x', 'a'), 
('x', 'b')]).rank().sort_index()) + self.assert_eq( + kdf.groupby(("x", "b")).rank().sort_index(), pdf.groupby(("x", "b")).rank().sort_index() + ) + self.assert_eq( + kdf.groupby([("x", "a"), ("x", "b")]).rank().sort_index(), + pdf.groupby([("x", "a"), ("x", "b")]).rank().sort_index(), + ) def test_cummin(self): - pdf = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6] * 3, - 'b': [1, 1, 2, 3, 5, 8] * 3, - 'c': [1, 4, 9, 16, 25, 36] * 3}, - index=np.random.rand(6 * 3)) + pdf = pd.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6] * 3, + "b": [1, 1, 2, 3, 5, 8] * 3, + "c": [1, 4, 9, 16, 25, 36] * 3, + }, + index=np.random.rand(6 * 3), + ) kdf = ks.from_pandas(pdf) - self.assert_eq(kdf.groupby("b").cummin().sort_index(), - pdf.groupby("b").cummin().sort_index()) - self.assert_eq(kdf.groupby(['a', 'b']).cummin().sort_index(), - pdf.groupby(['a', 'b']).cummin().sort_index()) - self.assert_eq(kdf.groupby(['b'])['a'].cummin().sort_index(), - pdf.groupby(['b'])['a'].cummin().sort_index(), almost=True) - self.assert_eq(kdf.groupby(['b'])[['a', 'c']].cummin().sort_index(), - pdf.groupby(['b'])[['a', 'c']].cummin().sort_index(), almost=True) + self.assert_eq( + kdf.groupby("b").cummin().sort_index(), pdf.groupby("b").cummin().sort_index() + ) + self.assert_eq( + kdf.groupby(["a", "b"]).cummin().sort_index(), + pdf.groupby(["a", "b"]).cummin().sort_index(), + ) + self.assert_eq( + kdf.groupby(["b"])["a"].cummin().sort_index(), + pdf.groupby(["b"])["a"].cummin().sort_index(), + almost=True, + ) + self.assert_eq( + kdf.groupby(["b"])[["a", "c"]].cummin().sort_index(), + pdf.groupby(["b"])[["a", "c"]].cummin().sort_index(), + almost=True, + ) # multi-index columns - columns = pd.MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'c')]) + columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")]) pdf.columns = columns kdf.columns = columns - self.assert_eq(kdf.groupby(("x", "b")).cummin().sort_index(), - pdf.groupby(("x", "b")).cummin().sort_index()) - self.assert_eq(kdf.groupby([('x', 'a'), ('x', 'b')]).cummin().sort_index(), - pdf.groupby([('x', 'a'), ('x', 'b')]).cummin().sort_index()) + self.assert_eq( + kdf.groupby(("x", "b")).cummin().sort_index(), + pdf.groupby(("x", "b")).cummin().sort_index(), + ) + self.assert_eq( + kdf.groupby([("x", "a"), ("x", "b")]).cummin().sort_index(), + pdf.groupby([("x", "a"), ("x", "b")]).cummin().sort_index(), + ) def test_cummax(self): - pdf = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6] * 3, - 'b': [1, 1, 2, 3, 5, 8] * 3, - 'c': [1, 4, 9, 16, 25, 36] * 3}, - index=np.random.rand(6 * 3)) + pdf = pd.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6] * 3, + "b": [1, 1, 2, 3, 5, 8] * 3, + "c": [1, 4, 9, 16, 25, 36] * 3, + }, + index=np.random.rand(6 * 3), + ) kdf = ks.from_pandas(pdf) - self.assert_eq(kdf.groupby("b").cummax().sort_index(), - pdf.groupby("b").cummax().sort_index()) - self.assert_eq(kdf.groupby(['a', 'b']).cummax().sort_index(), - pdf.groupby(['a', 'b']).cummax().sort_index()) - self.assert_eq(kdf.groupby(['b'])['a'].cummax().sort_index(), - pdf.groupby(['b'])['a'].cummax().sort_index(), almost=True) - self.assert_eq(kdf.groupby(['b'])[['a', 'c']].cummax().sort_index(), - pdf.groupby(['b'])[['a', 'c']].cummax().sort_index(), almost=True) + self.assert_eq( + kdf.groupby("b").cummax().sort_index(), pdf.groupby("b").cummax().sort_index() + ) + self.assert_eq( + kdf.groupby(["a", "b"]).cummax().sort_index(), + pdf.groupby(["a", "b"]).cummax().sort_index(), + ) + self.assert_eq( + kdf.groupby(["b"])["a"].cummax().sort_index(), + pdf.groupby(["b"])["a"].cummax().sort_index(), + 
almost=True, + ) + self.assert_eq( + kdf.groupby(["b"])[["a", "c"]].cummax().sort_index(), + pdf.groupby(["b"])[["a", "c"]].cummax().sort_index(), + almost=True, + ) # multi-index columns - columns = pd.MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'c')]) + columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")]) pdf.columns = columns kdf.columns = columns - self.assert_eq(kdf.groupby(("x", "b")).cummax().sort_index(), - pdf.groupby(("x", "b")).cummax().sort_index()) - self.assert_eq(kdf.groupby([('x', 'a'), ('x', 'b')]).cummax().sort_index(), - pdf.groupby([('x', 'a'), ('x', 'b')]).cummax().sort_index()) + self.assert_eq( + kdf.groupby(("x", "b")).cummax().sort_index(), + pdf.groupby(("x", "b")).cummax().sort_index(), + ) + self.assert_eq( + kdf.groupby([("x", "a"), ("x", "b")]).cummax().sort_index(), + pdf.groupby([("x", "a"), ("x", "b")]).cummax().sort_index(), + ) def test_cumsum(self): - pdf = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6] * 3, - 'b': [1, 1, 2, 3, 5, 8] * 3, - 'c': [1, 4, 9, 16, 25, 36] * 3}, - index=np.random.rand(6 * 3)) + pdf = pd.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6] * 3, + "b": [1, 1, 2, 3, 5, 8] * 3, + "c": [1, 4, 9, 16, 25, 36] * 3, + }, + index=np.random.rand(6 * 3), + ) kdf = ks.from_pandas(pdf) - self.assert_eq(kdf.groupby("b").cumsum().sort_index(), - pdf.groupby("b").cumsum().sort_index()) - self.assert_eq(kdf.groupby(['a', 'b']).cumsum().sort_index(), - pdf.groupby(['a', 'b']).cumsum().sort_index()) - self.assert_eq(kdf.groupby(['b'])['a'].cumsum().sort_index(), - pdf.groupby(['b'])['a'].cumsum().sort_index(), almost=True) - self.assert_eq(kdf.groupby(['b'])[['a', 'c']].cumsum().sort_index(), - pdf.groupby(['b'])[['a', 'c']].cumsum().sort_index(), almost=True) + self.assert_eq( + kdf.groupby("b").cumsum().sort_index(), pdf.groupby("b").cumsum().sort_index() + ) + self.assert_eq( + kdf.groupby(["a", "b"]).cumsum().sort_index(), + pdf.groupby(["a", "b"]).cumsum().sort_index(), + ) + self.assert_eq( + kdf.groupby(["b"])["a"].cumsum().sort_index(), + pdf.groupby(["b"])["a"].cumsum().sort_index(), + almost=True, + ) + self.assert_eq( + kdf.groupby(["b"])[["a", "c"]].cumsum().sort_index(), + pdf.groupby(["b"])[["a", "c"]].cumsum().sort_index(), + almost=True, + ) # multi-index columns - columns = pd.MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'c')]) + columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")]) pdf.columns = columns kdf.columns = columns - self.assert_eq(kdf.groupby(("x", "b")).cumsum().sort_index(), - pdf.groupby(("x", "b")).cumsum().sort_index()) - self.assert_eq(kdf.groupby([('x', 'a'), ('x', 'b')]).cumsum().sort_index(), - pdf.groupby([('x', 'a'), ('x', 'b')]).cumsum().sort_index()) + self.assert_eq( + kdf.groupby(("x", "b")).cumsum().sort_index(), + pdf.groupby(("x", "b")).cumsum().sort_index(), + ) + self.assert_eq( + kdf.groupby([("x", "a"), ("x", "b")]).cumsum().sort_index(), + pdf.groupby([("x", "a"), ("x", "b")]).cumsum().sort_index(), + ) def test_cumprod(self): - pdf = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6] * 3, - 'b': [1, 1, 2, 3, 5, 8] * 3, - 'c': [1, 4, 9, 16, 25, 36] * 3}, - index=np.random.rand(6 * 3)) + pdf = pd.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6] * 3, + "b": [1, 1, 2, 3, 5, 8] * 3, + "c": [1, 4, 9, 16, 25, 36] * 3, + }, + index=np.random.rand(6 * 3), + ) kdf = ks.from_pandas(pdf) - self.assert_eq(kdf.groupby("b").cumprod().sort_index(), - pdf.groupby("b").cumprod().sort_index(), almost=True) - self.assert_eq(kdf.groupby(['a', 'b']).cumprod().sort_index(), - pdf.groupby(['a', 
'b']).cumprod().sort_index(), almost=True) - self.assert_eq(kdf.groupby(['b'])['a'].cumprod().sort_index(), - pdf.groupby(['b'])['a'].cumprod().sort_index(), almost=True) - self.assert_eq(kdf.groupby(['b'])[['a', 'c']].cumprod().sort_index(), - pdf.groupby(['b'])[['a', 'c']].cumprod().sort_index(), almost=True) + self.assert_eq( + kdf.groupby("b").cumprod().sort_index(), + pdf.groupby("b").cumprod().sort_index(), + almost=True, + ) + self.assert_eq( + kdf.groupby(["a", "b"]).cumprod().sort_index(), + pdf.groupby(["a", "b"]).cumprod().sort_index(), + almost=True, + ) + self.assert_eq( + kdf.groupby(["b"])["a"].cumprod().sort_index(), + pdf.groupby(["b"])["a"].cumprod().sort_index(), + almost=True, + ) + self.assert_eq( + kdf.groupby(["b"])[["a", "c"]].cumprod().sort_index(), + pdf.groupby(["b"])[["a", "c"]].cumprod().sort_index(), + almost=True, + ) # multi-index columns - columns = pd.MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'c')]) + columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")]) pdf.columns = columns kdf.columns = columns - self.assert_eq(kdf.groupby(("x", "b")).cumprod().sort_index(), - pdf.groupby(("x", "b")).cumprod().sort_index(), almost=True) - self.assert_eq(kdf.groupby([('x', 'a'), ('x', 'b')]).cumprod().sort_index(), - pdf.groupby([('x', 'a'), ('x', 'b')]).cumprod().sort_index(), almost=True) + self.assert_eq( + kdf.groupby(("x", "b")).cumprod().sort_index(), + pdf.groupby(("x", "b")).cumprod().sort_index(), + almost=True, + ) + self.assert_eq( + kdf.groupby([("x", "a"), ("x", "b")]).cumprod().sort_index(), + pdf.groupby([("x", "a"), ("x", "b")]).cumprod().sort_index(), + almost=True, + ) def test_nsmallest(self): - pdf = pd.DataFrame({'a': [1, 1, 1, 2, 2, 2, 3, 3, 3] * 3, - 'b': [1, 2, 2, 2, 3, 3, 3, 4, 4] * 3, - 'c': [1, 2, 2, 2, 3, 3, 3, 4, 4] * 3, - 'd': [1, 2, 2, 2, 3, 3, 3, 4, 4] * 3}, - index=np.random.rand(9 * 3)) + pdf = pd.DataFrame( + { + "a": [1, 1, 1, 2, 2, 2, 3, 3, 3] * 3, + "b": [1, 2, 2, 2, 3, 3, 3, 4, 4] * 3, + "c": [1, 2, 2, 2, 3, 3, 3, 4, 4] * 3, + "d": [1, 2, 2, 2, 3, 3, 3, 4, 4] * 3, + }, + index=np.random.rand(9 * 3), + ) kdf = ks.from_pandas(pdf) - self.assert_eq(repr(kdf.groupby(['a'])['b'].nsmallest(1).sort_values()), - repr(pdf.groupby(['a'])['b'].nsmallest(1).sort_values())) - self.assert_eq(repr(kdf.groupby(['a'])['b'].nsmallest(2).sort_index()), - repr(pdf.groupby(['a'])['b'].nsmallest(2).sort_index())) + self.assert_eq( + repr(kdf.groupby(["a"])["b"].nsmallest(1).sort_values()), + repr(pdf.groupby(["a"])["b"].nsmallest(1).sort_values()), + ) + self.assert_eq( + repr(kdf.groupby(["a"])["b"].nsmallest(2).sort_index()), + repr(pdf.groupby(["a"])["b"].nsmallest(2).sort_index()), + ) with self.assertRaisesRegex(ValueError, "nsmallest do not support multi-index now"): - kdf.set_index(['a', 'b']).groupby(['c'])['d'].nsmallest(1) + kdf.set_index(["a", "b"]).groupby(["c"])["d"].nsmallest(1) def test_nlargest(self): - pdf = pd.DataFrame({'a': [1, 1, 1, 2, 2, 2, 3, 3, 3] * 3, - 'b': [1, 2, 2, 2, 3, 3, 3, 4, 4] * 3, - 'c': [1, 2, 2, 2, 3, 3, 3, 4, 4] * 3, - 'd': [1, 2, 2, 2, 3, 3, 3, 4, 4] * 3}, - index=np.random.rand(9 * 3)) + pdf = pd.DataFrame( + { + "a": [1, 1, 1, 2, 2, 2, 3, 3, 3] * 3, + "b": [1, 2, 2, 2, 3, 3, 3, 4, 4] * 3, + "c": [1, 2, 2, 2, 3, 3, 3, 4, 4] * 3, + "d": [1, 2, 2, 2, 3, 3, 3, 4, 4] * 3, + }, + index=np.random.rand(9 * 3), + ) kdf = ks.from_pandas(pdf) - self.assert_eq(repr(kdf.groupby(['a'])['b'].nlargest(1).sort_values()), - repr(pdf.groupby(['a'])['b'].nlargest(1).sort_values())) - 
self.assert_eq(repr(kdf.groupby(['a'])['b'].nlargest(2).sort_index()), - repr(pdf.groupby(['a'])['b'].nlargest(2).sort_index())) + self.assert_eq( + repr(kdf.groupby(["a"])["b"].nlargest(1).sort_values()), + repr(pdf.groupby(["a"])["b"].nlargest(1).sort_values()), + ) + self.assert_eq( + repr(kdf.groupby(["a"])["b"].nlargest(2).sort_index()), + repr(pdf.groupby(["a"])["b"].nlargest(2).sort_index()), + ) with self.assertRaisesRegex(ValueError, "nlargest do not support multi-index now"): - kdf.set_index(['a', 'b']).groupby(['c'])['d'].nlargest(1) + kdf.set_index(["a", "b"]).groupby(["c"])["d"].nlargest(1) def test_fillna(self): - pdf = pd.DataFrame({'A': [1, 1, 2, 2] * 3, - 'B': [2, 4, None, 3] * 3, - 'C': [None, None, None, 1] * 3, - 'D': [0, 1, 5, 4] * 3}) + pdf = pd.DataFrame( + { + "A": [1, 1, 2, 2] * 3, + "B": [2, 4, None, 3] * 3, + "C": [None, None, None, 1] * 3, + "D": [0, 1, 5, 4] * 3, + } + ) kdf = ks.from_pandas(pdf) - self.assert_eq(kdf.groupby("A").fillna(0).sort_index(), - pdf.groupby("A").fillna(0).sort_index()) - self.assert_eq(kdf.groupby("A").fillna(method='bfill').sort_index(), - pdf.groupby("A").fillna(method='bfill').sort_index()) - self.assert_eq(kdf.groupby("A").fillna(method='ffill').sort_index(), - pdf.groupby("A").fillna(method='ffill').sort_index()) + self.assert_eq( + kdf.groupby("A").fillna(0).sort_index(), pdf.groupby("A").fillna(0).sort_index() + ) + self.assert_eq( + kdf.groupby("A").fillna(method="bfill").sort_index(), + pdf.groupby("A").fillna(method="bfill").sort_index(), + ) + self.assert_eq( + kdf.groupby("A").fillna(method="ffill").sort_index(), + pdf.groupby("A").fillna(method="ffill").sort_index(), + ) # multi-index columns - columns = pd.MultiIndex.from_tuples([('X', 'A'), ('X', 'B'), ('Y', 'C'), ('Z', 'D')]) + columns = pd.MultiIndex.from_tuples([("X", "A"), ("X", "B"), ("Y", "C"), ("Z", "D")]) pdf.columns = columns kdf.columns = columns - self.assert_eq(kdf.groupby(("X", "A")).fillna(0).sort_index(), - pdf.groupby(("X", "A")).fillna(0).sort_index()) - self.assert_eq(kdf.groupby(("X", "A")).fillna(method='bfill').sort_index(), - pdf.groupby(("X", "A")).fillna(method='bfill').sort_index()) - self.assert_eq(kdf.groupby(("X", "A")).fillna(method='ffill').sort_index(), - pdf.groupby(("X", "A")).fillna(method='ffill').sort_index()) + self.assert_eq( + kdf.groupby(("X", "A")).fillna(0).sort_index(), + pdf.groupby(("X", "A")).fillna(0).sort_index(), + ) + self.assert_eq( + kdf.groupby(("X", "A")).fillna(method="bfill").sort_index(), + pdf.groupby(("X", "A")).fillna(method="bfill").sort_index(), + ) + self.assert_eq( + kdf.groupby(("X", "A")).fillna(method="ffill").sort_index(), + pdf.groupby(("X", "A")).fillna(method="ffill").sort_index(), + ) def test_ffill(self): - pdf = pd.DataFrame({'A': [1, 1, 2, 2] * 3, - 'B': [2, 4, None, 3] * 3, - 'C': [None, None, None, 1] * 3, - 'D': [0, 1, 5, 4] * 3}, - index=np.random.rand(4 * 3)) + pdf = pd.DataFrame( + { + "A": [1, 1, 2, 2] * 3, + "B": [2, 4, None, 3] * 3, + "C": [None, None, None, 1] * 3, + "D": [0, 1, 5, 4] * 3, + }, + index=np.random.rand(4 * 3), + ) kdf = ks.from_pandas(pdf) if LooseVersion(pd.__version__) <= LooseVersion("0.24.2"): - self.assert_eq(kdf.groupby("A").ffill().sort_index(), - pdf.groupby("A").ffill().sort_index().drop('A', 1)) + self.assert_eq( + kdf.groupby("A").ffill().sort_index(), + pdf.groupby("A").ffill().sort_index().drop("A", 1), + ) else: - self.assert_eq(kdf.groupby("A").ffill().sort_index(), - pdf.groupby("A").ffill().sort_index()) - 
self.assert_eq(repr(kdf.groupby("A")['B'].ffill().sort_index()), - repr(pdf.groupby("A")['B'].ffill().sort_index())) + self.assert_eq( + kdf.groupby("A").ffill().sort_index(), pdf.groupby("A").ffill().sort_index() + ) + self.assert_eq( + repr(kdf.groupby("A")["B"].ffill().sort_index()), + repr(pdf.groupby("A")["B"].ffill().sort_index()), + ) # multi-index columns - columns = pd.MultiIndex.from_tuples([('X', 'A'), ('X', 'B'), ('Y', 'C'), ('Z', 'D')]) + columns = pd.MultiIndex.from_tuples([("X", "A"), ("X", "B"), ("Y", "C"), ("Z", "D")]) pdf.columns = columns kdf.columns = columns if LooseVersion(pd.__version__) <= LooseVersion("0.24.2"): - self.assert_eq(kdf.groupby(("X", "A")).ffill().sort_index(), - pdf.groupby(("X", "A")).ffill().sort_index().drop(('X', 'A'), 1)) + self.assert_eq( + kdf.groupby(("X", "A")).ffill().sort_index(), + pdf.groupby(("X", "A")).ffill().sort_index().drop(("X", "A"), 1), + ) else: - self.assert_eq(kdf.groupby(("X", "A")).ffill().sort_index(), - pdf.groupby(("X", "A")).ffill().sort_index()) + self.assert_eq( + kdf.groupby(("X", "A")).ffill().sort_index(), + pdf.groupby(("X", "A")).ffill().sort_index(), + ) def test_bfill(self): - pdf = pd.DataFrame({'A': [1, 1, 2, 2] * 3, - 'B': [2, 4, None, 3] * 3, - 'C': [None, None, None, 1] * 3, - 'D': [0, 1, 5, 4] * 3}, - index=np.random.rand(4 * 3)) + pdf = pd.DataFrame( + { + "A": [1, 1, 2, 2] * 3, + "B": [2, 4, None, 3] * 3, + "C": [None, None, None, 1] * 3, + "D": [0, 1, 5, 4] * 3, + }, + index=np.random.rand(4 * 3), + ) kdf = ks.from_pandas(pdf) if LooseVersion(pd.__version__) <= LooseVersion("0.24.2"): - self.assert_eq(kdf.groupby("A").bfill().sort_index(), - pdf.groupby("A").bfill().sort_index().drop('A', 1)) + self.assert_eq( + kdf.groupby("A").bfill().sort_index(), + pdf.groupby("A").bfill().sort_index().drop("A", 1), + ) else: - self.assert_eq(kdf.groupby("A").bfill().sort_index(), - pdf.groupby("A").bfill().sort_index()) - self.assert_eq(repr(kdf.groupby("A")['B'].bfill().sort_index()), - repr(pdf.groupby("A")['B'].bfill().sort_index())) + self.assert_eq( + kdf.groupby("A").bfill().sort_index(), pdf.groupby("A").bfill().sort_index() + ) + self.assert_eq( + repr(kdf.groupby("A")["B"].bfill().sort_index()), + repr(pdf.groupby("A")["B"].bfill().sort_index()), + ) # multi-index columns - columns = pd.MultiIndex.from_tuples([('X', 'A'), ('X', 'B'), ('Y', 'C'), ('Z', 'D')]) + columns = pd.MultiIndex.from_tuples([("X", "A"), ("X", "B"), ("Y", "C"), ("Z", "D")]) pdf.columns = columns kdf.columns = columns if LooseVersion(pd.__version__) <= LooseVersion("0.24.2"): - self.assert_eq(kdf.groupby(("X", "A")).bfill().sort_index(), - pdf.groupby(("X", "A")).bfill().sort_index().drop(('X', 'A'), 1)) + self.assert_eq( + kdf.groupby(("X", "A")).bfill().sort_index(), + pdf.groupby(("X", "A")).bfill().sort_index().drop(("X", "A"), 1), + ) else: - self.assert_eq(kdf.groupby(("X", "A")).bfill().sort_index(), - pdf.groupby(("X", "A")).bfill().sort_index()) + self.assert_eq( + kdf.groupby(("X", "A")).bfill().sort_index(), + pdf.groupby(("X", "A")).bfill().sort_index(), + ) - @unittest.skipIf(pd.__version__ < '0.24.0', "not supported before pandas 0.24.0") + @unittest.skipIf(pd.__version__ < "0.24.0", "not supported before pandas 0.24.0") def test_shift(self): - pdf = pd.DataFrame({'a': [1, 1, 2, 2, 3, 3] * 3, - 'b': [1, 1, 2, 2, 3, 4] * 3, - 'c': [1, 4, 9, 16, 25, 36] * 3}, - index=np.random.rand(6 * 3)) + pdf = pd.DataFrame( + { + "a": [1, 1, 2, 2, 3, 3] * 3, + "b": [1, 1, 2, 2, 3, 4] * 3, + "c": [1, 4, 9, 16, 25, 36] * 3, + }, + 
index=np.random.rand(6 * 3), + ) kdf = ks.from_pandas(pdf) - self.assert_eq(kdf.groupby('a').shift().sort_index(), - pdf.groupby('a').shift().sort_index()) + self.assert_eq(kdf.groupby("a").shift().sort_index(), pdf.groupby("a").shift().sort_index()) # TODO: seems like a pandas' bug when fill_value is not None? # self.assert_eq(kdf.groupby(['a', 'b']).shift(periods=-1, fill_value=0).sort_index(), # pdf.groupby(['a', 'b']).shift(periods=-1, fill_value=0).sort_index()) - self.assert_eq(kdf.groupby(['b'])['a'].shift().sort_index(), - pdf.groupby(['b'])['a'].shift().sort_index(), almost=True) - self.assert_eq(kdf.groupby(['a', 'b'])['c'].shift().sort_index(), - pdf.groupby(['a', 'b'])['c'].shift().sort_index(), almost=True) + self.assert_eq( + kdf.groupby(["b"])["a"].shift().sort_index(), + pdf.groupby(["b"])["a"].shift().sort_index(), + almost=True, + ) + self.assert_eq( + kdf.groupby(["a", "b"])["c"].shift().sort_index(), + pdf.groupby(["a", "b"])["c"].shift().sort_index(), + almost=True, + ) # TODO: seems like a pandas' bug when fill_value is not None when only pandas>=1.0.0 if LooseVersion(pd.__version__) < LooseVersion("1.0.0"): self.assert_eq( - kdf.groupby(['b'])[['a', 'c']].shift(periods=-1, fill_value=0).sort_index(), - pdf.groupby(['b'])[['a', 'c']].shift(periods=-1, fill_value=0).sort_index(), - almost=True) + kdf.groupby(["b"])[["a", "c"]].shift(periods=-1, fill_value=0).sort_index(), + pdf.groupby(["b"])[["a", "c"]].shift(periods=-1, fill_value=0).sort_index(), + almost=True, + ) # multi-index columns - columns = pd.MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'c')]) + columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")]) pdf.columns = columns kdf.columns = columns - self.assert_eq(kdf.groupby(('x', 'a')).shift().sort_index(), - pdf.groupby(('x', 'a')).shift().sort_index()) + self.assert_eq( + kdf.groupby(("x", "a")).shift().sort_index(), + pdf.groupby(("x", "a")).shift().sort_index(), + ) # TODO: seems like a pandas' bug when fill_value is not None? 
# self.assert_eq(kdf.groupby([('x', 'a'), ('x', 'b')]).shift(periods=-1, # fill_value=0).sort_index(), @@ -768,328 +1053,439 @@ def test_shift(self): # fill_value=0).sort_index()) def test_apply(self): - pdf = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6], - 'b': [1, 1, 2, 3, 5, 8], - 'c': [1, 4, 9, 16, 25, 36]}, columns=['a', 'b', 'c']) + pdf = pd.DataFrame( + {"a": [1, 2, 3, 4, 5, 6], "b": [1, 1, 2, 3, 5, 8], "c": [1, 4, 9, 16, 25, 36]}, + columns=["a", "b", "c"], + ) kdf = ks.from_pandas(pdf) - self.assert_eq(kdf.groupby("b").apply(lambda x: x + 1).sort_index(), - pdf.groupby("b").apply(lambda x: x + 1).sort_index()) - self.assert_eq(kdf.groupby(['a', 'b']).apply(lambda x: x * x).sort_index(), - pdf.groupby(['a', 'b']).apply(lambda x: x * x).sort_index()) - self.assert_eq(kdf.groupby(['b'])['c'].apply(lambda x: x).sort_index(), - pdf.groupby(['b'])['c'].apply(lambda x: x).sort_index()) + self.assert_eq( + kdf.groupby("b").apply(lambda x: x + 1).sort_index(), + pdf.groupby("b").apply(lambda x: x + 1).sort_index(), + ) + self.assert_eq( + kdf.groupby(["a", "b"]).apply(lambda x: x * x).sort_index(), + pdf.groupby(["a", "b"]).apply(lambda x: x * x).sort_index(), + ) + self.assert_eq( + kdf.groupby(["b"])["c"].apply(lambda x: x).sort_index(), + pdf.groupby(["b"])["c"].apply(lambda x: x).sort_index(), + ) with self.assertRaisesRegex(TypeError, " object is not callable"): kdf.groupby("b").apply(1) # multi-index columns - columns = pd.MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'c')]) + columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")]) pdf.columns = columns kdf.columns = columns - self.assert_eq(kdf.groupby(("x", "b")).apply(lambda x: x + 1).sort_index(), - pdf.groupby(("x", "b")).apply(lambda x: x + 1).sort_index()) - self.assert_eq(kdf.groupby([('x', 'a'), ('x', 'b')]).apply(lambda x: x * x).sort_index(), - pdf.groupby([('x', 'a'), ('x', 'b')]).apply(lambda x: x * x).sort_index()) + self.assert_eq( + kdf.groupby(("x", "b")).apply(lambda x: x + 1).sort_index(), + pdf.groupby(("x", "b")).apply(lambda x: x + 1).sort_index(), + ) + self.assert_eq( + kdf.groupby([("x", "a"), ("x", "b")]).apply(lambda x: x * x).sort_index(), + pdf.groupby([("x", "a"), ("x", "b")]).apply(lambda x: x * x).sort_index(), + ) def test_apply_with_new_dataframe(self): - pdf = pd.DataFrame({ - "timestamp": [0.0, 0.5, 1.0, 0.0, 0.5], - "car_id": ['A', 'A', 'A', 'B', 'B'] - }) + pdf = pd.DataFrame( + {"timestamp": [0.0, 0.5, 1.0, 0.0, 0.5], "car_id": ["A", "A", "A", "B", "B"]} + ) kdf = ks.from_pandas(pdf) self.assert_eq( - kdf.groupby('car_id').apply(lambda _: pd.DataFrame({"column": [0.0]})).sort_index(), - pdf.groupby('car_id').apply(lambda _: pd.DataFrame({"column": [0.0]})).sort_index()) + kdf.groupby("car_id").apply(lambda _: pd.DataFrame({"column": [0.0]})).sort_index(), + pdf.groupby("car_id").apply(lambda _: pd.DataFrame({"column": [0.0]})).sort_index(), + ) self.assert_eq( - kdf.groupby('car_id') - .apply(lambda df: pd.DataFrame({'mean': [df['timestamp'].mean()]})).sort_index(), - pdf.groupby('car_id') - .apply(lambda df: pd.DataFrame({"mean": [df['timestamp'].mean()]})).sort_index()) + kdf.groupby("car_id") + .apply(lambda df: pd.DataFrame({"mean": [df["timestamp"].mean()]})) + .sort_index(), + pdf.groupby("car_id") + .apply(lambda df: pd.DataFrame({"mean": [df["timestamp"].mean()]})) + .sort_index(), + ) # dataframe with 1000+ records - pdf = pd.DataFrame({ - "timestamp": [0.0, 0.5, 1.0, 0.0, 0.5] * 300, - "car_id": ['A', 'A', 'A', 'B', 'B'] * 300 - }) + pdf = pd.DataFrame( + { + 
"timestamp": [0.0, 0.5, 1.0, 0.0, 0.5] * 300, + "car_id": ["A", "A", "A", "B", "B"] * 300, + } + ) kdf = ks.from_pandas(pdf) self.assert_eq( - kdf.groupby('car_id').apply(lambda _: pd.DataFrame({"column": [0.0]})).sort_index(), - pdf.groupby('car_id').apply(lambda _: pd.DataFrame({"column": [0.0]})).sort_index()) + kdf.groupby("car_id").apply(lambda _: pd.DataFrame({"column": [0.0]})).sort_index(), + pdf.groupby("car_id").apply(lambda _: pd.DataFrame({"column": [0.0]})).sort_index(), + ) self.assert_eq( - kdf.groupby('car_id') - .apply(lambda df: pd.DataFrame({"mean": [df['timestamp'].mean()]})).sort_index(), - pdf.groupby('car_id') - .apply(lambda df: pd.DataFrame({"mean": [df['timestamp'].mean()]})).sort_index()) + kdf.groupby("car_id") + .apply(lambda df: pd.DataFrame({"mean": [df["timestamp"].mean()]})) + .sort_index(), + pdf.groupby("car_id") + .apply(lambda df: pd.DataFrame({"mean": [df["timestamp"].mean()]})) + .sort_index(), + ) def test_transform(self): - pdf = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6], - 'b': [1, 1, 2, 3, 5, 8], - 'c': [1, 4, 9, 16, 25, 36]}, columns=['a', 'b', 'c']) + pdf = pd.DataFrame( + {"a": [1, 2, 3, 4, 5, 6], "b": [1, 1, 2, 3, 5, 8], "c": [1, 4, 9, 16, 25, 36]}, + columns=["a", "b", "c"], + ) kdf = ks.from_pandas(pdf) - self.assert_eq(kdf.groupby("b").transform(lambda x: x + 1).sort_index(), - pdf.groupby("b").transform(lambda x: x + 1).sort_index()) - self.assert_eq(kdf.groupby(['a', 'b']).transform(lambda x: x * x).sort_index(), - pdf.groupby(['a', 'b']).transform(lambda x: x * x).sort_index()) - self.assert_eq(kdf.groupby(['b'])['c'].transform(lambda x: x).sort_index(), - pdf.groupby(['b'])['c'].transform(lambda x: x).sort_index()) + self.assert_eq( + kdf.groupby("b").transform(lambda x: x + 1).sort_index(), + pdf.groupby("b").transform(lambda x: x + 1).sort_index(), + ) + self.assert_eq( + kdf.groupby(["a", "b"]).transform(lambda x: x * x).sort_index(), + pdf.groupby(["a", "b"]).transform(lambda x: x * x).sort_index(), + ) + self.assert_eq( + kdf.groupby(["b"])["c"].transform(lambda x: x).sort_index(), + pdf.groupby(["b"])["c"].transform(lambda x: x).sort_index(), + ) # multi-index columns - columns = pd.MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'c')]) + columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")]) pdf.columns = columns kdf.columns = columns - self.assert_eq(kdf.groupby(("x", "b")).transform(lambda x: x + 1).sort_index(), - pdf.groupby(("x", "b")).transform(lambda x: x + 1).sort_index()) - self.assert_eq(kdf.groupby([('x', 'a'), ('x', 'b')]) - .transform(lambda x: x * x).sort_index(), - pdf.groupby([('x', 'a'), ('x', 'b')]) - .transform(lambda x: x * x).sort_index()) - - with option_context('compute.shortcut_limit', 1000): - pdf = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6] * 300, - 'b': [1, 1, 2, 3, 5, 8] * 300, - 'c': [1, 4, 9, 16, 25, 36] * 300}, columns=['a', 'b', 'c']) + self.assert_eq( + kdf.groupby(("x", "b")).transform(lambda x: x + 1).sort_index(), + pdf.groupby(("x", "b")).transform(lambda x: x + 1).sort_index(), + ) + self.assert_eq( + kdf.groupby([("x", "a"), ("x", "b")]).transform(lambda x: x * x).sort_index(), + pdf.groupby([("x", "a"), ("x", "b")]).transform(lambda x: x * x).sort_index(), + ) + + with option_context("compute.shortcut_limit", 1000): + pdf = pd.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6] * 300, + "b": [1, 1, 2, 3, 5, 8] * 300, + "c": [1, 4, 9, 16, 25, 36] * 300, + }, + columns=["a", "b", "c"], + ) kdf = ks.from_pandas(pdf) - self.assert_eq(kdf.groupby("b").transform(lambda x: x + 
1).sort_index(), - pdf.groupby("b").transform(lambda x: x + 1).sort_index()) - self.assert_eq(kdf.groupby(['a', 'b']).transform(lambda x: x * x).sort_index(), - pdf.groupby(['a', 'b']).transform(lambda x: x * x).sort_index()) - self.assert_eq(kdf.groupby(['b'])['a'].transform(lambda x: x).sort_index(), - pdf.groupby(['b'])['a'].transform(lambda x: x).sort_index()) + self.assert_eq( + kdf.groupby("b").transform(lambda x: x + 1).sort_index(), + pdf.groupby("b").transform(lambda x: x + 1).sort_index(), + ) + self.assert_eq( + kdf.groupby(["a", "b"]).transform(lambda x: x * x).sort_index(), + pdf.groupby(["a", "b"]).transform(lambda x: x * x).sort_index(), + ) + self.assert_eq( + kdf.groupby(["b"])["a"].transform(lambda x: x).sort_index(), + pdf.groupby(["b"])["a"].transform(lambda x: x).sort_index(), + ) with self.assertRaisesRegex(TypeError, " object is not callable"): kdf.groupby("b").transform(1) # multi-index columns - columns = pd.MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'c')]) + columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")]) pdf.columns = columns kdf.columns = columns - self.assert_eq(kdf.groupby(("x", "b")).transform(lambda x: x + 1).sort_index(), - pdf.groupby(("x", "b")).transform(lambda x: x + 1).sort_index()) - self.assert_eq(kdf.groupby([('x', 'a'), ('x', 'b')]) - .transform(lambda x: x * x).sort_index(), - pdf.groupby([('x', 'a'), ('x', 'b')]) - .transform(lambda x: x * x).sort_index()) + self.assert_eq( + kdf.groupby(("x", "b")).transform(lambda x: x + 1).sort_index(), + pdf.groupby(("x", "b")).transform(lambda x: x + 1).sort_index(), + ) + self.assert_eq( + kdf.groupby([("x", "a"), ("x", "b")]).transform(lambda x: x * x).sort_index(), + pdf.groupby([("x", "a"), ("x", "b")]).transform(lambda x: x * x).sort_index(), + ) def test_filter(self): - pdf = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6], - 'b': [1, 1, 2, 3, 5, 8], - 'c': [1, 4, 9, 16, 25, 36]}, columns=['a', 'b', 'c']) + pdf = pd.DataFrame( + {"a": [1, 2, 3, 4, 5, 6], "b": [1, 1, 2, 3, 5, 8], "c": [1, 4, 9, 16, 25, 36]}, + columns=["a", "b", "c"], + ) kdf = ks.from_pandas(pdf) - self.assert_eq(kdf.groupby("b").filter(lambda x: x.b.mean() < 4).sort_index(), - pdf.groupby("b").filter(lambda x: x.b.mean() < 4).sort_index()) - self.assert_eq(kdf.groupby(['a', 'b']).filter(lambda x: any(x.a == 2)).sort_index(), - pdf.groupby(['a', 'b']).filter(lambda x: any(x.a == 2)).sort_index()) + self.assert_eq( + kdf.groupby("b").filter(lambda x: x.b.mean() < 4).sort_index(), + pdf.groupby("b").filter(lambda x: x.b.mean() < 4).sort_index(), + ) + self.assert_eq( + kdf.groupby(["a", "b"]).filter(lambda x: any(x.a == 2)).sort_index(), + pdf.groupby(["a", "b"]).filter(lambda x: any(x.a == 2)).sort_index(), + ) with self.assertRaisesRegex(TypeError, " object is not callable"): kdf.groupby("b").filter(1) # multi-index columns - columns = pd.MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'c')]) + columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")]) pdf.columns = columns kdf.columns = columns - self.assert_eq(kdf.groupby(("x", "b")) - .filter(lambda x: x[('x', 'b')].mean() < 4).sort_index(), - pdf.groupby(("x", "b")) - .filter(lambda x: x[('x', 'b')].mean() < 4).sort_index()) - self.assert_eq(kdf.groupby([('x', 'a'), ('x', 'b')]) - .filter(lambda x: any(x[('x', 'a')] == 2)).sort_index(), - pdf.groupby([('x', 'a'), ('x', 'b')]) - .filter(lambda x: any(x[('x', 'a')] == 2)).sort_index()) + self.assert_eq( + kdf.groupby(("x", "b")).filter(lambda x: x[("x", "b")].mean() < 4).sort_index(), + 
pdf.groupby(("x", "b")).filter(lambda x: x[("x", "b")].mean() < 4).sort_index(), + ) + self.assert_eq( + kdf.groupby([("x", "a"), ("x", "b")]) + .filter(lambda x: any(x[("x", "a")] == 2)) + .sort_index(), + pdf.groupby([("x", "a"), ("x", "b")]) + .filter(lambda x: any(x[("x", "a")] == 2)) + .sort_index(), + ) def test_idxmax(self): - pdf = pd.DataFrame({'a': [1, 1, 2, 2, 3] * 3, - 'b': [1, 2, 3, 4, 5] * 3, - 'c': [5, 4, 3, 2, 1] * 3}) + pdf = pd.DataFrame( + {"a": [1, 1, 2, 2, 3] * 3, "b": [1, 2, 3, 4, 5] * 3, "c": [5, 4, 3, 2, 1] * 3} + ) kdf = ks.from_pandas(pdf) - self.assert_eq(pdf.groupby(['a']).idxmax().sort_index(), - kdf.groupby(['a']).idxmax().sort_index()) - self.assert_eq(pdf.groupby(['a']).idxmax(skipna=False).sort_index(), - kdf.groupby(['a']).idxmax(skipna=False).sort_index()) + self.assert_eq( + pdf.groupby(["a"]).idxmax().sort_index(), kdf.groupby(["a"]).idxmax().sort_index() + ) + self.assert_eq( + pdf.groupby(["a"]).idxmax(skipna=False).sort_index(), + kdf.groupby(["a"]).idxmax(skipna=False).sort_index(), + ) - with self.assertRaisesRegex(ValueError, 'idxmax only support one-level index now'): - kdf.set_index(['a', 'b']).groupby(['c']).idxmax() + with self.assertRaisesRegex(ValueError, "idxmax only support one-level index now"): + kdf.set_index(["a", "b"]).groupby(["c"]).idxmax() # multi-index columns - columns = pd.MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'c')]) + columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")]) pdf.columns = columns kdf.columns = columns - self.assert_eq(pdf.groupby(('x', 'a')).idxmax().sort_index(), - kdf.groupby(('x', 'a')).idxmax().sort_index()) - self.assert_eq(pdf.groupby(('x', 'a')).idxmax(skipna=False).sort_index(), - kdf.groupby(('x', 'a')).idxmax(skipna=False).sort_index()) + self.assert_eq( + pdf.groupby(("x", "a")).idxmax().sort_index(), + kdf.groupby(("x", "a")).idxmax().sort_index(), + ) + self.assert_eq( + pdf.groupby(("x", "a")).idxmax(skipna=False).sort_index(), + kdf.groupby(("x", "a")).idxmax(skipna=False).sort_index(), + ) def test_idxmin(self): - pdf = pd.DataFrame({'a': [1, 1, 2, 2, 3] * 3, - 'b': [1, 2, 3, 4, 5] * 3, - 'c': [5, 4, 3, 2, 1] * 3}) + pdf = pd.DataFrame( + {"a": [1, 1, 2, 2, 3] * 3, "b": [1, 2, 3, 4, 5] * 3, "c": [5, 4, 3, 2, 1] * 3} + ) kdf = ks.from_pandas(pdf) - self.assert_eq(pdf.groupby(['a']).idxmin().sort_index(), - kdf.groupby(['a']).idxmin().sort_index()) - self.assert_eq(pdf.groupby(['a']).idxmin(skipna=False).sort_index(), - kdf.groupby(['a']).idxmin(skipna=False).sort_index()) + self.assert_eq( + pdf.groupby(["a"]).idxmin().sort_index(), kdf.groupby(["a"]).idxmin().sort_index() + ) + self.assert_eq( + pdf.groupby(["a"]).idxmin(skipna=False).sort_index(), + kdf.groupby(["a"]).idxmin(skipna=False).sort_index(), + ) - with self.assertRaisesRegex(ValueError, 'idxmin only support one-level index now'): - kdf.set_index(['a', 'b']).groupby(['c']).idxmin() + with self.assertRaisesRegex(ValueError, "idxmin only support one-level index now"): + kdf.set_index(["a", "b"]).groupby(["c"]).idxmin() # multi-index columns - columns = pd.MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'c')]) + columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")]) pdf.columns = columns kdf.columns = columns - self.assert_eq(pdf.groupby(('x', 'a')).idxmin().sort_index(), - kdf.groupby(('x', 'a')).idxmin().sort_index()) - self.assert_eq(pdf.groupby(('x', 'a')).idxmin(skipna=False).sort_index(), - kdf.groupby(('x', 'a')).idxmin(skipna=False).sort_index()) + self.assert_eq( + 
pdf.groupby(("x", "a")).idxmin().sort_index(), + kdf.groupby(("x", "a")).idxmin().sort_index(), + ) + self.assert_eq( + pdf.groupby(("x", "a")).idxmin(skipna=False).sort_index(), + kdf.groupby(("x", "a")).idxmin(skipna=False).sort_index(), + ) def test_head(self): - pdf = pd.DataFrame({'a': [1, 1, 1, 1, 2, 2, 2, 3, 3, 3] * 3, - 'b': [2, 3, 1, 4, 6, 9, 8, 10, 7, 5] * 3, - 'c': [3, 5, 2, 5, 1, 2, 6, 4, 3, 6] * 3}, - index=np.random.rand(10 * 3)) + pdf = pd.DataFrame( + { + "a": [1, 1, 1, 1, 2, 2, 2, 3, 3, 3] * 3, + "b": [2, 3, 1, 4, 6, 9, 8, 10, 7, 5] * 3, + "c": [3, 5, 2, 5, 1, 2, 6, 4, 3, 6] * 3, + }, + index=np.random.rand(10 * 3), + ) kdf = ks.from_pandas(pdf) - self.assert_eq(pdf.groupby('a').head(2).sort_index(), - kdf.groupby('a').head(2).sort_index()) - self.assert_eq(pdf.groupby('a').head(-2).sort_index(), - kdf.groupby('a').head(-2).sort_index()) - self.assert_eq(pdf.groupby('a').head(100000).sort_index(), - kdf.groupby('a').head(100000).sort_index()) + self.assert_eq(pdf.groupby("a").head(2).sort_index(), kdf.groupby("a").head(2).sort_index()) + self.assert_eq( + pdf.groupby("a").head(-2).sort_index(), kdf.groupby("a").head(-2).sort_index() + ) + self.assert_eq( + pdf.groupby("a").head(100000).sort_index(), kdf.groupby("a").head(100000).sort_index() + ) - self.assert_eq(pdf.groupby('a')['b'].head(2).sort_index(), - kdf.groupby('a')['b'].head(2).sort_index()) - self.assert_eq(pdf.groupby('a')['b'].head(-2).sort_index(), - kdf.groupby('a')['b'].head(-2).sort_index()) - self.assert_eq(pdf.groupby('a')['b'].head(100000).sort_index(), - kdf.groupby('a')['b'].head(100000).sort_index()) + self.assert_eq( + pdf.groupby("a")["b"].head(2).sort_index(), kdf.groupby("a")["b"].head(2).sort_index() + ) + self.assert_eq( + pdf.groupby("a")["b"].head(-2).sort_index(), kdf.groupby("a")["b"].head(-2).sort_index() + ) + self.assert_eq( + pdf.groupby("a")["b"].head(100000).sort_index(), + kdf.groupby("a")["b"].head(100000).sort_index(), + ) # multi-index - midx = pd.MultiIndex([['x', 'y'], - ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']], - [[0, 0, 0, 0, 0, 1, 1, 1, 1, 1], - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]]) - pdf = pd.DataFrame({'a': [1, 1, 1, 1, 2, 2, 2, 3, 3, 3], - 'b': [2, 3, 1, 4, 6, 9, 8, 10, 7, 5], - 'c': [3, 5, 2, 5, 1, 2, 6, 4, 3, 6]}, - columns=['a', 'b', 'c'], - index=midx) + midx = pd.MultiIndex( + [["x", "y"], ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"]], + [[0, 0, 0, 0, 0, 1, 1, 1, 1, 1], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]], + ) + pdf = pd.DataFrame( + { + "a": [1, 1, 1, 1, 2, 2, 2, 3, 3, 3], + "b": [2, 3, 1, 4, 6, 9, 8, 10, 7, 5], + "c": [3, 5, 2, 5, 1, 2, 6, 4, 3, 6], + }, + columns=["a", "b", "c"], + index=midx, + ) kdf = ks.from_pandas(pdf) - self.assert_eq(pdf.groupby('a').head(2).sort_index(), - kdf.groupby('a').head(2).sort_index()) - self.assert_eq(pdf.groupby('a').head(-2).sort_index(), - kdf.groupby('a').head(-2).sort_index()) - self.assert_eq(pdf.groupby('a').head(100000).sort_index(), - kdf.groupby('a').head(100000).sort_index()) + self.assert_eq(pdf.groupby("a").head(2).sort_index(), kdf.groupby("a").head(2).sort_index()) + self.assert_eq( + pdf.groupby("a").head(-2).sort_index(), kdf.groupby("a").head(-2).sort_index() + ) + self.assert_eq( + pdf.groupby("a").head(100000).sort_index(), kdf.groupby("a").head(100000).sort_index() + ) - self.assert_eq(pdf.groupby('a')['b'].head(2).sort_index(), - kdf.groupby('a')['b'].head(2).sort_index()) - self.assert_eq(pdf.groupby('a')['b'].head(-2).sort_index(), - kdf.groupby('a')['b'].head(-2).sort_index()) - 
self.assert_eq(pdf.groupby('a')['b'].head(100000).sort_index(), - kdf.groupby('a')['b'].head(100000).sort_index()) + self.assert_eq( + pdf.groupby("a")["b"].head(2).sort_index(), kdf.groupby("a")["b"].head(2).sort_index() + ) + self.assert_eq( + pdf.groupby("a")["b"].head(-2).sort_index(), kdf.groupby("a")["b"].head(-2).sort_index() + ) + self.assert_eq( + pdf.groupby("a")["b"].head(100000).sort_index(), + kdf.groupby("a")["b"].head(100000).sort_index(), + ) # multi-index columns - columns = pd.MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'c')]) + columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")]) pdf.columns = columns kdf.columns = columns - self.assert_eq(pdf.groupby(('x', 'a')).head(2).sort_index(), - kdf.groupby(('x', 'a')).head(2).sort_index()) - self.assert_eq(pdf.groupby(('x', 'a')).head(-2).sort_index(), - kdf.groupby(('x', 'a')).head(-2).sort_index()) - self.assert_eq(pdf.groupby(('x', 'a')).head(100000).sort_index(), - kdf.groupby(('x', 'a')).head(100000).sort_index()) + self.assert_eq( + pdf.groupby(("x", "a")).head(2).sort_index(), + kdf.groupby(("x", "a")).head(2).sort_index(), + ) + self.assert_eq( + pdf.groupby(("x", "a")).head(-2).sort_index(), + kdf.groupby(("x", "a")).head(-2).sort_index(), + ) + self.assert_eq( + pdf.groupby(("x", "a")).head(100000).sort_index(), + kdf.groupby(("x", "a")).head(100000).sort_index(), + ) def test_missing(self): - kdf = ks.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + kdf = ks.DataFrame({"a": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) # DataFrameGroupBy functions - missing_functions = inspect.getmembers(_MissingPandasLikeDataFrameGroupBy, - inspect.isfunction) - unsupported_functions = [name for (name, type_) in missing_functions - if type_.__name__ == 'unsupported_function'] + missing_functions = inspect.getmembers( + _MissingPandasLikeDataFrameGroupBy, inspect.isfunction + ) + unsupported_functions = [ + name for (name, type_) in missing_functions if type_.__name__ == "unsupported_function" + ] for name in unsupported_functions: with self.assertRaisesRegex( - PandasNotImplementedError, - "method.*GroupBy.*{}.*not implemented( yet\\.|\\. .+)".format(name)): - getattr(kdf.groupby('a'), name)() - - deprecated_functions = [name for (name, type_) in missing_functions - if type_.__name__ == 'deprecated_function'] + PandasNotImplementedError, + "method.*GroupBy.*{}.*not implemented( yet\\.|\\. .+)".format(name), + ): + getattr(kdf.groupby("a"), name)() + + deprecated_functions = [ + name for (name, type_) in missing_functions if type_.__name__ == "deprecated_function" + ] for name in deprecated_functions: - with self.assertRaisesRegex(PandasNotImplementedError, - "method.*GroupBy.*{}.*is deprecated" - .format(name)): - getattr(kdf.groupby('a'), name)() + with self.assertRaisesRegex( + PandasNotImplementedError, "method.*GroupBy.*{}.*is deprecated".format(name) + ): + getattr(kdf.groupby("a"), name)() # SeriesGroupBy functions - missing_functions = inspect.getmembers(_MissingPandasLikeSeriesGroupBy, - inspect.isfunction) - unsupported_functions = [name for (name, type_) in missing_functions - if type_.__name__ == 'unsupported_function'] + missing_functions = inspect.getmembers(_MissingPandasLikeSeriesGroupBy, inspect.isfunction) + unsupported_functions = [ + name for (name, type_) in missing_functions if type_.__name__ == "unsupported_function" + ] for name in unsupported_functions: with self.assertRaisesRegex( - PandasNotImplementedError, - "method.*GroupBy.*{}.*not implemented( yet\\.|\\. 
.+)".format(name)): + PandasNotImplementedError, + "method.*GroupBy.*{}.*not implemented( yet\\.|\\. .+)".format(name), + ): getattr(kdf.a.groupby(kdf.a), name)() - deprecated_functions = [name for (name, type_) in missing_functions - if type_.__name__ == 'deprecated_function'] + deprecated_functions = [ + name for (name, type_) in missing_functions if type_.__name__ == "deprecated_function" + ] for name in deprecated_functions: - with self.assertRaisesRegex(PandasNotImplementedError, - "method.*GroupBy.*{}.*is deprecated" - .format(name)): + with self.assertRaisesRegex( + PandasNotImplementedError, "method.*GroupBy.*{}.*is deprecated".format(name) + ): getattr(kdf.a.groupby(kdf.a), name)() # DataFrameGroupBy properties - missing_properties = inspect.getmembers(_MissingPandasLikeDataFrameGroupBy, - lambda o: isinstance(o, property)) - unsupported_properties = [name for (name, type_) in missing_properties - if type_.fget.__name__ == 'unsupported_property'] + missing_properties = inspect.getmembers( + _MissingPandasLikeDataFrameGroupBy, lambda o: isinstance(o, property) + ) + unsupported_properties = [ + name + for (name, type_) in missing_properties + if type_.fget.__name__ == "unsupported_property" + ] for name in unsupported_properties: with self.assertRaisesRegex( - PandasNotImplementedError, - "property.*GroupBy.*{}.*not implemented( yet\\.|\\. .+)".format(name)): - getattr(kdf.groupby('a'), name) - deprecated_properties = [name for (name, type_) in missing_properties - if type_.fget.__name__ == 'deprecated_property'] + PandasNotImplementedError, + "property.*GroupBy.*{}.*not implemented( yet\\.|\\. .+)".format(name), + ): + getattr(kdf.groupby("a"), name) + deprecated_properties = [ + name + for (name, type_) in missing_properties + if type_.fget.__name__ == "deprecated_property" + ] for name in deprecated_properties: - with self.assertRaisesRegex(PandasNotImplementedError, - "property.*GroupBy.*{}.*is deprecated" - .format(name)): - getattr(kdf.groupby('a'), name) + with self.assertRaisesRegex( + PandasNotImplementedError, "property.*GroupBy.*{}.*is deprecated".format(name) + ): + getattr(kdf.groupby("a"), name) # SeriesGroupBy properties - missing_properties = inspect.getmembers(_MissingPandasLikeSeriesGroupBy, - lambda o: isinstance(o, property)) - unsupported_properties = [name for (name, type_) in missing_properties - if type_.fget.__name__ == 'unsupported_property'] + missing_properties = inspect.getmembers( + _MissingPandasLikeSeriesGroupBy, lambda o: isinstance(o, property) + ) + unsupported_properties = [ + name + for (name, type_) in missing_properties + if type_.fget.__name__ == "unsupported_property" + ] for name in unsupported_properties: with self.assertRaisesRegex( - PandasNotImplementedError, - "property.*GroupBy.*{}.*not implemented( yet\\.|\\. .+)".format(name)): + PandasNotImplementedError, + "property.*GroupBy.*{}.*not implemented( yet\\.|\\. 
.+)".format(name), + ): getattr(kdf.a.groupby(kdf.a), name) - deprecated_properties = [name for (name, type_) in missing_properties - if type_.fget.__name__ == 'deprecated_property'] + deprecated_properties = [ + name + for (name, type_) in missing_properties + if type_.fget.__name__ == "deprecated_property" + ] for name in deprecated_properties: - with self.assertRaisesRegex(PandasNotImplementedError, - "property.*GroupBy.*{}.*is deprecated" - .format(name)): + with self.assertRaisesRegex( + PandasNotImplementedError, "property.*GroupBy.*{}.*is deprecated".format(name) + ): getattr(kdf.a.groupby(kdf.a), name) @staticmethod def test_is_multi_agg_with_relabel(): - assert _is_multi_agg_with_relabel(a='max') is False - assert _is_multi_agg_with_relabel(a_min=('a', 'max'), a_max=('a', 'min')) is True + assert _is_multi_agg_with_relabel(a="max") is False + assert _is_multi_agg_with_relabel(a_min=("a", "max"), a_max=("a", "min")) is True diff --git a/databricks/koalas/tests/test_indexes.py b/databricks/koalas/tests/test_indexes.py index 6703d20..74f0a83 100644 --- a/databricks/koalas/tests/test_indexes.py +++ b/databricks/koalas/tests/test_indexes.py @@ -28,44 +28,45 @@ class IndexesTest(ReusedSQLTestCase, TestUtils): - @property def pdf(self): - return pd.DataFrame({ - 'a': [1, 2, 3, 4, 5, 6, 7, 8, 9], - 'b': [4, 5, 6, 3, 2, 1, 0, 0, 0], - }, index=[0, 1, 3, 5, 6, 8, 9, 9, 9]) + return pd.DataFrame( + {"a": [1, 2, 3, 4, 5, 6, 7, 8, 9], "b": [4, 5, 6, 3, 2, 1, 0, 0, 0],}, + index=[0, 1, 3, 5, 6, 8, 9, 9, 9], + ) @property def kdf(self): return ks.from_pandas(self.pdf) def test_index(self): - for pdf in [pd.DataFrame(np.random.randn(10, 5), index=list('abcdefghij')), - pd.DataFrame(np.random.randn(10, 5), - index=pd.date_range('2011-01-01', freq='D', periods=10)), - pd.DataFrame(np.random.randn(10, 5), - columns=list('abcde')).set_index(['a', 'b'])]: + for pdf in [ + pd.DataFrame(np.random.randn(10, 5), index=list("abcdefghij")), + pd.DataFrame( + np.random.randn(10, 5), index=pd.date_range("2011-01-01", freq="D", periods=10) + ), + pd.DataFrame(np.random.randn(10, 5), columns=list("abcde")).set_index(["a", "b"]), + ]: kdf = ks.from_pandas(pdf) self.assert_eq(kdf.index, pdf.index) def test_index_getattr(self): kidx = self.kdf.index - item = 'databricks' + item = "databricks" - expected_error_message = ("'Index' object has no attribute '{}'".format(item)) + expected_error_message = "'Index' object has no attribute '{}'".format(item) with self.assertRaisesRegex(AttributeError, expected_error_message): kidx.__getattr__(item) def test_multi_index_getattr(self): - arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']] - idx = pd.MultiIndex.from_arrays(arrays, names=('number', 'color')) + arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]] + idx = pd.MultiIndex.from_arrays(arrays, names=("number", "color")) pdf = pd.DataFrame(np.random.randn(4, 5), idx) kdf = ks.from_pandas(pdf) kidx = kdf.index - item = 'databricks' + item = "databricks" - expected_error_message = ("'MultiIndex' object has no attribute '{}'".format(item)) + expected_error_message = "'MultiIndex' object has no attribute '{}'".format(item) with self.assertRaisesRegex(AttributeError, expected_error_message): kidx.__getattr__(item) @@ -74,17 +75,17 @@ def test_to_series(self): kidx = self.kdf.index self.assert_eq(kidx.to_series(), pidx.to_series()) - self.assert_eq(kidx.to_series(name='a'), pidx.to_series(name='a')) + self.assert_eq(kidx.to_series(name="a"), pidx.to_series(name="a")) # FIXME: the index values are not addressed the 
change. (#1190) # self.assert_eq((kidx + 1).to_series(), (pidx + 1).to_series()) - pidx = self.pdf.set_index('b', append=True).index - kidx = self.kdf.set_index('b', append=True).index + pidx = self.pdf.set_index("b", append=True).index + kidx = self.kdf.set_index("b", append=True).index - with self.sql_conf({'spark.sql.execution.arrow.enabled': False}): + with self.sql_conf({"spark.sql.execution.arrow.enabled": False}): self.assert_eq(kidx.to_series(), pidx.to_series()) - self.assert_eq(kidx.to_series(name='a'), pidx.to_series(name='a')) + self.assert_eq(kidx.to_series(name="a"), pidx.to_series(name="a")) def test_to_frame(self): pidx = self.pdf.index @@ -93,36 +94,41 @@ def test_to_frame(self): self.assert_eq(repr(kidx.to_frame()), repr(pidx.to_frame())) self.assert_eq(repr(kidx.to_frame(index=False)), repr(pidx.to_frame(index=False))) - pidx.name = 'a' - kidx.name = 'a' + pidx.name = "a" + kidx.name = "a" self.assert_eq(repr(kidx.to_frame()), repr(pidx.to_frame())) self.assert_eq(repr(kidx.to_frame(index=False)), repr(pidx.to_frame(index=False))) - if LooseVersion(pd.__version__) >= LooseVersion('0.24'): + if LooseVersion(pd.__version__) >= LooseVersion("0.24"): # The `name` argument is added in pandas 0.24. - self.assert_eq(repr(kidx.to_frame(name='x')), repr(pidx.to_frame(name='x'))) - self.assert_eq(repr(kidx.to_frame(index=False, name='x')), - repr(pidx.to_frame(index=False, name='x'))) + self.assert_eq(repr(kidx.to_frame(name="x")), repr(pidx.to_frame(name="x"))) + self.assert_eq( + repr(kidx.to_frame(index=False, name="x")), + repr(pidx.to_frame(index=False, name="x")), + ) - pidx = self.pdf.set_index('b', append=True).index - kidx = self.kdf.set_index('b', append=True).index + pidx = self.pdf.set_index("b", append=True).index + kidx = self.kdf.set_index("b", append=True).index self.assert_eq(repr(kidx.to_frame()), repr(pidx.to_frame())) self.assert_eq(repr(kidx.to_frame(index=False)), repr(pidx.to_frame(index=False))) - if LooseVersion(pd.__version__) >= LooseVersion('0.24'): + if LooseVersion(pd.__version__) >= LooseVersion("0.24"): # The `name` argument is added in pandas 0.24. 
- self.assert_eq(repr(kidx.to_frame(name=['x', 'y'])), - repr(pidx.to_frame(name=['x', 'y']))) - self.assert_eq(repr(kidx.to_frame(index=False, name=['x', 'y'])), - repr(pidx.to_frame(index=False, name=['x', 'y']))) + self.assert_eq( + repr(kidx.to_frame(name=["x", "y"])), repr(pidx.to_frame(name=["x", "y"])) + ) + self.assert_eq( + repr(kidx.to_frame(index=False, name=["x", "y"])), + repr(pidx.to_frame(index=False, name=["x", "y"])), + ) def test_index_names(self): kdf = self.kdf self.assertIsNone(kdf.index.name) - idx = pd.Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], name='x') + idx = pd.Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], name="x") pdf = pd.DataFrame(np.random.randn(10, 5), idx) kdf = ks.from_pandas(pdf) @@ -131,8 +137,8 @@ def test_index_names(self): pidx = pdf.index kidx = kdf.index - pidx.name = 'renamed' - kidx.name = 'renamed' + pidx.name = "renamed" + kidx.name = "renamed" self.assertEqual(kidx.name, pidx.name) self.assertEqual(kidx.names, pidx.names) self.assert_eq(kidx, pidx) @@ -144,16 +150,17 @@ def test_index_names(self): self.assert_eq(kidx, pidx) with self.assertRaisesRegex(ValueError, "Names must be a list-like"): - kidx.names = 'hi' + kidx.names = "hi" - expected_error_message = ("Length of new names must be {}, got {}" - .format(len(kdf._internal.index_map), len(['0', '1']))) + expected_error_message = "Length of new names must be {}, got {}".format( + len(kdf._internal.index_map), len(["0", "1"]) + ) with self.assertRaisesRegex(ValueError, expected_error_message): - kidx.names = ['0', '1'] + kidx.names = ["0", "1"] def test_multi_index_names(self): - arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']] - idx = pd.MultiIndex.from_arrays(arrays, names=('number', 'color')) + arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]] + idx = pd.MultiIndex.from_arrays(arrays, names=("number", "color")) pdf = pd.DataFrame(np.random.randn(4, 5), idx) kdf = ks.from_pandas(pdf) @@ -161,16 +168,16 @@ def test_multi_index_names(self): pidx = pdf.index kidx = kdf.index - pidx.names = ['renamed_number', 'renamed_color'] - kidx.names = ['renamed_number', 'renamed_color'] + pidx.names = ["renamed_number", "renamed_color"] + kidx.names = ["renamed_number", "renamed_color"] self.assertEqual(kidx.names, pidx.names) - pidx.names = ['renamed_number', None] - kidx.names = ['renamed_number', None] + pidx.names = ["renamed_number", None] + kidx.names = ["renamed_number", None] self.assertEqual(kidx.names, pidx.names) - if LooseVersion(pyspark.__version__) < LooseVersion('2.4'): + if LooseVersion(pyspark.__version__) < LooseVersion("2.4"): # PySpark < 2.4 does not support struct type with arrow enabled. 
- with self.sql_conf({'spark.sql.execution.arrow.enabled': False}): + with self.sql_conf({"spark.sql.execution.arrow.enabled": False}): self.assert_eq(kidx, pidx) else: self.assert_eq(kidx, pidx) @@ -178,21 +185,22 @@ def test_multi_index_names(self): with self.assertRaises(PandasNotImplementedError): kidx.name with self.assertRaises(PandasNotImplementedError): - kidx.name = 'renamed' + kidx.name = "renamed" def test_index_rename(self): - pdf = pd.DataFrame(np.random.randn(10, 5), - index=pd.Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], name='x')) + pdf = pd.DataFrame( + np.random.randn(10, 5), index=pd.Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], name="x") + ) kdf = ks.from_pandas(pdf) pidx = pdf.index kidx = kdf.index - self.assert_eq(kidx.rename('y'), pidx.rename('y')) + self.assert_eq(kidx.rename("y"), pidx.rename("y")) self.assert_eq(kdf.index.names, pdf.index.names) - kidx.rename('z', inplace=True) - pidx.rename('z', inplace=True) + kidx.rename("z", inplace=True) + pidx.rename("z", inplace=True) self.assert_eq(kidx, pidx) self.assert_eq(kdf.index.names, pdf.index.names) @@ -201,19 +209,19 @@ def test_index_rename(self): self.assert_eq(kdf.index.names, pdf.index.names) def test_multi_index_rename(self): - arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']] - idx = pd.MultiIndex.from_arrays(arrays, names=('number', 'color')) + arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]] + idx = pd.MultiIndex.from_arrays(arrays, names=("number", "color")) pdf = pd.DataFrame(np.random.randn(4, 5), idx) kdf = ks.from_pandas(pdf) pmidx = pdf.index kmidx = kdf.index - self.assert_eq(kmidx.rename(['n', 'c']), pmidx.rename(['n', 'c'])) + self.assert_eq(kmidx.rename(["n", "c"]), pmidx.rename(["n", "c"])) self.assert_eq(kdf.index.names, pdf.index.names) - kmidx.rename(['num', 'col'], inplace=True) - pmidx.rename(['num', 'col'], inplace=True) + kmidx.rename(["num", "col"], inplace=True) + pmidx.rename(["num", "col"], inplace=True) self.assert_eq(kmidx, pmidx) self.assert_eq(kdf.index.names, pdf.index.names) @@ -221,12 +229,12 @@ def test_multi_index_rename(self): self.assert_eq(kmidx.rename([None, None]), pmidx.rename([None, None])) self.assert_eq(kdf.index.names, pdf.index.names) - self.assertRaises(TypeError, lambda: kmidx.rename('number')) - self.assertRaises(ValueError, lambda: kmidx.rename(['number'])) + self.assertRaises(TypeError, lambda: kmidx.rename("number")) + self.assertRaises(ValueError, lambda: kmidx.rename(["number"])) def test_multi_index_levshape(self): - pidx = pd.MultiIndex.from_tuples([('a', 'x', 1), ('b', 'y', 2)]) - kidx = ks.MultiIndex.from_tuples([('a', 'x', 1), ('b', 'y', 2)]) + pidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2)]) + kidx = ks.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2)]) self.assertEqual(pidx.levshape, kidx.levshape) def test_index_unique(self): @@ -245,111 +253,138 @@ def test_index_unique(self): kidx.unique(level=1) with self.assertRaisesRegexp(KeyError, "Requested level (hi)*"): - kidx.unique(level='hi') + kidx.unique(level="hi") def test_multi_index_copy(self): - arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']] - idx = pd.MultiIndex.from_arrays(arrays, names=('number', 'color')) + arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]] + idx = pd.MultiIndex.from_arrays(arrays, names=("number", "color")) pdf = pd.DataFrame(np.random.randn(4, 5), idx) kdf = ks.from_pandas(pdf) self.assert_eq(kdf.index.copy(), pdf.index.copy()) def test_index_symmetric_difference(self): - idx = ks.Index(['a', 'b', 'c']) - midx = 
ks.MultiIndex.from_tuples([('a', 'x'), ('b', 'y'), ('c', 'z')]) + idx = ks.Index(["a", "b", "c"]) + midx = ks.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")]) with self.assertRaisesRegexp(NotImplementedError, "Doesn't support*"): idx.symmetric_difference(midx) def test_multi_index_symmetric_difference(self): - idx = ks.Index(['a', 'b', 'c']) - midx = ks.MultiIndex.from_tuples([('a', 'x'), ('b', 'y'), ('c', 'z')]) - midx_ = ks.MultiIndex.from_tuples([('a', 'x'), ('b', 'y'), ('c', 'z')]) + idx = ks.Index(["a", "b", "c"]) + midx = ks.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")]) + midx_ = ks.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")]) self.assert_eq( midx.symmetric_difference(midx_), - midx.to_pandas().symmetric_difference(midx_.to_pandas())) + midx.to_pandas().symmetric_difference(midx_.to_pandas()), + ) with self.assertRaisesRegexp(NotImplementedError, "Doesn't support*"): midx.symmetric_difference(idx) def test_missing(self): - kdf = ks.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}) + kdf = ks.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) # Index functions missing_functions = inspect.getmembers(_MissingPandasLikeIndex, inspect.isfunction) - unsupported_functions = [name for (name, type_) in missing_functions - if type_.__name__ == 'unsupported_function'] + unsupported_functions = [ + name for (name, type_) in missing_functions if type_.__name__ == "unsupported_function" + ] for name in unsupported_functions: with self.assertRaisesRegex( - PandasNotImplementedError, - "method.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name)): - getattr(kdf.set_index('a').index, name)() - - deprecated_functions = [name for (name, type_) in missing_functions - if type_.__name__ == 'deprecated_function'] + PandasNotImplementedError, + "method.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name), + ): + getattr(kdf.set_index("a").index, name)() + + deprecated_functions = [ + name for (name, type_) in missing_functions if type_.__name__ == "deprecated_function" + ] for name in deprecated_functions: - with self.assertRaisesRegex(PandasNotImplementedError, - "method.*Index.*{}.*is deprecated".format(name)): - getattr(kdf.set_index('a').index, name)() + with self.assertRaisesRegex( + PandasNotImplementedError, "method.*Index.*{}.*is deprecated".format(name) + ): + getattr(kdf.set_index("a").index, name)() # MultiIndex functions missing_functions = inspect.getmembers(_MissingPandasLikeMultiIndex, inspect.isfunction) - unsupported_functions = [name for (name, type_) in missing_functions - if type_.__name__ == 'unsupported_function'] + unsupported_functions = [ + name for (name, type_) in missing_functions if type_.__name__ == "unsupported_function" + ] for name in unsupported_functions: with self.assertRaisesRegex( - PandasNotImplementedError, - "method.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name)): - getattr(kdf.set_index(['a', 'b']).index, name)() - - deprecated_functions = [name for (name, type_) in missing_functions - if type_.__name__ == 'deprecated_function'] + PandasNotImplementedError, + "method.*Index.*{}.*not implemented( yet\\.|\\. 
.+)".format(name), + ): + getattr(kdf.set_index(["a", "b"]).index, name)() + + deprecated_functions = [ + name for (name, type_) in missing_functions if type_.__name__ == "deprecated_function" + ] for name in deprecated_functions: - with self.assertRaisesRegex(PandasNotImplementedError, - "method.*Index.*{}.*is deprecated".format(name)): - getattr(kdf.set_index(['a', 'b']).index, name)() + with self.assertRaisesRegex( + PandasNotImplementedError, "method.*Index.*{}.*is deprecated".format(name) + ): + getattr(kdf.set_index(["a", "b"]).index, name)() # Index properties - missing_properties = inspect.getmembers(_MissingPandasLikeIndex, - lambda o: isinstance(o, property)) - unsupported_properties = [name for (name, type_) in missing_properties - if type_.fget.__name__ == 'unsupported_property'] + missing_properties = inspect.getmembers( + _MissingPandasLikeIndex, lambda o: isinstance(o, property) + ) + unsupported_properties = [ + name + for (name, type_) in missing_properties + if type_.fget.__name__ == "unsupported_property" + ] for name in unsupported_properties: with self.assertRaisesRegex( - PandasNotImplementedError, - "property.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name)): - getattr(kdf.set_index('a').index, name) - - deprecated_properties = [name for (name, type_) in missing_properties - if type_.fget.__name__ == 'deprecated_property'] + PandasNotImplementedError, + "property.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name), + ): + getattr(kdf.set_index("a").index, name) + + deprecated_properties = [ + name + for (name, type_) in missing_properties + if type_.fget.__name__ == "deprecated_property" + ] for name in deprecated_properties: - with self.assertRaisesRegex(PandasNotImplementedError, - "property.*Index.*{}.*is deprecated".format(name)): - getattr(kdf.set_index('a').index, name) + with self.assertRaisesRegex( + PandasNotImplementedError, "property.*Index.*{}.*is deprecated".format(name) + ): + getattr(kdf.set_index("a").index, name) # MultiIndex properties - missing_properties = inspect.getmembers(_MissingPandasLikeMultiIndex, - lambda o: isinstance(o, property)) - unsupported_properties = [name for (name, type_) in missing_properties - if type_.fget.__name__ == 'unsupported_property'] + missing_properties = inspect.getmembers( + _MissingPandasLikeMultiIndex, lambda o: isinstance(o, property) + ) + unsupported_properties = [ + name + for (name, type_) in missing_properties + if type_.fget.__name__ == "unsupported_property" + ] for name in unsupported_properties: with self.assertRaisesRegex( - PandasNotImplementedError, - "property.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name)): - getattr(kdf.set_index(['a', 'b']).index, name) - - deprecated_properties = [name for (name, type_) in missing_properties - if type_.fget.__name__ == 'deprecated_property'] + PandasNotImplementedError, + "property.*Index.*{}.*not implemented( yet\\.|\\. 
.+)".format(name), + ): + getattr(kdf.set_index(["a", "b"]).index, name) + + deprecated_properties = [ + name + for (name, type_) in missing_properties + if type_.fget.__name__ == "deprecated_property" + ] for name in deprecated_properties: - with self.assertRaisesRegex(PandasNotImplementedError, - "property.*Index.*{}.*is deprecated".format(name)): - getattr(kdf.set_index(['a', 'b']).index, name) + with self.assertRaisesRegex( + PandasNotImplementedError, "property.*Index.*{}.*is deprecated".format(name) + ): + getattr(kdf.set_index(["a", "b"]).index, name) def test_index_has_duplicates(self): indexes = [("a", "b", "c"), ("a", "a", "c"), (1, 3, 3), (1, 2, 3)] - names = [None, 'ks', 'ks', None] + names = [None, "ks", "ks", None] has_dup = [False, True, True, False] for idx, name, expected in zip(indexes, names, has_dup): @@ -359,8 +394,12 @@ def test_index_has_duplicates(self): self.assertEqual(kdf.index.has_duplicates, expected) def test_multiindex_has_duplicates(self): - indexes = [[list("abc"), list("edf")], [list("aac"), list("edf")], - [list("aac"), list("eef")], [[1, 4, 4], [4, 6, 6]]] + indexes = [ + [list("abc"), list("edf")], + [list("aac"), list("edf")], + [list("aac"), list("eef")], + [[1, 4, 4], [4, 6, 6]], + ] has_dup = [False, False, True, True] for idx, expected in zip(indexes, has_dup): @@ -370,85 +409,84 @@ def test_multiindex_has_duplicates(self): self.assertEqual(kdf.index.has_duplicates, expected) def test_multi_index_not_supported(self): - kdf = ks.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}) + kdf = ks.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) - with self.assertRaisesRegex(TypeError, - "cannot perform any with this index type"): - kdf.set_index(['a', 'b']).index.any() + with self.assertRaisesRegex(TypeError, "cannot perform any with this index type"): + kdf.set_index(["a", "b"]).index.any() - with self.assertRaisesRegex(TypeError, - "cannot perform all with this index type"): - kdf.set_index(['a', 'b']).index.all() + with self.assertRaisesRegex(TypeError, "cannot perform all with this index type"): + kdf.set_index(["a", "b"]).index.all() def test_index_nlevels(self): - pdf = pd.DataFrame({"a": [1, 2, 3]}, index=pd.Index(['a', 'b', 'c'])) + pdf = pd.DataFrame({"a": [1, 2, 3]}, index=pd.Index(["a", "b", "c"])) kdf = ks.from_pandas(pdf) self.assertEqual(kdf.index.nlevels, 1) def test_multiindex_nlevel(self): - pdf = pd.DataFrame({'a': [1, 2, 3]}, index=[list('abc'), list('def')]) + pdf = pd.DataFrame({"a": [1, 2, 3]}, index=[list("abc"), list("def")]) kdf = ks.from_pandas(pdf) self.assertEqual(kdf.index.nlevels, 2) def test_multiindex_from_arrays(self): - arrays = [['a', 'a', 'b', 'b'], ['red', 'blue', 'red', 'blue']] + arrays = [["a", "a", "b", "b"], ["red", "blue", "red", "blue"]] pidx = pd.MultiIndex.from_arrays(arrays) kidx = ks.MultiIndex.from_arrays(arrays) self.assert_eq(pidx, kidx) def test_multiindex_swaplevel(self): - pidx = pd.MultiIndex.from_arrays([['a', 'b'], [1, 2]]) - kidx = ks.MultiIndex.from_arrays([['a', 'b'], [1, 2]]) + pidx = pd.MultiIndex.from_arrays([["a", "b"], [1, 2]]) + kidx = ks.MultiIndex.from_arrays([["a", "b"], [1, 2]]) self.assert_eq(pidx.swaplevel(0, 1), kidx.swaplevel(0, 1)) - pidx = pd.MultiIndex.from_arrays([['a', 'b'], [1, 2]], names=['word', 'number']) - kidx = ks.MultiIndex.from_arrays([['a', 'b'], [1, 2]], names=['word', 'number']) + pidx = pd.MultiIndex.from_arrays([["a", "b"], [1, 2]], names=["word", "number"]) + kidx = ks.MultiIndex.from_arrays([["a", "b"], [1, 2]], names=["word", "number"]) 
self.assert_eq(pidx.swaplevel(0, 1), kidx.swaplevel(0, 1)) - pidx = pd.MultiIndex.from_arrays([['a', 'b'], [1, 2]], names=['word', None]) - kidx = ks.MultiIndex.from_arrays([['a', 'b'], [1, 2]], names=['word', None]) + pidx = pd.MultiIndex.from_arrays([["a", "b"], [1, 2]], names=["word", None]) + kidx = ks.MultiIndex.from_arrays([["a", "b"], [1, 2]], names=["word", None]) self.assert_eq(pidx.swaplevel(-2, -1), kidx.swaplevel(-2, -1)) self.assert_eq(pidx.swaplevel(0, 1), kidx.swaplevel(0, 1)) - self.assert_eq(pidx.swaplevel('word', 1), kidx.swaplevel('word', 1)) + self.assert_eq(pidx.swaplevel("word", 1), kidx.swaplevel("word", 1)) with self.assertRaisesRegex(IndexError, "Too many levels: Index"): - kidx.swaplevel(-3, 'word') + kidx.swaplevel(-3, "word") with self.assertRaisesRegex(IndexError, "Too many levels: Index"): kidx.swaplevel(0, 2) with self.assertRaisesRegex(IndexError, "Too many levels: Index"): kidx.swaplevel(0, -3) with self.assertRaisesRegex(KeyError, "Level work not found"): - kidx.swaplevel(0, 'work') + kidx.swaplevel(0, "work") def test_index_fillna(self): - pidx = pd.DataFrame({'a': ['a', 'b', 'c']}, index=[1, 2, None]).index - kidx = ks.DataFrame({'a': ['a', 'b', 'c']}, index=[1, 2, None]).index + pidx = pd.DataFrame({"a": ["a", "b", "c"]}, index=[1, 2, None]).index + kidx = ks.DataFrame({"a": ["a", "b", "c"]}, index=[1, 2, None]).index self.assert_eq(pidx.fillna(0), kidx.fillna(0)) - self.assert_eq(pidx.rename('name').fillna(0), kidx.rename('name').fillna(0)) + self.assert_eq(pidx.rename("name").fillna(0), kidx.rename("name").fillna(0)) with self.assertRaisesRegex(TypeError, "Unsupported type "): kidx.fillna([1, 2]) def test_index_drop(self): - pidx = pd.DataFrame({'a': ['a', 'b', 'c']}, index=[1, 2, 3]).index - kidx = ks.DataFrame({'a': ['a', 'b', 'c']}, index=[1, 2, 3]).index + pidx = pd.DataFrame({"a": ["a", "b", "c"]}, index=[1, 2, 3]).index + kidx = ks.DataFrame({"a": ["a", "b", "c"]}, index=[1, 2, 3]).index self.assert_eq(pidx.drop(1), kidx.drop(1)) self.assert_eq(pidx.drop([1, 2]), kidx.drop([1, 2])) def test_multiindex_drop(self): - pidx = pd.MultiIndex.from_tuples([('a', 'x'), ('b', 'y'), ('c', 'z')], - names=['level1', 'level2']) - kidx = ks.MultiIndex.from_tuples([('a', 'x'), ('b', 'y'), ('c', 'z')], - names=['level1', 'level2']) - self.assert_eq(pidx.drop('a'), kidx.drop('a')) - self.assert_eq(pidx.drop(['a', 'b']), kidx.drop(['a', 'b'])) - self.assert_eq(pidx.drop(['x', 'y'], level='level2'), - kidx.drop(['x', 'y'], level='level2')) + pidx = pd.MultiIndex.from_tuples( + [("a", "x"), ("b", "y"), ("c", "z")], names=["level1", "level2"] + ) + kidx = ks.MultiIndex.from_tuples( + [("a", "x"), ("b", "y"), ("c", "z")], names=["level1", "level2"] + ) + self.assert_eq(pidx.drop("a"), kidx.drop("a")) + self.assert_eq(pidx.drop(["a", "b"]), kidx.drop(["a", "b"])) + self.assert_eq(pidx.drop(["x", "y"], level="level2"), kidx.drop(["x", "y"], level="level2")) def test_sort_values(self): pidx = pd.Index([-10, -100, 200, 100]) @@ -457,17 +495,17 @@ def test_sort_values(self): self.assert_eq(pidx.sort_values(), kidx.sort_values()) self.assert_eq(pidx.sort_values(ascending=False), kidx.sort_values(ascending=False)) - pidx.name = 'koalas' - kidx.name = 'koalas' + pidx.name = "koalas" + kidx.name = "koalas" self.assert_eq(pidx.sort_values(), kidx.sort_values()) self.assert_eq(pidx.sort_values(ascending=False), kidx.sort_values(ascending=False)) - pidx = pd.MultiIndex.from_tuples([('a', 'x', 1), ('b', 'y', 2), ('c', 'z', 3)]) - kidx = ks.MultiIndex.from_tuples([('a', 'x', 1), 
('b', 'y', 2), ('c', 'z', 3)]) + pidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) + kidx = ks.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) - pidx.names = ['hello', 'koalas', 'goodbye'] - kidx.names = ['hello', 'koalas', 'goodbye'] + pidx.names = ["hello", "koalas", "goodbye"] + kidx.names = ["hello", "koalas", "goodbye"] self.assert_eq(pidx.sort_values(), kidx.sort_values()) self.assert_eq(pidx.sort_values(ascending=False), kidx.sort_values(ascending=False)) @@ -477,44 +515,36 @@ def test_index_drop_duplicates(self): kidx = ks.Index([1, 1, 2]) self.assert_eq(pidx.drop_duplicates().sort_values(), kidx.drop_duplicates().sort_values()) - pidx = pd.MultiIndex.from_tuples([(1, 1), (1, 1), (2, 2)], names=['level1', 'level2']) - kidx = ks.MultiIndex.from_tuples([(1, 1), (1, 1), (2, 2)], names=['level1', 'level2']) + pidx = pd.MultiIndex.from_tuples([(1, 1), (1, 1), (2, 2)], names=["level1", "level2"]) + kidx = ks.MultiIndex.from_tuples([(1, 1), (1, 1), (2, 2)], names=["level1", "level2"]) self.assert_eq(pidx.drop_duplicates().sort_values(), kidx.drop_duplicates().sort_values()) def test_index_sort(self): idx = ks.Index([1, 2, 3, 4, 5]) - midx = ks.MultiIndex.from_tuples([('a', 'x', 1), ('b', 'y', 2)]) + midx = ks.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2)]) with self.assertRaisesRegex( - TypeError, - "cannot sort an Index object in-place, use sort_values instead"): + TypeError, "cannot sort an Index object in-place, use sort_values instead" + ): idx.sort() with self.assertRaisesRegex( - TypeError, - "cannot sort an Index object in-place, use sort_values instead"): + TypeError, "cannot sort an Index object in-place, use sort_values instead" + ): midx.sort() def test_multiindex_isna(self): - kidx = ks.MultiIndex.from_tuples([('a', 'x', 1), ('b', 'y', 2), ('c', 'z', 3)]) + kidx = ks.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) - with self.assertRaisesRegex( - NotImplementedError, - "isna is not defined for MultiIndex"): + with self.assertRaisesRegex(NotImplementedError, "isna is not defined for MultiIndex"): kidx.isna() - with self.assertRaisesRegex( - NotImplementedError, - "isna is not defined for MultiIndex"): + with self.assertRaisesRegex(NotImplementedError, "isna is not defined for MultiIndex"): kidx.isnull() - with self.assertRaisesRegex( - NotImplementedError, - "notna is not defined for MultiIndex"): + with self.assertRaisesRegex(NotImplementedError, "notna is not defined for MultiIndex"): kidx.notna() - with self.assertRaisesRegex( - NotImplementedError, - "notna is not defined for MultiIndex"): + with self.assertRaisesRegex(NotImplementedError, "notna is not defined for MultiIndex"): kidx.notnull() def test_index_nunique(self): @@ -525,71 +555,69 @@ def test_index_nunique(self): self.assert_eq(pidx.nunique(dropna=True), kidx.nunique(dropna=True)) def test_multiindex_nunique(self): - kidx = ks.MultiIndex.from_tuples([('a', 'x', 1), ('b', 'y', 2), ('c', 'z', 3)]) - with self.assertRaisesRegex( - NotImplementedError, - "notna is not defined for MultiIndex"): + kidx = ks.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) + with self.assertRaisesRegex(NotImplementedError, "notna is not defined for MultiIndex"): kidx.notnull() def test_multiindex_rename(self): - pidx = pd.MultiIndex.from_tuples([('a', 'x', 1), ('b', 'y', 2), ('c', 'z', 3)]) - kidx = ks.MultiIndex.from_tuples([('a', 'x', 1), ('b', 'y', 2), ('c', 'z', 3)]) + pidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), 
("c", "z", 3)]) + kidx = ks.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) - pidx = pidx.rename(list('ABC')) - kidx = kidx.rename(list('ABC')) + pidx = pidx.rename(list("ABC")) + kidx = kidx.rename(list("ABC")) self.assert_eq(pidx, kidx) - pidx = pidx.rename(['my', 'name', 'is']) - kidx = kidx.rename(['my', 'name', 'is']) + pidx = pidx.rename(["my", "name", "is"]) + kidx = kidx.rename(["my", "name", "is"]) self.assert_eq(pidx, kidx) def test_multiindex_set_names(self): - pidx = pd.MultiIndex.from_tuples([('a', 'x', 1), ('b', 'y', 2), ('c', 'z', 3)]) - kidx = ks.MultiIndex.from_tuples([('a', 'x', 1), ('b', 'y', 2), ('c', 'z', 3)]) + pidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) + kidx = ks.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) - pidx = pidx.set_names(['set', 'new', 'names']) - kidx = kidx.set_names(['set', 'new', 'names']) + pidx = pidx.set_names(["set", "new", "names"]) + kidx = kidx.set_names(["set", "new", "names"]) self.assert_eq(pidx, kidx) - pidx.set_names(['set', 'new', 'names'], inplace=True) - kidx.set_names(['set', 'new', 'names'], inplace=True) + pidx.set_names(["set", "new", "names"], inplace=True) + kidx.set_names(["set", "new", "names"], inplace=True) self.assert_eq(pidx, kidx) - pidx = pidx.set_names('first', level=0) - kidx = kidx.set_names('first', level=0) + pidx = pidx.set_names("first", level=0) + kidx = kidx.set_names("first", level=0) self.assert_eq(pidx, kidx) - pidx = pidx.set_names('second', level=1) - kidx = kidx.set_names('second', level=1) + pidx = pidx.set_names("second", level=1) + kidx = kidx.set_names("second", level=1) self.assert_eq(pidx, kidx) - pidx = pidx.set_names('third', level=2) - kidx = kidx.set_names('third', level=2) + pidx = pidx.set_names("third", level=2) + kidx = kidx.set_names("third", level=2) self.assert_eq(pidx, kidx) - pidx.set_names('first', level=0, inplace=True) - kidx.set_names('first', level=0, inplace=True) + pidx.set_names("first", level=0, inplace=True) + kidx.set_names("first", level=0, inplace=True) self.assert_eq(pidx, kidx) - pidx.set_names('second', level=1, inplace=True) - kidx.set_names('second', level=1, inplace=True) + pidx.set_names("second", level=1, inplace=True) + kidx.set_names("second", level=1, inplace=True) self.assert_eq(pidx, kidx) - pidx.set_names('third', level=2, inplace=True) - kidx.set_names('third', level=2, inplace=True) + pidx.set_names("third", level=2, inplace=True) + kidx.set_names("third", level=2, inplace=True) self.assert_eq(pidx, kidx) def test_multiindex_from_product(self): - iterables = [[0, 1, 2], ['green', 'purple']] + iterables = [[0, 1, 2], ["green", "purple"]] pidx = pd.MultiIndex.from_product(iterables) kidx = ks.MultiIndex.from_product(iterables) self.assert_eq(pidx, kidx) def test_multiindex_tuple_column_name(self): - column_labels = pd.MultiIndex.from_tuples([('a', 'x'), ('a', 'y'), ('b', 'z')]) + column_labels = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")]) pdf = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=column_labels) - pdf.set_index(('a', 'x'), append=True, inplace=True) + pdf.set_index(("a", "x"), append=True, inplace=True) kdf = ks.from_pandas(pdf) self.assert_eq(pdf, kdf) @@ -599,8 +627,8 @@ def test_len(self): self.assert_eq(len(pidx), len(kidx)) - pidx = pd.MultiIndex.from_tuples([('a', 'x', 1), ('b', 'y', 2), ('c', 'z', 3)]) - kidx = ks.MultiIndex.from_tuples([('a', 'x', 1), ('b', 'y', 2), ('c', 'z', 3)]) + pidx = pd.MultiIndex.from_tuples([("a", "x", 1), 
("b", "y", 2), ("c", "z", 3)]) + kidx = ks.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) self.assert_eq(len(pidx), len(kidx)) @@ -609,109 +637,77 @@ def test_append(self): pidx = pd.Index(range(10000)) kidx = ks.Index(range(10000)) - self.assert_eq( - pidx.append(pidx), - kidx.append(kidx)) + self.assert_eq(pidx.append(pidx), kidx.append(kidx)) # Index with name - pidx1 = pd.Index(range(10000), name='a') - pidx2 = pd.Index(range(10000), name='b') - kidx1 = ks.Index(range(10000), name='a') - kidx2 = ks.Index(range(10000), name='b') + pidx1 = pd.Index(range(10000), name="a") + pidx2 = pd.Index(range(10000), name="b") + kidx1 = ks.Index(range(10000), name="a") + kidx2 = ks.Index(range(10000), name="b") - self.assert_eq( - pidx1.append(pidx2), - kidx1.append(kidx2)) + self.assert_eq(pidx1.append(pidx2), kidx1.append(kidx2)) - self.assert_eq( - pidx2.append(pidx1), - kidx2.append(kidx1)) + self.assert_eq(pidx2.append(pidx1), kidx2.append(kidx1)) # Index from DataFrame - pdf1 = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6]}, - index=['a', 'b', 'c']) - pdf2 = pd.DataFrame({ - 'a': [7, 8, 9], - 'd': [10, 11, 12]}, - index=['x', 'y', 'z']) + pdf1 = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=["a", "b", "c"]) + pdf2 = pd.DataFrame({"a": [7, 8, 9], "d": [10, 11, 12]}, index=["x", "y", "z"]) kdf1 = ks.from_pandas(pdf1) kdf2 = ks.from_pandas(pdf2) - pidx1 = pdf1.set_index('a').index - pidx2 = pdf2.set_index('d').index - kidx1 = kdf1.set_index('a').index - kidx2 = kdf2.set_index('d').index + pidx1 = pdf1.set_index("a").index + pidx2 = pdf2.set_index("d").index + kidx1 = kdf1.set_index("a").index + kidx2 = kdf2.set_index("d").index - self.assert_eq( - pidx1.append(pidx2), - kidx1.append(kidx2)) + self.assert_eq(pidx1.append(pidx2), kidx1.append(kidx2)) - self.assert_eq( - pidx2.append(pidx1), - kidx2.append(kidx1)) + self.assert_eq(pidx2.append(pidx1), kidx2.append(kidx1)) # Index from DataFrame with MultiIndex columns - pdf1 = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6]}) - pdf2 = pd.DataFrame({ - 'a': [7, 8, 9], - 'd': [10, 11, 12]}) - pdf1.columns = pd.MultiIndex.from_tuples([('a', 'x'), ('b', 'y')]) - pdf2.columns = pd.MultiIndex.from_tuples([('a', 'x'), ('d', 'y')]) + pdf1 = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + pdf2 = pd.DataFrame({"a": [7, 8, 9], "d": [10, 11, 12]}) + pdf1.columns = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y")]) + pdf2.columns = pd.MultiIndex.from_tuples([("a", "x"), ("d", "y")]) kdf1 = ks.from_pandas(pdf1) kdf2 = ks.from_pandas(pdf2) - pidx1 = pdf1.set_index(('a', 'x')).index - pidx2 = pdf2.set_index(('d', 'y')).index - kidx1 = kdf1.set_index(('a', 'x')).index - kidx2 = kdf2.set_index(('d', 'y')).index + pidx1 = pdf1.set_index(("a", "x")).index + pidx2 = pdf2.set_index(("d", "y")).index + kidx1 = kdf1.set_index(("a", "x")).index + kidx2 = kdf2.set_index(("d", "y")).index - self.assert_eq( - pidx1.append(pidx2), - kidx1.append(kidx2)) + self.assert_eq(pidx1.append(pidx2), kidx1.append(kidx2)) - self.assert_eq( - pidx2.append(pidx1), - kidx2.append(kidx1)) + self.assert_eq(pidx2.append(pidx1), kidx2.append(kidx1)) # MultiIndex - pmidx = pd.MultiIndex.from_tuples([('a', 'x', 1), ('b', 'y', 2), ('c', 'z', 3)]) - kmidx = ks.MultiIndex.from_tuples([('a', 'x', 1), ('b', 'y', 2), ('c', 'z', 3)]) + pmidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) + kmidx = ks.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) self.assert_eq(pmidx.append(pmidx), kmidx.append(kmidx)) # MultiIndex 
with names pmidx1 = pd.MultiIndex.from_tuples( - [('a', 'x', 1), ('b', 'y', 2), ('c', 'z', 3)], - names=['x', 'y', 'z']) + [("a", "x", 1), ("b", "y", 2), ("c", "z", 3)], names=["x", "y", "z"] + ) pmidx2 = pd.MultiIndex.from_tuples( - [('a', 'x', 1), ('b', 'y', 2), ('c', 'z', 3)], - names=['p', 'q', 'r']) + [("a", "x", 1), ("b", "y", 2), ("c", "z", 3)], names=["p", "q", "r"] + ) kmidx1 = ks.MultiIndex.from_tuples( - [('a', 'x', 1), ('b', 'y', 2), ('c', 'z', 3)], - names=['x', 'y', 'z']) + [("a", "x", 1), ("b", "y", 2), ("c", "z", 3)], names=["x", "y", "z"] + ) kmidx2 = ks.MultiIndex.from_tuples( - [('a', 'x', 1), ('b', 'y', 2), ('c', 'z', 3)], - names=['p', 'q', 'r']) + [("a", "x", 1), ("b", "y", 2), ("c", "z", 3)], names=["p", "q", "r"] + ) - self.assert_eq( - pmidx1.append(pmidx2), - kmidx1.append(kmidx2)) + self.assert_eq(pmidx1.append(pmidx2), kmidx1.append(kmidx2)) - self.assert_eq( - pmidx2.append(pmidx1), - kmidx2.append(kmidx1)) + self.assert_eq(pmidx2.append(pmidx1), kmidx2.append(kmidx1)) - self.assert_eq( - pmidx1.append(pmidx2).names, - kmidx1.append(kmidx2).names) + self.assert_eq(pmidx1.append(pmidx2).names, kmidx1.append(kmidx2).names) - self.assert_eq( - pmidx1.append(pmidx2).names, - kmidx1.append(kmidx2).names) + self.assert_eq(pmidx1.append(pmidx2).names, kmidx1.append(kmidx2).names) # Index & MultiIndex currently is not supported expected_error_message = r"append\(\) between Index & MultiIndex currently is not supported" @@ -727,10 +723,10 @@ def test_argmin(self): self.assert_eq(pidx.argmin(), kidx.argmin()) # MultiIndex - kidx = ks.MultiIndex.from_tuples([('a', 'x', 1), ('b', 'y', 2), ('c', 'z', 3)]) + kidx = ks.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) with self.assertRaisesRegex( - TypeError, - "reduction operation 'argmin' not allowed for this dtype"): + TypeError, "reduction operation 'argmin' not allowed for this dtype" + ): kidx.argmin() def test_argmax(self): @@ -740,10 +736,10 @@ def test_argmax(self): self.assert_eq(pidx.argmax(), kidx.argmax()) # MultiIndex - kidx = ks.MultiIndex.from_tuples([('a', 'x', 1), ('b', 'y', 2), ('c', 'z', 3)]) + kidx = ks.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) with self.assertRaisesRegex( - TypeError, - "reduction operation 'argmax' not allowed for this dtype"): + TypeError, "reduction operation 'argmax' not allowed for this dtype" + ): kidx.argmax() def test_monotonic(self): @@ -751,13 +747,13 @@ def test_monotonic(self): datas = [] # increasing / decreasing ordered each index level with string - datas.append([('w', 'a'), ('x', 'b'), ('y', 'c'), ('z', 'd')]) - datas.append([('w', 'd'), ('x', 'c'), ('y', 'b'), ('z', 'a')]) - datas.append([('z', 'a'), ('y', 'b'), ('x', 'c'), ('w', 'd')]) - datas.append([('z', 'd'), ('y', 'c'), ('x', 'b'), ('w', 'a')]) + datas.append([("w", "a"), ("x", "b"), ("y", "c"), ("z", "d")]) + datas.append([("w", "d"), ("x", "c"), ("y", "b"), ("z", "a")]) + datas.append([("z", "a"), ("y", "b"), ("x", "c"), ("w", "d")]) + datas.append([("z", "d"), ("y", "c"), ("x", "b"), ("w", "a")]) # mixed order each index level with string - datas.append([('z', 'a'), ('x', 'b'), ('y', 'c'), ('w', 'd')]) - datas.append([('z', 'a'), ('y', 'c'), ('x', 'b'), ('w', 'd')]) + datas.append([("z", "a"), ("x", "b"), ("y", "c"), ("w", "d")]) + datas.append([("z", "a"), ("y", "c"), ("x", "b"), ("w", "d")]) # increasing / decreasing ordered each index level with integer datas.append([(1, 100), (2, 200), (3, 300), (4, 400), (5, 500)]) @@ -769,12 +765,12 @@ def 
test_monotonic(self): datas.append([(1, 100), (2, 300), (3, 200), (4, 400), (5, 500)]) # integer / negative mixed tests - datas.append([('a', -500), ('b', -400), ('c', -300), ('d', -200), ('e', -100)]) - datas.append([('e', -500), ('d', -400), ('c', -300), ('b', -200), ('a', -100)]) - datas.append([(-5, 'a'), (-4, 'b'), (-3, 'c'), (-2, 'd'), (-1, 'e')]) - datas.append([(-5, 'e'), (-4, 'd'), (-3, 'c'), (-2, 'b'), (-1, 'a')]) - datas.append([(-5, 'e'), (-3, 'd'), (-2, 'c'), (-4, 'b'), (-1, 'a')]) - datas.append([(-5, 'e'), (-4, 'c'), (-3, 'b'), (-2, 'd'), (-1, 'a')]) + datas.append([("a", -500), ("b", -400), ("c", -300), ("d", -200), ("e", -100)]) + datas.append([("e", -500), ("d", -400), ("c", -300), ("b", -200), ("a", -100)]) + datas.append([(-5, "a"), (-4, "b"), (-3, "c"), (-2, "d"), (-1, "e")]) + datas.append([(-5, "e"), (-4, "d"), (-3, "c"), (-2, "b"), (-1, "a")]) + datas.append([(-5, "e"), (-3, "d"), (-2, "c"), (-4, "b"), (-1, "a")]) + datas.append([(-5, "e"), (-4, "c"), (-3, "b"), (-2, "d"), (-1, "a")]) # None type tests (None type is treated as the largets value) # TODO: the commented tests below should be uncommented after fixing for pandas >= 1.0.0 @@ -788,32 +784,28 @@ def test_monotonic(self): datas.append([(1, 100), (2, 200), (None, None), (4, 400), (5, 500)]) # datas.append([(1, 100), (2, 200), (3, 300), (4, 400), (None, None)]) datas.append([(-5, None), (-4, None), (-3, None), (-2, None), (-1, None)]) - datas.append([(None, 'e'), (None, 'c'), (None, 'b'), (None, 'd'), (None, 'a')]) + datas.append([(None, "e"), (None, "c"), (None, "b"), (None, "d"), (None, "a")]) datas.append([(None, None), (None, None), (None, None), (None, None), (None, None)]) # duplicated index value tests # TODO: the commented test below should be uncommented after fixing for pandas >= 1.0.0 - datas.append([('x', 'd'), ('y', 'c'), ('y', 'b'), ('z', 'a')]) - datas.append([('x', 'd'), ('y', 'b'), ('y', 'c'), ('z', 'a')]) - datas.append([('x', 'd'), ('y', 'c'), ('y', None), ('z', 'a')]) + datas.append([("x", "d"), ("y", "c"), ("y", "b"), ("z", "a")]) + datas.append([("x", "d"), ("y", "b"), ("y", "c"), ("z", "a")]) + datas.append([("x", "d"), ("y", "c"), ("y", None), ("z", "a")]) # datas.append([('x', 'd'), ('y', None), ('y', 'c'), ('z', 'a')]) - datas.append([('x', 'd'), ('y', None), ('y', None), ('z', 'a')]) - datas.append([('x', 'd'), ('y', 'c'), ('y', 'b'), (None, 'a')]) - datas.append([('x', 'd'), ('y', 'b'), ('y', 'c'), (None, 'a')]) + datas.append([("x", "d"), ("y", None), ("y", None), ("z", "a")]) + datas.append([("x", "d"), ("y", "c"), ("y", "b"), (None, "a")]) + datas.append([("x", "d"), ("y", "b"), ("y", "c"), (None, "a")]) # more depth tests - datas.append([('x', 'd', 'o'), ('y', 'c', 'p'), ('y', 'c', 'q'), ('z', 'a', 'r')]) - datas.append([('x', 'd', 'o'), ('y', 'c', 'q'), ('y', 'c', 'p'), ('z', 'a', 'r')]) + datas.append([("x", "d", "o"), ("y", "c", "p"), ("y", "c", "q"), ("z", "a", "r")]) + datas.append([("x", "d", "o"), ("y", "c", "q"), ("y", "c", "p"), ("z", "a", "r")]) # datas.append([('x', 'd', 'o'), ('y', 'c', None), ('y', 'c', 'q'), ('z', 'a', 'r')]) - datas.append([('x', 'd', 'o'), ('y', 'c', 'p'), ('y', 'c', None), ('z', 'a', 'r')]) - datas.append([('x', 'd', 'o'), ('y', 'c', None), ('y', 'c', None), ('z', 'a', 'r')]) + datas.append([("x", "d", "o"), ("y", "c", "p"), ("y", "c", None), ("z", "a", "r")]) + datas.append([("x", "d", "o"), ("y", "c", None), ("y", "c", None), ("z", "a", "r")]) for i, data in enumerate(datas): kmidx = ks.MultiIndex.from_tuples(data) pmidx = kmidx.to_pandas() - 
self.assert_eq( - kmidx.is_monotonic_increasing, - pmidx.is_monotonic_increasing) - self.assert_eq( - kmidx.is_monotonic_decreasing, - pmidx.is_monotonic_decreasing) + self.assert_eq(kmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing) + self.assert_eq(kmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing) diff --git a/databricks/koalas/tests/test_indexing.py b/databricks/koalas/tests/test_indexing.py index 3839768..df28cac 100644 --- a/databricks/koalas/tests/test_indexing.py +++ b/databricks/koalas/tests/test_indexing.py @@ -26,32 +26,31 @@ class BasicIndexingTest(ComparisonTestBase): - @property def pdf(self): - return pd.DataFrame({'month': [1, 4, 7, 10], - 'year': [2012, 2014, 2013, 2014], - 'sale': [55, 40, 84, 31]}) + return pd.DataFrame( + {"month": [1, 4, 7, 10], "year": [2012, 2014, 2013, 2014], "sale": [55, 40, 84, 31]} + ) @compare_both(almost=False) def test_indexing(self, df): - df1 = df.set_index('month') + df1 = df.set_index("month") yield df1 - yield df.set_index('month', drop=False) - yield df.set_index('month', append=True) - yield df.set_index(['year', 'month']) - yield df.set_index(['year', 'month'], drop=False) - yield df.set_index(['year', 'month'], append=True) + yield df.set_index("month", drop=False) + yield df.set_index("month", append=True) + yield df.set_index(["year", "month"]) + yield df.set_index(["year", "month"], drop=False) + yield df.set_index(["year", "month"], append=True) - yield df1.set_index('year', drop=False, append=True) + yield df1.set_index("year", drop=False, append=True) df2 = df1.copy() - df2.set_index('year', append=True, inplace=True) + df2.set_index("year", append=True, inplace=True) yield df2 - self.assertRaisesRegex(KeyError, 'unknown', lambda: df.set_index('unknown')) - self.assertRaisesRegex(KeyError, 'unknown', lambda: df.set_index(['month', 'unknown'])) + self.assertRaisesRegex(KeyError, "unknown", lambda: df.set_index("unknown")) + self.assertRaisesRegex(KeyError, "unknown", lambda: df.set_index(["month", "unknown"])) for d in [df, df1, df2]: yield d.reset_index() @@ -60,20 +59,26 @@ def test_indexing(self, df): yield df1.reset_index(level=0) yield df2.reset_index(level=1) yield df2.reset_index(level=[1, 0]) - yield df1.reset_index(level='month') - yield df2.reset_index(level='year') - yield df2.reset_index(level=['month', 'year']) - yield df2.reset_index(level='month', drop=True) - yield df2.reset_index(level=['month', 'year'], drop=True) - - self.assertRaisesRegex(IndexError, 'Too many levels: Index has only 1 level, not 3', - lambda: df1.reset_index(level=2)) - self.assertRaisesRegex(IndexError, 'Too many levels: Index has only 1 level, not 4', - lambda: df1.reset_index(level=[3, 2])) - self.assertRaisesRegex(KeyError, 'unknown.*month', - lambda: df1.reset_index(level='unknown')) - self.assertRaisesRegex(KeyError, 'Level unknown not found', - lambda: df2.reset_index(level='unknown')) + yield df1.reset_index(level="month") + yield df2.reset_index(level="year") + yield df2.reset_index(level=["month", "year"]) + yield df2.reset_index(level="month", drop=True) + yield df2.reset_index(level=["month", "year"], drop=True) + + self.assertRaisesRegex( + IndexError, + "Too many levels: Index has only 1 level, not 3", + lambda: df1.reset_index(level=2), + ) + self.assertRaisesRegex( + IndexError, + "Too many levels: Index has only 1 level, not 4", + lambda: df1.reset_index(level=[3, 2]), + ) + self.assertRaisesRegex(KeyError, "unknown.*month", lambda: df1.reset_index(level="unknown")) + self.assertRaisesRegex( + KeyError, 
"Level unknown not found", lambda: df2.reset_index(level="unknown") + ) df3 = df2.copy() df3.reset_index(inplace=True) @@ -83,13 +88,15 @@ def test_indexing(self, df): yield df1.sale.reset_index(level=0) yield df2.sale.reset_index(level=[1, 0]) yield df1.sale.reset_index(drop=True) - yield df1.sale.reset_index(name='s') - yield df1.sale.reset_index(name='s', drop=True) + yield df1.sale.reset_index(name="s") + yield df1.sale.reset_index(name="s", drop=True) s = df1.sale - self.assertRaisesRegex(TypeError, - 'Cannot reset_index inplace on a Series to create a DataFrame', - lambda: s.reset_index(inplace=True)) + self.assertRaisesRegex( + TypeError, + "Cannot reset_index inplace on a Series to create a DataFrame", + lambda: s.reset_index(inplace=True), + ) s.reset_index(drop=True, inplace=True) yield s yield df1 @@ -97,27 +104,29 @@ def test_indexing(self, df): def test_from_pandas_with_explicit_index(self): pdf = self.pdf - df1 = ks.from_pandas(pdf.set_index('month')) - self.assertPandasEqual(df1.toPandas(), pdf.set_index('month')) + df1 = ks.from_pandas(pdf.set_index("month")) + self.assertPandasEqual(df1.toPandas(), pdf.set_index("month")) - df2 = ks.from_pandas(pdf.set_index(['year', 'month'])) - self.assertPandasEqual(df2.toPandas(), pdf.set_index(['year', 'month'])) + df2 = ks.from_pandas(pdf.set_index(["year", "month"])) + self.assertPandasEqual(df2.toPandas(), pdf.set_index(["year", "month"])) def test_limitations(self): - df = self.kdf.set_index('month') + df = self.kdf.set_index("month") - self.assertRaisesRegex(ValueError, 'Level should be all int or all string.', - lambda: df.reset_index([1, 'month'])) + self.assertRaisesRegex( + ValueError, + "Level should be all int or all string.", + lambda: df.reset_index([1, "month"]), + ) class IndexingTest(ReusedSQLTestCase): - @property def pdf(self): - return pd.DataFrame({ - 'a': [1, 2, 3, 4, 5, 6, 7, 8, 9], - 'b': [4, 5, 6, 3, 2, 1, 0, 0, 0] - }, index=[0, 1, 3, 5, 6, 8, 9, 9, 9]) + return pd.DataFrame( + {"a": [1, 2, 3, 4, 5, 6, 7, 8, 9], "b": [4, 5, 6, 3, 2, 1, 0, 0, 0]}, + index=[0, 1, 3, 5, 6, 8, 9, 9, 9], + ) @property def kdf(self): @@ -128,89 +137,89 @@ def test_at(self): kdf = self.kdf # Create the equivalent of pdf.loc[3] as a Koalas Series # This is necessary because .loc[n] does not currently work with Koalas DataFrames (#383) - test_series = ks.Series([3, 6], index=['a', 'b'], name='3') + test_series = ks.Series([3, 6], index=["a", "b"], name="3") # Assert invalided signatures raise TypeError with self.assertRaises(TypeError, msg="Use DataFrame.at like .at[row_index, column_name]"): kdf.at[3] with self.assertRaises(TypeError, msg="Use DataFrame.at like .at[row_index, column_name]"): - kdf.at['ab'] # 'ab' is of length 2 but str type instead of tuple + kdf.at["ab"] # 'ab' is of length 2 but str type instead of tuple with self.assertRaises(TypeError, msg="Use Series.at like .at[column_name]"): - test_series.at[3, 'b'] + test_series.at[3, "b"] # Assert .at for DataFrames - self.assertEqual(kdf.at[3, 'b'], 6) - self.assertEqual(kdf.at[3, 'b'], pdf.at[3, 'b']) - np.testing.assert_array_equal(kdf.at[9, 'b'], np.array([0, 0, 0])) - np.testing.assert_array_equal(kdf.at[9, 'b'], pdf.at[9, 'b']) + self.assertEqual(kdf.at[3, "b"], 6) + self.assertEqual(kdf.at[3, "b"], pdf.at[3, "b"]) + np.testing.assert_array_equal(kdf.at[9, "b"], np.array([0, 0, 0])) + np.testing.assert_array_equal(kdf.at[9, "b"], pdf.at[9, "b"]) # Assert .at for Series - self.assertEqual(test_series.at['b'], 6) - self.assertEqual(test_series.at['b'], 
pdf.loc[3].at['b']) + self.assertEqual(test_series.at["b"], 6) + self.assertEqual(test_series.at["b"], pdf.loc[3].at["b"]) # Assert multi-character indices - self.assertEqual(ks.Series([0, 1], index=['ab', 'cd']).at['ab'], - pd.Series([0, 1], index=['ab', 'cd']).at['ab']) + self.assertEqual( + ks.Series([0, 1], index=["ab", "cd"]).at["ab"], + pd.Series([0, 1], index=["ab", "cd"]).at["ab"], + ) # Assert invalid column or index names result in a KeyError like with pandas - with self.assertRaises(KeyError, msg='x'): - kdf.at[3, 'x'] + with self.assertRaises(KeyError, msg="x"): + kdf.at[3, "x"] with self.assertRaises(KeyError, msg=99): - kdf.at[99, 'b'] + kdf.at[99, "b"] with self.assertRaises(ValueError): - kdf.at[(3, 6), 'b'] + kdf.at[(3, 6), "b"] with self.assertRaises(KeyError): - kdf.at[3, ('x', 'b')] + kdf.at[3, ("x", "b")] # Assert setting values fails with self.assertRaises(TypeError): - kdf.at[3, 'b'] = 10 + kdf.at[3, "b"] = 10 def test_at_multiindex(self): - pdf = self.pdf.set_index('b', append=True) - kdf = self.kdf.set_index('b', append=True) + pdf = self.pdf.set_index("b", append=True) + kdf = self.kdf.set_index("b", append=True) - self.assert_eq(kdf.at[(3, 6), 'a'], pdf.at[(3, 6), 'a']) - self.assert_eq(kdf.at[(3,), 'a'], pdf.at[(3,), 'a']) - self.assert_eq(list(kdf.at[(9, 0), 'a']), list(pdf.at[(9, 0), 'a'])) - self.assert_eq(list(kdf.at[(9,), 'a']), list(pdf.at[(9,), 'a'])) + self.assert_eq(kdf.at[(3, 6), "a"], pdf.at[(3, 6), "a"]) + self.assert_eq(kdf.at[(3,), "a"], pdf.at[(3,), "a"]) + self.assert_eq(list(kdf.at[(9, 0), "a"]), list(pdf.at[(9, 0), "a"])) + self.assert_eq(list(kdf.at[(9,), "a"]), list(pdf.at[(9,), "a"])) with self.assertRaises(ValueError): - kdf.at[3, 'a'] + kdf.at[3, "a"] def test_at_multiindex_columns(self): - arrays = [np.array(['bar', 'bar', 'baz', 'baz']), - np.array(['one', 'two', 'one', 'two'])] + arrays = [np.array(["bar", "bar", "baz", "baz"]), np.array(["one", "two", "one", "two"])] - pdf = pd.DataFrame(np.random.randn(3, 4), index=['A', 'B', 'C'], columns=arrays) + pdf = pd.DataFrame(np.random.randn(3, 4), index=["A", "B", "C"], columns=arrays) kdf = ks.from_pandas(pdf) - self.assert_eq(kdf.at['B', ('bar', 'one')], pdf.at['B', ('bar', 'one')]) + self.assert_eq(kdf.at["B", ("bar", "one")], pdf.at["B", ("bar", "one")]) with self.assertRaises(KeyError): - kdf.at['B', 'bar'] + kdf.at["B", "bar"] def test_iat(self): pdf = self.pdf kdf = self.kdf # Create the equivalent of pdf.loc[3] as a Koalas Series # This is necessary because .loc[n] does not currently work with Koalas DataFrames (#383) - test_series = ks.Series([3, 6], index=['a', 'b'], name='3') + test_series = ks.Series([3, 6], index=["a", "b"], name="3") # Assert invalided signatures raise TypeError with self.assertRaises( - TypeError, - msg="Use DataFrame.at like .iat[row_interget_position, column_integer_position]"): + TypeError, + msg="Use DataFrame.at like .iat[row_interget_position, column_integer_position]", + ): kdf.iat[3] with self.assertRaises( - ValueError, - msg="iAt based indexing on multi-index can only have tuple values"): - kdf.iat[3, 'b'] # 'ab' is of length 2 but str type instead of tuple - with self.assertRaises( - TypeError, - msg="Use Series.iat like .iat[row_integer_position]"): - test_series.iat[3, 'b'] + ValueError, msg="iAt based indexing on multi-index can only have tuple values" + ): + kdf.iat[3, "b"] # 'ab' is of length 2 but str type instead of tuple + with self.assertRaises(TypeError, msg="Use Series.iat like .iat[row_integer_position]"): + test_series.iat[3, "b"] # 
Assert .iat for DataFrames self.assertEqual(kdf.iat[7, 0], 8) @@ -236,19 +245,18 @@ def test_iat(self): kdf.iat[4, 1] = 10 def test_iat_multiindex(self): - pdf = self.pdf.set_index('b', append=True) - kdf = self.kdf.set_index('b', append=True) + pdf = self.pdf.set_index("b", append=True) + kdf = self.kdf.set_index("b", append=True) self.assert_eq(kdf.iat[7, 0], pdf.iat[7, 0]) with self.assertRaises(ValueError): - kdf.iat[3, 'a'] + kdf.iat[3, "a"] def test_iat_multiindex_columns(self): - arrays = [np.array(['bar', 'bar', 'baz', 'baz']), - np.array(['one', 'two', 'one', 'two'])] + arrays = [np.array(["bar", "bar", "baz", "baz"]), np.array(["one", "two", "one", "two"])] - pdf = pd.DataFrame(np.random.randn(3, 4), index=['A', 'B', 'C'], columns=arrays) + pdf = pd.DataFrame(np.random.randn(3, 4), index=["A", "B", "C"], columns=arrays) kdf = ks.from_pandas(pdf) self.assert_eq(kdf.iat[1, 3], pdf.iat[1, 3]) @@ -299,9 +307,7 @@ def test_loc(self): self.assertRaises(KeyError, lambda: kdf.a.loc[10]) # monotonically increasing index test - pdf = pd.DataFrame( - {'a': [1, 2, 3, 4, 5, 6, 7, 8, 9]}, - index=[0, 1, 1, 2, 2, 2, 4, 5, 6]) + pdf = pd.DataFrame({"a": [1, 2, 3, 4, 5, 6, 7, 8, 9]}, index=[0, 1, 1, 2, 2, 2, 4, 5, 6]) kdf = ks.from_pandas(pdf) self.assert_eq(kdf.loc[:2], pdf.loc[:2]) @@ -313,9 +319,7 @@ def test_loc(self): self.assert_eq(kdf.loc[3:10], pdf.loc[3:10]) # monotonically decreasing index test - pdf = pd.DataFrame( - {'a': [1, 2, 3, 4, 5, 6, 7, 8, 9]}, - index=[6, 5, 5, 4, 4, 4, 2, 1, 0]) + pdf = pd.DataFrame({"a": [1, 2, 3, 4, 5, 6, 7, 8, 9]}, index=[6, 5, 5, 4, 4, 4, 2, 1, 0]) kdf = ks.from_pandas(pdf) self.assert_eq(kdf.loc[:4], pdf.loc[:4]) @@ -327,32 +331,30 @@ def test_loc(self): self.assert_eq(kdf.loc[10:3], pdf.loc[10:3]) # test when type of key is string and given value is not included in key - pdf = pd.DataFrame({'a': [1, 2, 3]}, index=['a', 'b', 'd']) + pdf = pd.DataFrame({"a": [1, 2, 3]}, index=["a", "b", "d"]) kdf = ks.from_pandas(pdf) - self.assert_eq(kdf.loc['a':'z'], pdf.loc['a':'z']) + self.assert_eq(kdf.loc["a":"z"], pdf.loc["a":"z"]) # KeyError when index is not monotonic increasing or decreasing # and specified values don't exist in index - kdf = ks.DataFrame([[1, 2], [4, 5], [7, 8]], - index=['cobra', 'viper', 'sidewinder']) + kdf = ks.DataFrame([[1, 2], [4, 5], [7, 8]], index=["cobra", "viper", "sidewinder"]) - self.assertRaises(KeyError, lambda: kdf.loc['cobra':'koalas']) - self.assertRaises(KeyError, lambda: kdf.loc['koalas':'viper']) + self.assertRaises(KeyError, lambda: kdf.loc["cobra":"koalas"]) + self.assertRaises(KeyError, lambda: kdf.loc["koalas":"viper"]) - kdf = ks.DataFrame([[1, 2], [4, 5], [7, 8]], - index=[10, 30, 20]) + kdf = ks.DataFrame([[1, 2], [4, 5], [7, 8]], index=[10, 30, 20]) self.assertRaises(KeyError, lambda: kdf.loc[0:30]) self.assertRaises(KeyError, lambda: kdf.loc[10:100]) def test_loc_non_informative_index(self): - pdf = pd.DataFrame({'x': [1, 2, 3, 4]}, index=[10, 20, 30, 40]) + pdf = pd.DataFrame({"x": [1, 2, 3, 4]}, index=[10, 20, 30, 40]) kdf = ks.from_pandas(pdf) self.assert_eq(kdf.loc[20:30], pdf.loc[20:30]) - pdf = pd.DataFrame({'x': [1, 2, 3, 4]}, index=[10, 20, 20, 40]) + pdf = pd.DataFrame({"x": [1, 2, 3, 4]}, index=[10, 20, 20, 40]) kdf = ks.from_pandas(pdf) self.assert_eq(kdf.loc[20:20], pdf.loc[20:20]) @@ -368,16 +370,16 @@ def test_loc_noindex(self): pdf = self.pdf pdf = pdf.reset_index() - self.assert_eq(kdf[['a']], pdf[['a']]) + self.assert_eq(kdf[["a"]], pdf[["a"]]) self.assert_eq(kdf.loc[:], pdf.loc[:]) 
self.assert_eq(kdf.loc[5:5], pdf.loc[5:5]) def test_loc_multiindex(self): kdf = self.kdf - kdf = kdf.set_index('b', append=True) + kdf = kdf.set_index("b", append=True) pdf = self.pdf - pdf = pdf.set_index('b', append=True) + pdf = pdf.set_index("b", append=True) self.assert_eq(kdf.loc[:], pdf.loc[:]) self.assertRaises(NotImplementedError, lambda: kdf.loc[5:5]) @@ -393,33 +395,33 @@ def test_loc_multiindex(self): def test_loc2d_multiindex(self): kdf = self.kdf - kdf = kdf.set_index('b', append=True) + kdf = kdf.set_index("b", append=True) pdf = self.pdf - pdf = pdf.set_index('b', append=True) + pdf = pdf.set_index("b", append=True) self.assert_eq(kdf.loc[:, :], pdf.loc[:, :]) - self.assert_eq(kdf.loc[:, 'a'], pdf.loc[:, 'a']) - self.assertRaises(NotImplementedError, lambda: kdf.loc[5:5, 'a']) + self.assert_eq(kdf.loc[:, "a"], pdf.loc[:, "a"]) + self.assertRaises(NotImplementedError, lambda: kdf.loc[5:5, "a"]) def test_loc2d(self): kdf = self.kdf pdf = self.pdf # index indexer is always regarded as slice for duplicated values - self.assert_eq(kdf.loc[5:5, 'a'], pdf.loc[5:5, 'a']) - self.assert_eq(kdf.loc[[5], 'a'], pdf.loc[[5], 'a']) - self.assert_eq(kdf.loc[5:5, ['a']], pdf.loc[5:5, ['a']]) - self.assert_eq(kdf.loc[[5], ['a']], pdf.loc[[5], ['a']]) + self.assert_eq(kdf.loc[5:5, "a"], pdf.loc[5:5, "a"]) + self.assert_eq(kdf.loc[[5], "a"], pdf.loc[[5], "a"]) + self.assert_eq(kdf.loc[5:5, ["a"]], pdf.loc[5:5, ["a"]]) + self.assert_eq(kdf.loc[[5], ["a"]], pdf.loc[[5], ["a"]]) self.assert_eq(kdf.loc[:, :], pdf.loc[:, :]) - self.assert_eq(kdf.loc[3:8, 'a'], pdf.loc[3:8, 'a']) - self.assert_eq(kdf.loc[:8, 'a'], pdf.loc[:8, 'a']) - self.assert_eq(kdf.loc[3:, 'a'], pdf.loc[3:, 'a']) - self.assert_eq(kdf.loc[[8], 'a'], pdf.loc[[8], 'a']) + self.assert_eq(kdf.loc[3:8, "a"], pdf.loc[3:8, "a"]) + self.assert_eq(kdf.loc[:8, "a"], pdf.loc[:8, "a"]) + self.assert_eq(kdf.loc[3:, "a"], pdf.loc[3:, "a"]) + self.assert_eq(kdf.loc[[8], "a"], pdf.loc[[8], "a"]) - self.assert_eq(kdf.loc[3:8, ['a']], pdf.loc[3:8, ['a']]) - self.assert_eq(kdf.loc[:8, ['a']], pdf.loc[:8, ['a']]) - self.assert_eq(kdf.loc[3:, ['a']], pdf.loc[3:, ['a']]) + self.assert_eq(kdf.loc[3:8, ["a"]], pdf.loc[3:8, ["a"]]) + self.assert_eq(kdf.loc[:8, ["a"]], pdf.loc[:8, ["a"]]) + self.assert_eq(kdf.loc[3:, ["a"]], pdf.loc[3:, ["a"]]) # TODO?: self.assert_eq(kdf.loc[[3, 4, 3], ['a']], pdf.loc[[3, 4, 3], ['a']]) self.assertRaises(SparkPandasIndexingError, lambda: kdf.loc[3, 3, 3]) @@ -427,76 +429,79 @@ def test_loc2d(self): self.assertRaises(SparkPandasIndexingError, lambda: kdf.a.loc[3:, 3]) self.assertRaises(SparkPandasIndexingError, lambda: kdf.a.loc[kdf.a % 2 == 0, 3]) - self.assert_eq(kdf.loc[5, 'a'], pdf.loc[5, 'a']) - self.assert_eq(kdf.loc[9, 'a'], pdf.loc[9, 'a']) - self.assert_eq(kdf.loc[5, ['a']], pdf.loc[5, ['a']]) - self.assert_eq(kdf.loc[9, ['a']], pdf.loc[9, ['a']]) + self.assert_eq(kdf.loc[5, "a"], pdf.loc[5, "a"]) + self.assert_eq(kdf.loc[9, "a"], pdf.loc[9, "a"]) + self.assert_eq(kdf.loc[5, ["a"]], pdf.loc[5, ["a"]]) + self.assert_eq(kdf.loc[9, ["a"]], pdf.loc[9, ["a"]]) def test_loc2d_multiindex_columns(self): - arrays = [np.array(['bar', 'bar', 'baz', 'baz']), - np.array(['one', 'two', 'one', 'two'])] + arrays = [np.array(["bar", "bar", "baz", "baz"]), np.array(["one", "two", "one", "two"])] - pdf = pd.DataFrame(np.random.randn(3, 4), index=['A', 'B', 'C'], columns=arrays) + pdf = pd.DataFrame(np.random.randn(3, 4), index=["A", "B", "C"], columns=arrays) kdf = ks.from_pandas(pdf) - self.assert_eq(kdf.loc['B':'B', 'bar'], 
pdf.loc['B':'B', 'bar']) - self.assert_eq(kdf.loc['B':'B', ['bar']], pdf.loc['B':'B', ['bar']]) + self.assert_eq(kdf.loc["B":"B", "bar"], pdf.loc["B":"B", "bar"]) + self.assert_eq(kdf.loc["B":"B", ["bar"]], pdf.loc["B":"B", ["bar"]]) def test_loc2d_with_known_divisions(self): - pdf = pd.DataFrame(np.random.randn(20, 5), - index=list('abcdefghijklmnopqrst'), - columns=list('ABCDE')) + pdf = pd.DataFrame( + np.random.randn(20, 5), index=list("abcdefghijklmnopqrst"), columns=list("ABCDE") + ) kdf = ks.from_pandas(pdf) - self.assert_eq(kdf.loc[['a'], 'A'], pdf.loc[['a'], 'A']) - self.assert_eq(kdf.loc[['a'], ['A']], pdf.loc[['a'], ['A']]) - self.assert_eq(kdf.loc['a':'o', 'A'], pdf.loc['a':'o', 'A']) - self.assert_eq(kdf.loc['a':'o', ['A']], pdf.loc['a':'o', ['A']]) - self.assert_eq(kdf.loc[['n'], ['A']], pdf.loc[['n'], ['A']]) - self.assert_eq(kdf.loc[['a', 'c', 'n'], ['A']], pdf.loc[['a', 'c', 'n'], ['A']]) + self.assert_eq(kdf.loc[["a"], "A"], pdf.loc[["a"], "A"]) + self.assert_eq(kdf.loc[["a"], ["A"]], pdf.loc[["a"], ["A"]]) + self.assert_eq(kdf.loc["a":"o", "A"], pdf.loc["a":"o", "A"]) + self.assert_eq(kdf.loc["a":"o", ["A"]], pdf.loc["a":"o", ["A"]]) + self.assert_eq(kdf.loc[["n"], ["A"]], pdf.loc[["n"], ["A"]]) + self.assert_eq(kdf.loc[["a", "c", "n"], ["A"]], pdf.loc[["a", "c", "n"], ["A"]]) # TODO?: self.assert_eq(kdf.loc[['t', 'b'], ['A']], pdf.loc[['t', 'b'], ['A']]) # TODO?: self.assert_eq(kdf.loc[['r', 'r', 'c', 'g', 'h'], ['A']], # TODO?: pdf.loc[['r', 'r', 'c', 'g', 'h'], ['A']]) - @unittest.skip('TODO: should handle duplicated columns properly') + @unittest.skip("TODO: should handle duplicated columns properly") def test_loc2d_duplicated_columns(self): - pdf = pd.DataFrame(np.random.randn(20, 5), - index=list('abcdefghijklmnopqrst'), - columns=list('AABCD')) + pdf = pd.DataFrame( + np.random.randn(20, 5), index=list("abcdefghijklmnopqrst"), columns=list("AABCD") + ) kdf = ks.from_pandas(pdf) # TODO?: self.assert_eq(kdf.loc[['a'], 'A'], pdf.loc[['a'], 'A']) # TODO?: self.assert_eq(kdf.loc[['a'], ['A']], pdf.loc[['a'], ['A']]) - self.assert_eq(kdf.loc[['j'], 'B'], pdf.loc[['j'], 'B']) - self.assert_eq(kdf.loc[['j'], ['B']], pdf.loc[['j'], ['B']]) + self.assert_eq(kdf.loc[["j"], "B"], pdf.loc[["j"], "B"]) + self.assert_eq(kdf.loc[["j"], ["B"]], pdf.loc[["j"], ["B"]]) # TODO?: self.assert_eq(kdf.loc['a':'o', 'A'], pdf.loc['a':'o', 'A']) # TODO?: self.assert_eq(kdf.loc['a':'o', ['A']], pdf.loc['a':'o', ['A']]) - self.assert_eq(kdf.loc['j':'q', 'B'], pdf.loc['j':'q', 'B']) - self.assert_eq(kdf.loc['j':'q', ['B']], pdf.loc['j':'q', ['B']]) + self.assert_eq(kdf.loc["j":"q", "B"], pdf.loc["j":"q", "B"]) + self.assert_eq(kdf.loc["j":"q", ["B"]], pdf.loc["j":"q", ["B"]]) # TODO?: self.assert_eq(kdf.loc['a':'o', 'B':'D'], pdf.loc['a':'o', 'B':'D']) # TODO?: self.assert_eq(kdf.loc['a':'o', 'B':'D'], pdf.loc['a':'o', 'B':'D']) # TODO?: self.assert_eq(kdf.loc['j':'q', 'B':'A'], pdf.loc['j':'q', 'B':'A']) # TODO?: self.assert_eq(kdf.loc['j':'q', 'B':'A'], pdf.loc['j':'q', 'B':'A']) - self.assert_eq(kdf.loc[kdf.B > 0, 'B'], pdf.loc[pdf.B > 0, 'B']) + self.assert_eq(kdf.loc[kdf.B > 0, "B"], pdf.loc[pdf.B > 0, "B"]) # TODO?: self.assert_eq(kdf.loc[kdf.B > 0, ['A', 'C']], pdf.loc[pdf.B > 0, ['A', 'C']]) def test_getitem(self): - pdf = pd.DataFrame({'A': [1, 2, 3, 4, 5, 6, 7, 8, 9], - 'B': [9, 8, 7, 6, 5, 4, 3, 2, 1], - 'C': [True, False, True] * 3}, - columns=list('ABC')) + pdf = pd.DataFrame( + { + "A": [1, 2, 3, 4, 5, 6, 7, 8, 9], + "B": [9, 8, 7, 6, 5, 4, 3, 2, 1], + "C": [True, False, True] * 
3, + }, + columns=list("ABC"), + ) kdf = ks.from_pandas(pdf) - self.assert_eq(kdf['A'], pdf['A']) + self.assert_eq(kdf["A"], pdf["A"]) - self.assert_eq(kdf[['A', 'B']], pdf[['A', 'B']]) + self.assert_eq(kdf[["A", "B"]], pdf[["A", "B"]]) self.assert_eq(kdf[kdf.C], pdf[pdf.C]) - self.assertRaises(KeyError, lambda: kdf['X']) - self.assertRaises(KeyError, lambda: kdf[['A', 'X']]) + self.assertRaises(KeyError, lambda: kdf["X"]) + self.assertRaises(KeyError, lambda: kdf[["A", "X"]]) self.assertRaises(AttributeError, lambda: kdf.X) # not str/unicode @@ -509,152 +514,155 @@ def test_getitem(self): # TODO?: self.assertRaises(KeyError, lambda: pdf[[1, 8]]) def test_getitem_slice(self): - pdf = pd.DataFrame({'A': [1, 2, 3, 4, 5, 6, 7, 8, 9], - 'B': [9, 8, 7, 6, 5, 4, 3, 2, 1], - 'C': [True, False, True] * 3}, - index=list('abcdefghi')) + pdf = pd.DataFrame( + { + "A": [1, 2, 3, 4, 5, 6, 7, 8, 9], + "B": [9, 8, 7, 6, 5, 4, 3, 2, 1], + "C": [True, False, True] * 3, + }, + index=list("abcdefghi"), + ) kdf = ks.from_pandas(pdf) - self.assert_eq(kdf['a':'e'], pdf['a':'e']) - self.assert_eq(kdf['a':'b'], pdf['a':'b']) - self.assert_eq(kdf['f':], pdf['f':]) + self.assert_eq(kdf["a":"e"], pdf["a":"e"]) + self.assert_eq(kdf["a":"b"], pdf["a":"b"]) + self.assert_eq(kdf["f":], pdf["f":]) def test_loc_on_numpy_datetimes(self): - pdf = pd.DataFrame({'x': [1, 2, 3]}, - index=list(map(np.datetime64, ['2014', '2015', '2016']))) + pdf = pd.DataFrame( + {"x": [1, 2, 3]}, index=list(map(np.datetime64, ["2014", "2015", "2016"])) + ) kdf = ks.from_pandas(pdf) - self.assert_eq(kdf.loc['2014':'2015'], pdf.loc['2014':'2015']) + self.assert_eq(kdf.loc["2014":"2015"], pdf.loc["2014":"2015"]) def test_loc_on_pandas_datetimes(self): - pdf = pd.DataFrame({'x': [1, 2, 3]}, - index=list(map(pd.Timestamp, ['2014', '2015', '2016']))) + pdf = pd.DataFrame( + {"x": [1, 2, 3]}, index=list(map(pd.Timestamp, ["2014", "2015", "2016"])) + ) kdf = ks.from_pandas(pdf) - self.assert_eq(kdf.loc['2014':'2015'], pdf.loc['2014':'2015']) + self.assert_eq(kdf.loc["2014":"2015"], pdf.loc["2014":"2015"]) - @unittest.skip('TODO?: the behavior of slice for datetime') + @unittest.skip("TODO?: the behavior of slice for datetime") def test_loc_datetime_no_freq(self): - datetime_index = pd.date_range('2016-01-01', '2016-01-31', freq='12h') + datetime_index = pd.date_range("2016-01-01", "2016-01-31", freq="12h") datetime_index.freq = None # FORGET FREQUENCY - pdf = pd.DataFrame({'num': range(len(datetime_index))}, index=datetime_index) + pdf = pd.DataFrame({"num": range(len(datetime_index))}, index=datetime_index) kdf = ks.from_pandas(pdf) - slice_ = slice('2016-01-03', '2016-01-05') + slice_ = slice("2016-01-03", "2016-01-05") result = kdf.loc[slice_, :] expected = pdf.loc[slice_, :] self.assert_eq(result, expected) - @unittest.skip('TODO?: the behavior of slice for datetime') + @unittest.skip("TODO?: the behavior of slice for datetime") def test_loc_timestamp_str(self): - pdf = pd.DataFrame({'A': np.random.randn(100), 'B': np.random.randn(100)}, - index=pd.date_range('2011-01-01', freq='H', periods=100)) + pdf = pd.DataFrame( + {"A": np.random.randn(100), "B": np.random.randn(100)}, + index=pd.date_range("2011-01-01", freq="H", periods=100), + ) kdf = ks.from_pandas(pdf) # partial string slice # TODO?: self.assert_eq(pdf.loc['2011-01-02'], # TODO?: kdf.loc['2011-01-02']) - self.assert_eq(pdf.loc['2011-01-02':'2011-01-05'], - kdf.loc['2011-01-02':'2011-01-05']) + self.assert_eq(pdf.loc["2011-01-02":"2011-01-05"], kdf.loc["2011-01-02":"2011-01-05"]) # 
series # TODO?: self.assert_eq(pdf.A.loc['2011-01-02'], # TODO?: kdf.A.loc['2011-01-02']) - self.assert_eq(pdf.A.loc['2011-01-02':'2011-01-05'], - kdf.A.loc['2011-01-02':'2011-01-05']) + self.assert_eq(pdf.A.loc["2011-01-02":"2011-01-05"], kdf.A.loc["2011-01-02":"2011-01-05"]) - pdf = pd.DataFrame({'A': np.random.randn(100), 'B': np.random.randn(100)}, - index=pd.date_range('2011-01-01', freq='M', periods=100)) + pdf = pd.DataFrame( + {"A": np.random.randn(100), "B": np.random.randn(100)}, + index=pd.date_range("2011-01-01", freq="M", periods=100), + ) kdf = ks.from_pandas(pdf) # TODO?: self.assert_eq(pdf.loc['2011-01'], kdf.loc['2011-01']) # TODO?: self.assert_eq(pdf.loc['2011'], kdf.loc['2011']) - self.assert_eq(pdf.loc['2011-01':'2012-05'], kdf.loc['2011-01':'2012-05']) - self.assert_eq(pdf.loc['2011':'2015'], kdf.loc['2011':'2015']) + self.assert_eq(pdf.loc["2011-01":"2012-05"], kdf.loc["2011-01":"2012-05"]) + self.assert_eq(pdf.loc["2011":"2015"], kdf.loc["2011":"2015"]) # series # TODO?: self.assert_eq(pdf.B.loc['2011-01'], kdf.B.loc['2011-01']) # TODO?: self.assert_eq(pdf.B.loc['2011'], kdf.B.loc['2011']) - self.assert_eq(pdf.B.loc['2011-01':'2012-05'], kdf.B.loc['2011-01':'2012-05']) - self.assert_eq(pdf.B.loc['2011':'2015'], kdf.B.loc['2011':'2015']) + self.assert_eq(pdf.B.loc["2011-01":"2012-05"], kdf.B.loc["2011-01":"2012-05"]) + self.assert_eq(pdf.B.loc["2011":"2015"], kdf.B.loc["2011":"2015"]) - @unittest.skip('TODO?: the behavior of slice for datetime') + @unittest.skip("TODO?: the behavior of slice for datetime") def test_getitem_timestamp_str(self): - pdf = pd.DataFrame({'A': np.random.randn(100), 'B': np.random.randn(100)}, - index=pd.date_range('2011-01-01', freq='H', periods=100)) + pdf = pd.DataFrame( + {"A": np.random.randn(100), "B": np.random.randn(100)}, + index=pd.date_range("2011-01-01", freq="H", periods=100), + ) kdf = ks.from_pandas(pdf) # partial string slice # TODO?: self.assert_eq(pdf['2011-01-02'], # TODO?: kdf['2011-01-02']) - self.assert_eq(pdf['2011-01-02':'2011-01-05'], - kdf['2011-01-02':'2011-01-05']) + self.assert_eq(pdf["2011-01-02":"2011-01-05"], kdf["2011-01-02":"2011-01-05"]) - pdf = pd.DataFrame({'A': np.random.randn(100), 'B': np.random.randn(100)}, - index=pd.date_range('2011-01-01', freq='M', periods=100)) + pdf = pd.DataFrame( + {"A": np.random.randn(100), "B": np.random.randn(100)}, + index=pd.date_range("2011-01-01", freq="M", periods=100), + ) kdf = ks.from_pandas(pdf) # TODO?: self.assert_eq(pdf['2011-01'], kdf['2011-01']) # TODO?: self.assert_eq(pdf['2011'], kdf['2011']) - self.assert_eq(pdf['2011-01':'2012-05'], kdf['2011-01':'2012-05']) - self.assert_eq(pdf['2011':'2015'], kdf['2011':'2015']) + self.assert_eq(pdf["2011-01":"2012-05"], kdf["2011-01":"2012-05"]) + self.assert_eq(pdf["2011":"2015"], kdf["2011":"2015"]) - @unittest.skip('TODO?: period index can\'t convert to DataFrame correctly') + @unittest.skip("TODO?: period index can't convert to DataFrame correctly") def test_getitem_period_str(self): - pdf = pd.DataFrame({'A': np.random.randn(100), 'B': np.random.randn(100)}, - index=pd.period_range('2011-01-01', freq='H', periods=100)) + pdf = pd.DataFrame( + {"A": np.random.randn(100), "B": np.random.randn(100)}, + index=pd.period_range("2011-01-01", freq="H", periods=100), + ) kdf = ks.from_pandas(pdf) # partial string slice # TODO?: self.assert_eq(pdf['2011-01-02'], # TODO?: kdf['2011-01-02']) - self.assert_eq(pdf['2011-01-02':'2011-01-05'], - kdf['2011-01-02':'2011-01-05']) + self.assert_eq(pdf["2011-01-02":"2011-01-05"], 
kdf["2011-01-02":"2011-01-05"]) - pdf = pd.DataFrame({'A': np.random.randn(100), 'B': np.random.randn(100)}, - index=pd.period_range('2011-01-01', freq='M', periods=100)) + pdf = pd.DataFrame( + {"A": np.random.randn(100), "B": np.random.randn(100)}, + index=pd.period_range("2011-01-01", freq="M", periods=100), + ) kdf = ks.from_pandas(pdf) # TODO?: self.assert_eq(pdf['2011-01'], kdf['2011-01']) # TODO?: self.assert_eq(pdf['2011'], kdf['2011']) - self.assert_eq(pdf['2011-01':'2012-05'], kdf['2011-01':'2012-05']) - self.assert_eq(pdf['2011':'2015'], kdf['2011':'2015']) + self.assert_eq(pdf["2011-01":"2012-05"], kdf["2011-01":"2012-05"]) + self.assert_eq(pdf["2011":"2015"], kdf["2011":"2015"]) def test_iloc(self): pdf = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}) kdf = ks.from_pandas(pdf) self.assert_eq(kdf.iloc[0, 0], pdf.iloc[0, 0]) - for indexer in [0, - [0], - [0, 1], - [1, 0], - [False, True, True], - slice(0, 1)]: + for indexer in [0, [0], [0, 1], [1, 0], [False, True, True], slice(0, 1)]: self.assert_eq(kdf.iloc[:, indexer], pdf.iloc[:, indexer]) self.assert_eq(kdf.iloc[:1, indexer], pdf.iloc[:1, indexer]) self.assert_eq(kdf.iloc[:-1, indexer], pdf.iloc[:-1, indexer]) self.assert_eq(kdf.iloc[kdf.index == 2, indexer], pdf.iloc[pdf.index == 2, indexer]) def test_iloc_multiindex_columns(self): - arrays = [np.array(['bar', 'bar', 'baz', 'baz']), - np.array(['one', 'two', 'one', 'two'])] + arrays = [np.array(["bar", "bar", "baz", "baz"]), np.array(["one", "two", "one", "two"])] - pdf = pd.DataFrame(np.random.randn(3, 4), index=['A', 'B', 'C'], columns=arrays) + pdf = pd.DataFrame(np.random.randn(3, 4), index=["A", "B", "C"], columns=arrays) kdf = ks.from_pandas(pdf) - for indexer in [0, - [0], - [0, 1], - [1, 0], - [False, True, True, True], - slice(0, 1)]: + for indexer in [0, [0], [0, 1], [1, 0], [False, True, True, True], slice(0, 1)]: self.assert_eq(kdf.iloc[:, indexer], pdf.iloc[:, indexer]) self.assert_eq(kdf.iloc[:1, indexer], pdf.iloc[:1, indexer]) self.assert_eq(kdf.iloc[:-1, indexer], pdf.iloc[:-1, indexer]) - self.assert_eq(kdf.iloc[kdf.index == 'B', indexer], pdf.iloc[pdf.index == 'B', indexer]) + self.assert_eq(kdf.iloc[kdf.index == "B", indexer], pdf.iloc[pdf.index == "B", indexer]) def test_iloc_series(self): pseries = pd.Series([1, 2, 3]) @@ -666,84 +674,85 @@ def test_iloc_series(self): self.assert_eq(kseries.iloc[:-1], pseries.iloc[:-1]) def test_setitem(self): - pdf = pd.DataFrame([[1, 2], [4, 5], [7, 8]], - index=['cobra', 'viper', 'sidewinder'], - columns=['max_speed', 'shield']) + pdf = pd.DataFrame( + [[1, 2], [4, 5], [7, 8]], + index=["cobra", "viper", "sidewinder"], + columns=["max_speed", "shield"], + ) kdf = ks.from_pandas(pdf) - pdf.loc[['viper', 'sidewinder'], ['shield', 'max_speed']] = 10 - kdf.loc[['viper', 'sidewinder'], ['shield', 'max_speed']] = 10 + pdf.loc[["viper", "sidewinder"], ["shield", "max_speed"]] = 10 + kdf.loc[["viper", "sidewinder"], ["shield", "max_speed"]] = 10 self.assert_eq(kdf, pdf) - pdf.loc[['viper', 'sidewinder'], 'shield'] = 50 - kdf.loc[['viper', 'sidewinder'], 'shield'] = 50 + pdf.loc[["viper", "sidewinder"], "shield"] = 50 + kdf.loc[["viper", "sidewinder"], "shield"] = 50 self.assert_eq(kdf, pdf) - with self.assertRaisesRegex(ValueError, - 'Only a dataframe with one column can be assigned'): - kdf.loc[:, 'max_speed'] = kdf - with self.assertRaisesRegex(ValueError, - 'only column names or list of column names can be assigned'): - kdf.loc[['viper'], ('max_speed', 'shield')] = 10 + with self.assertRaisesRegex(ValueError, 
"Only a dataframe with one column can be assigned"): + kdf.loc[:, "max_speed"] = kdf + with self.assertRaisesRegex( + ValueError, "only column names or list of column names can be assigned" + ): + kdf.loc[["viper"], ("max_speed", "shield")] = 10 msg = """Can only assign value to the whole dataframe, the row index has to be `slice(None)` or `:`""" - msg = ("Can only assign value to the whole dataframe, the row index") + msg = "Can only assign value to the whole dataframe, the row index" with self.assertRaisesRegex(SparkPandasNotImplementedError, msg): - kdf.loc['viper', 'max_speed'] = 10 + kdf.loc["viper", "max_speed"] = 10 - pdf = pd.DataFrame([[1], [4], [7]], - index=['cobra', 'viper', 'sidewinder'], - columns=['max_speed']) + pdf = pd.DataFrame( + [[1], [4], [7]], index=["cobra", "viper", "sidewinder"], columns=["max_speed"] + ) kdf = ks.from_pandas(pdf) - pdf.loc[:, 'max_speed'] = pdf - kdf.loc[:, 'max_speed'] = kdf + pdf.loc[:, "max_speed"] = pdf + kdf.loc[:, "max_speed"] = kdf self.assert_eq(kdf, pdf) def test_iloc_raises(self): pdf = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}) kdf = ks.from_pandas(pdf) - with self.assertRaisesRegex(SparkPandasNotImplementedError, - 'Cannot use start or step with Spark.'): + with self.assertRaisesRegex( + SparkPandasNotImplementedError, "Cannot use start or step with Spark." + ): kdf.iloc[0:] - with self.assertRaisesRegex(SparkPandasNotImplementedError, - 'Cannot use start or step with Spark.'): + with self.assertRaisesRegex( + SparkPandasNotImplementedError, "Cannot use start or step with Spark." + ): kdf.iloc[:2:2] - with self.assertRaisesRegex(SparkPandasNotImplementedError, - '.iloc requires numeric slice or conditional boolean Index'): + with self.assertRaisesRegex( + SparkPandasNotImplementedError, + ".iloc requires numeric slice or conditional boolean Index", + ): kdf.iloc[[0, 1], :] - with self.assertRaisesRegex(SparkPandasNotImplementedError, - '.iloc requires numeric slice or conditional boolean Index'): + with self.assertRaisesRegex( + SparkPandasNotImplementedError, + ".iloc requires numeric slice or conditional boolean Index", + ): kdf.A.iloc[[0, 1]] - with self.assertRaisesRegex(SparkPandasIndexingError, - 'Only accepts pairs of candidates'): + with self.assertRaisesRegex(SparkPandasIndexingError, "Only accepts pairs of candidates"): kdf.iloc[[0, 1], [0, 1], [1, 2]] - with self.assertRaisesRegex(SparkPandasIndexingError, - 'Too many indexers'): + with self.assertRaisesRegex(SparkPandasIndexingError, "Too many indexers"): kdf.A.iloc[[0, 1], [0, 1]] - with self.assertRaisesRegex(TypeError, - 'cannot do slice indexing with these indexers'): - kdf.iloc[:'b', :] + with self.assertRaisesRegex(TypeError, "cannot do slice indexing with these indexers"): + kdf.iloc[:"b", :] - with self.assertRaisesRegex(TypeError, - 'cannot do slice indexing with these indexers'): - kdf.iloc[:, :'b'] + with self.assertRaisesRegex(TypeError, "cannot do slice indexing with these indexers"): + kdf.iloc[:, :"b"] - with self.assertRaisesRegex(TypeError, - 'cannot perform reduce with flexible type'): - kdf.iloc[:, ['A']] + with self.assertRaisesRegex(TypeError, "cannot perform reduce with flexible type"): + kdf.iloc[:, ["A"]] - with self.assertRaisesRegex(ValueError, - 'Location based indexing can only have'): - kdf.iloc[:, 'A'] + with self.assertRaisesRegex(ValueError, "Location based indexing can only have"): + kdf.iloc[:, "A"] - with self.assertRaisesRegex(IndexError, - 'out of range'): + with self.assertRaisesRegex(IndexError, "out of range"): kdf.iloc[:, 
[5, 6]] diff --git a/databricks/koalas/tests/test_internal.py b/databricks/koalas/tests/test_internal.py index 04a2d58..aff97a9 100644 --- a/databricks/koalas/tests/test_internal.py +++ b/databricks/koalas/tests/test_internal.py @@ -21,43 +21,42 @@ class InternalFrameTest(ReusedSQLTestCase, SQLTestUtils): - def test_from_pandas(self): - pdf = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}) + pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) internal = _InternalFrame.from_pandas(pdf) sdf = internal.sdf self.assert_eq(internal.index_map, [(SPARK_DEFAULT_INDEX_NAME, None)]) - self.assert_eq(internal.column_labels, [('a', ), ('b', )]) - self.assert_eq(internal.data_columns, ['a', 'b']) - self.assertTrue(internal.scol_for(('a',))._jc.equals(sdf['a']._jc)) - self.assertTrue(internal.scol_for(('b',))._jc.equals(sdf['b']._jc)) + self.assert_eq(internal.column_labels, [("a",), ("b",)]) + self.assert_eq(internal.data_columns, ["a", "b"]) + self.assertTrue(internal.scol_for(("a",))._jc.equals(sdf["a"]._jc)) + self.assertTrue(internal.scol_for(("b",))._jc.equals(sdf["b"]._jc)) self.assert_eq(internal.pandas_df, pdf) # multi-index - pdf.set_index('a', append=True, inplace=True) + pdf.set_index("a", append=True, inplace=True) internal = _InternalFrame.from_pandas(pdf) sdf = internal.sdf - self.assert_eq(internal.index_map, [(SPARK_DEFAULT_INDEX_NAME, None), ('a', ('a',))]) - self.assert_eq(internal.column_labels, [('b', )]) - self.assert_eq(internal.data_columns, ['b']) - self.assertTrue(internal.scol_for(('b',))._jc.equals(sdf['b']._jc)) + self.assert_eq(internal.index_map, [(SPARK_DEFAULT_INDEX_NAME, None), ("a", ("a",))]) + self.assert_eq(internal.column_labels, [("b",)]) + self.assert_eq(internal.data_columns, ["b"]) + self.assertTrue(internal.scol_for(("b",))._jc.equals(sdf["b"]._jc)) self.assert_eq(internal.pandas_df, pdf) # multi-index columns - pdf.columns = pd.MultiIndex.from_tuples([('x', 'b')]) + pdf.columns = pd.MultiIndex.from_tuples([("x", "b")]) internal = _InternalFrame.from_pandas(pdf) sdf = internal.sdf - self.assert_eq(internal.index_map, [(SPARK_DEFAULT_INDEX_NAME, None), ('a', ('a',))]) - self.assert_eq(internal.column_labels, [('x', 'b')]) - self.assert_eq(internal.data_columns, ['(x, b)']) - self.assertTrue(internal.scol_for(('x', 'b'))._jc.equals(sdf['(x, b)']._jc)) + self.assert_eq(internal.index_map, [(SPARK_DEFAULT_INDEX_NAME, None), ("a", ("a",))]) + self.assert_eq(internal.column_labels, [("x", "b")]) + self.assert_eq(internal.data_columns, ["(x, b)"]) + self.assertTrue(internal.scol_for(("x", "b"))._jc.equals(sdf["(x, b)"]._jc)) self.assert_eq(internal.pandas_df, pdf) diff --git a/databricks/koalas/tests/test_namespace.py b/databricks/koalas/tests/test_namespace.py index b9c7f63..0cc9469 100644 --- a/databricks/koalas/tests/test_namespace.py +++ b/databricks/koalas/tests/test_namespace.py @@ -21,11 +21,8 @@ class NamespaceTest(ReusedSQLTestCase, SQLTestUtils): - def test_from_pandas(self): - pdf = pd.DataFrame({'year': [2015, 2016], - 'month': [2, 3], - 'day': [4, 5]}) + pdf = pd.DataFrame({"year": [2015, 2016], "month": [2, 3], "day": [4, 5]}) kdf = ks.from_pandas(pdf) self.assert_eq(kdf, pdf) @@ -40,89 +37,94 @@ def test_from_pandas(self): self.assert_eq(kidx, pidx) - pmidx = pdf.set_index('year', append=True).index + pmidx = pdf.set_index("year", append=True).index kmidx = ks.from_pandas(pmidx) self.assert_eq(kmidx, pmidx) - expected_error_message = 'Unknown data type: {}'.format(type(kidx)) + expected_error_message = "Unknown data type: {}".format(type(kidx)) with 
self.assertRaisesRegex(ValueError, expected_error_message): ks.from_pandas(kidx) def test_to_datetime(self): - pdf = pd.DataFrame({'year': [2015, 2016], - 'month': [2, 3], - 'day': [4, 5]}) + pdf = pd.DataFrame({"year": [2015, 2016], "month": [2, 3], "day": [4, 5]}) kdf = ks.from_pandas(pdf) dict_from_pdf = pdf.to_dict() self.assert_eq(pd.to_datetime(pdf), ks.to_datetime(kdf)) self.assert_eq(pd.to_datetime(dict_from_pdf), ks.to_datetime(dict_from_pdf)) - self.assert_eq(pd.to_datetime(1490195805, unit='s'), - ks.to_datetime(1490195805, unit='s')) - self.assert_eq(pd.to_datetime(1490195805433502912, unit='ns'), - ks.to_datetime(1490195805433502912, unit='ns')) + self.assert_eq(pd.to_datetime(1490195805, unit="s"), ks.to_datetime(1490195805, unit="s")) + self.assert_eq( + pd.to_datetime(1490195805433502912, unit="ns"), + ks.to_datetime(1490195805433502912, unit="ns"), + ) - self.assert_eq(pd.to_datetime([1, 2, 3], unit='D', origin=pd.Timestamp('1960-01-01')), - ks.to_datetime([1, 2, 3], unit='D', origin=pd.Timestamp('1960-01-01'))) + self.assert_eq( + pd.to_datetime([1, 2, 3], unit="D", origin=pd.Timestamp("1960-01-01")), + ks.to_datetime([1, 2, 3], unit="D", origin=pd.Timestamp("1960-01-01")), + ) def test_concat(self): - pdf = pd.DataFrame({'A': [0, 2, 4], 'B': [1, 3, 5]}) + pdf = pd.DataFrame({"A": [0, 2, 4], "B": [1, 3, 5]}) kdf = ks.from_pandas(pdf) - self.assert_eq( - ks.concat([kdf, kdf.reset_index()]), - pd.concat([pdf, pdf.reset_index()])) + self.assert_eq(ks.concat([kdf, kdf.reset_index()]), pd.concat([pdf, pdf.reset_index()])) self.assert_eq( - ks.concat([kdf, kdf[['A']]], ignore_index=True), - pd.concat([pdf, pdf[['A']]], ignore_index=True)) + ks.concat([kdf, kdf[["A"]]], ignore_index=True), + pd.concat([pdf, pdf[["A"]]], ignore_index=True), + ) self.assert_eq( - ks.concat([kdf, kdf[['A']]], join="inner"), - pd.concat([pdf, pdf[['A']]], join="inner")) + ks.concat([kdf, kdf[["A"]]], join="inner"), pd.concat([pdf, pdf[["A"]]], join="inner") + ) self.assertRaisesRegex(TypeError, "first argument must be", lambda: ks.concat(kdf)) - self.assertRaisesRegex( - TypeError, "cannot concatenate object", lambda: ks.concat([kdf, 1])) + self.assertRaisesRegex(TypeError, "cannot concatenate object", lambda: ks.concat([kdf, 1])) - kdf2 = kdf.set_index('B', append=True) + kdf2 = kdf.set_index("B", append=True) self.assertRaisesRegex( - ValueError, "Index type and names should be same", lambda: ks.concat([kdf, kdf2])) + ValueError, "Index type and names should be same", lambda: ks.concat([kdf, kdf2]) + ) self.assertRaisesRegex(ValueError, "No objects to concatenate", lambda: ks.concat([])) - self.assertRaisesRegex( - ValueError, "All objects passed", lambda: ks.concat([None, None])) + self.assertRaisesRegex(ValueError, "All objects passed", lambda: ks.concat([None, None])) self.assertRaisesRegex( - NotImplementedError, 'axis should be either 0 or', - lambda: ks.concat([kdf, kdf], axis=1)) + NotImplementedError, "axis should be either 0 or", lambda: ks.concat([kdf, kdf], axis=1) + ) pdf3 = pdf.copy() kdf3 = kdf.copy() - columns = pd.MultiIndex.from_tuples([('X', 'A'), ('X', 'B')]) + columns = pd.MultiIndex.from_tuples([("X", "A"), ("X", "B")]) pdf3.columns = columns kdf3.columns = columns - self.assert_eq(ks.concat([kdf3, kdf3.reset_index()]), - pd.concat([pdf3, pdf3.reset_index()])) + self.assert_eq(ks.concat([kdf3, kdf3.reset_index()]), pd.concat([pdf3, pdf3.reset_index()])) self.assert_eq( - ks.concat([kdf3, kdf3[[('X', 'A')]]], ignore_index=True), - pd.concat([pdf3, pdf3[[('X', 'A')]]], 
ignore_index=True)) + ks.concat([kdf3, kdf3[[("X", "A")]]], ignore_index=True), + pd.concat([pdf3, pdf3[[("X", "A")]]], ignore_index=True), + ) self.assert_eq( - ks.concat([kdf3, kdf3[[('X', 'A')]]], join="inner"), - pd.concat([pdf3, pdf3[[('X', 'A')]]], join="inner")) + ks.concat([kdf3, kdf3[[("X", "A")]]], join="inner"), + pd.concat([pdf3, pdf3[[("X", "A")]]], join="inner"), + ) - self.assertRaisesRegex(ValueError, "MultiIndex columns should have the same levels", - lambda: ks.concat([kdf, kdf3])) + self.assertRaisesRegex( + ValueError, + "MultiIndex columns should have the same levels", + lambda: ks.concat([kdf, kdf3]), + ) - pdf4 = pd.DataFrame({'A': [0, 2, 4], 'B': [1, 3, 5], 'C': [10, 20, 30]}) + pdf4 = pd.DataFrame({"A": [0, 2, 4], "B": [1, 3, 5], "C": [10, 20, 30]}) kdf4 = ks.from_pandas(pdf4) self.assertRaisesRegex( - ValueError, r'Only can inner \(intersect\) or outer \(union\) join the other axis.', - lambda: ks.concat([kdf, kdf4], join='')) + ValueError, + r"Only can inner \(intersect\) or outer \(union\) join the other axis.", + lambda: ks.concat([kdf, kdf4], join=""), + ) diff --git a/databricks/koalas/tests/test_numpy_compat.py b/databricks/koalas/tests/test_numpy_compat.py index 427ea92..882ecbf 100644 --- a/databricks/koalas/tests/test_numpy_compat.py +++ b/databricks/koalas/tests/test_numpy_compat.py @@ -30,7 +30,6 @@ class NumPyCompatTest(ReusedSQLTestCase, SQLTestUtils): "isnat", "matmul", "frexp", - # Values are close enough but tests failed. "arccos", "exp", @@ -40,7 +39,6 @@ class NumPyCompatTest(ReusedSQLTestCase, SQLTestUtils): "log1p", # flaky "modf", "floor_divide", # flaky - # Results seem inconsistent in a different version of, I (Hyukjin) suspect, PyArrow. # From PyArrow 0.15, seems it returns the correct results via PySpark. Probably we # can enable it later when Koalas switches to PyArrow 0.15 completely. @@ -49,10 +47,10 @@ class NumPyCompatTest(ReusedSQLTestCase, SQLTestUtils): @property def pdf(self): - return pd.DataFrame({ - 'a': [1, 2, 3, 4, 5, 6, 7, 8, 9], - 'b': [4, 5, 6, 3, 2, 1, 0, 0, 0], - }, index=[0, 1, 3, 5, 6, 8, 9, 9, 9]) + return pd.DataFrame( + {"a": [1, 2, 3, 4, 5, 6, 7, 8, 9], "b": [4, 5, 6, 3, 2, 1, 0, 0, 0],}, + index=[0, 1, 3, 5, 6, 8, 9, 9, 9], + ) @property def kdf(self): @@ -85,9 +83,11 @@ def test_np_unsupported_frame(self): def test_np_spark_compat_series(self): # Use randomly generated dataFrame pdf = pd.DataFrame( - np.random.randint(-100, 100, size=(np.random.randint(100), 2)), columns=['a', 'b']) + np.random.randint(-100, 100, size=(np.random.randint(100), 2)), columns=["a", "b"] + ) pdf2 = pd.DataFrame( - np.random.randint(-100, 100, size=(len(pdf), len(pdf.columns))), columns=['a', 'b']) + np.random.randint(-100, 100, size=(len(pdf), len(pdf.columns))), columns=["a", "b"] + ) kdf = ks.from_pandas(pdf) kdf2 = ks.from_pandas(pdf2) @@ -105,16 +105,14 @@ def test_np_spark_compat_series(self): if np_name not in self.blacklist: try: # binary ufunc - self.assert_eq( - np_func(pdf.a, pdf.b), np_func(kdf.a, kdf.b), almost=True) - self.assert_eq( - np_func(pdf.a, 1), np_func(kdf.a, 1), almost=True) + self.assert_eq(np_func(pdf.a, pdf.b), np_func(kdf.a, kdf.b), almost=True) + self.assert_eq(np_func(pdf.a, 1), np_func(kdf.a, 1), almost=True) except Exception as e: raise AssertionError("Test in '%s' function was failed." % np_name) from e # Test only top 5 for now. 'compute.ops_on_diff_frames' option increases too much time. 
try: - set_option('compute.ops_on_diff_frames', True) + set_option("compute.ops_on_diff_frames", True) for np_name, spark_func in list(binary_np_spark_mappings.items())[:5]: np_func = getattr(np, np_name) if np_name not in self.blacklist: @@ -122,18 +120,22 @@ def test_np_spark_compat_series(self): # binary ufunc self.assert_eq( np_func(pdf.a, pdf2.b).sort_index(), - np_func(kdf.a, kdf2.b).sort_index(), almost=True) + np_func(kdf.a, kdf2.b).sort_index(), + almost=True, + ) except Exception as e: raise AssertionError("Test in '%s' function was failed." % np_name) from e finally: - reset_option('compute.ops_on_diff_frames') + reset_option("compute.ops_on_diff_frames") def test_np_spark_compat_frame(self): # Use randomly generated dataFrame pdf = pd.DataFrame( - np.random.randint(-100, 100, size=(np.random.randint(100), 2)), columns=['a', 'b']) + np.random.randint(-100, 100, size=(np.random.randint(100), 2)), columns=["a", "b"] + ) pdf2 = pd.DataFrame( - np.random.randint(-100, 100, size=(len(pdf), len(pdf.columns))), columns=['a', 'b']) + np.random.randint(-100, 100, size=(len(pdf), len(pdf.columns))), columns=["a", "b"] + ) kdf = ks.from_pandas(pdf) kdf2 = ks.from_pandas(pdf2) @@ -151,16 +153,14 @@ def test_np_spark_compat_frame(self): if np_name not in self.blacklist: try: # binary ufunc - self.assert_eq( - np_func(pdf, pdf), np_func(kdf, kdf), almost=True) - self.assert_eq( - np_func(pdf, 1), np_func(kdf, 1), almost=True) + self.assert_eq(np_func(pdf, pdf), np_func(kdf, kdf), almost=True) + self.assert_eq(np_func(pdf, 1), np_func(kdf, 1), almost=True) except Exception as e: raise AssertionError("Test in '%s' function was failed." % np_name) from e # Test only top 5 for now. 'compute.ops_on_diff_frames' option increases too much time. try: - set_option('compute.ops_on_diff_frames', True) + set_option("compute.ops_on_diff_frames", True) for np_name, spark_func in list(binary_np_spark_mappings.items())[:5]: np_func = getattr(np, np_name) if np_name not in self.blacklist: @@ -168,9 +168,11 @@ def test_np_spark_compat_frame(self): # binary ufunc self.assert_eq( np_func(pdf, pdf2).sort_index(), - np_func(kdf, kdf2).sort_index(), almost=True) + np_func(kdf, kdf2).sort_index(), + almost=True, + ) except Exception as e: raise AssertionError("Test in '%s' function was failed." 
% np_name) from e finally: - reset_option('compute.ops_on_diff_frames') + reset_option("compute.ops_on_diff_frames") diff --git a/databricks/koalas/tests/test_ops_on_diff_frames.py b/databricks/koalas/tests/test_ops_on_diff_frames.py index a3eb8a7..21b2a5c 100644 --- a/databricks/koalas/tests/test_ops_on_diff_frames.py +++ b/databricks/koalas/tests/test_ops_on_diff_frames.py @@ -22,84 +22,89 @@ class OpsOnDiffFramesEnabledTest(ReusedSQLTestCase, SQLTestUtils): - @classmethod def setUpClass(cls): super(OpsOnDiffFramesEnabledTest, cls).setUpClass() - set_option('compute.ops_on_diff_frames', True) + set_option("compute.ops_on_diff_frames", True) @classmethod def tearDownClass(cls): - reset_option('compute.ops_on_diff_frames') + reset_option("compute.ops_on_diff_frames") super(OpsOnDiffFramesEnabledTest, cls).tearDownClass() @property def pdf1(self): - return pd.DataFrame({ - 'a': [1, 2, 3, 4, 5, 6, 7, 8, 9], - 'b': [4, 5, 6, 3, 2, 1, 0, 0, 0], - }, index=[0, 1, 3, 5, 6, 8, 9, 10, 11]) + return pd.DataFrame( + {"a": [1, 2, 3, 4, 5, 6, 7, 8, 9], "b": [4, 5, 6, 3, 2, 1, 0, 0, 0],}, + index=[0, 1, 3, 5, 6, 8, 9, 10, 11], + ) @property def pdf2(self): - return pd.DataFrame({ - 'a': [9, 8, 7, 6, 5, 4, 3, 2, 1], - 'b': [0, 0, 0, 4, 5, 6, 1, 2, 3], - }, index=list(range(9))) + return pd.DataFrame( + {"a": [9, 8, 7, 6, 5, 4, 3, 2, 1], "b": [0, 0, 0, 4, 5, 6, 1, 2, 3],}, + index=list(range(9)), + ) @property def pdf3(self): - return pd.DataFrame({ - 'b': [1, 1, 1, 1, 1, 1, 1, 1, 1], - 'c': [1, 1, 1, 1, 1, 1, 1, 1, 1], - }, index=list(range(9))) + return pd.DataFrame( + {"b": [1, 1, 1, 1, 1, 1, 1, 1, 1], "c": [1, 1, 1, 1, 1, 1, 1, 1, 1],}, + index=list(range(9)), + ) @property def pdf4(self): - return pd.DataFrame({ - 'e': [2, 2, 2, 2, 2, 2, 2, 2, 2], - 'f': [2, 2, 2, 2, 2, 2, 2, 2, 2], - }, index=list(range(9))) + return pd.DataFrame( + {"e": [2, 2, 2, 2, 2, 2, 2, 2, 2], "f": [2, 2, 2, 2, 2, 2, 2, 2, 2],}, + index=list(range(9)), + ) @property def pdf5(self): - return pd.DataFrame({ - 'a': [1, 2, 3, 4, 5, 6, 7, 8, 9], - 'b': [4, 5, 6, 3, 2, 1, 0, 0, 0], - 'c': [4, 5, 6, 3, 2, 1, 0, 0, 0], - }, index=[0, 1, 3, 5, 6, 8, 9, 10, 11]).set_index(['a', 'b']) + return pd.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7, 8, 9], + "b": [4, 5, 6, 3, 2, 1, 0, 0, 0], + "c": [4, 5, 6, 3, 2, 1, 0, 0, 0], + }, + index=[0, 1, 3, 5, 6, 8, 9, 10, 11], + ).set_index(["a", "b"]) @property def pdf6(self): - return pd.DataFrame({ - 'a': [9, 8, 7, 6, 5, 4, 3, 2, 1], - 'b': [0, 0, 0, 4, 5, 6, 1, 2, 3], - 'c': [9, 8, 7, 6, 5, 4, 3, 2, 1], - 'e': [4, 5, 6, 3, 2, 1, 0, 0, 0], - }, index=list(range(9))).set_index(['a', 'b']) + return pd.DataFrame( + { + "a": [9, 8, 7, 6, 5, 4, 3, 2, 1], + "b": [0, 0, 0, 4, 5, 6, 1, 2, 3], + "c": [9, 8, 7, 6, 5, 4, 3, 2, 1], + "e": [4, 5, 6, 3, 2, 1, 0, 0, 0], + }, + index=list(range(9)), + ).set_index(["a", "b"]) @property def pser1(self): - midx = pd.MultiIndex([['lama', 'cow', 'falcon', 'koala'], - ['speed', 'weight', 'length', 'power']], - [[0, 3, 1, 1, 1, 2, 2, 2], - [0, 2, 0, 3, 2, 0, 1, 3]]) + midx = pd.MultiIndex( + [["lama", "cow", "falcon", "koala"], ["speed", "weight", "length", "power"]], + [[0, 3, 1, 1, 1, 2, 2, 2], [0, 2, 0, 3, 2, 0, 1, 3]], + ) return pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1], index=midx) @property def pser2(self): - midx = pd.MultiIndex([['lama', 'cow', 'falcon'], - ['speed', 'weight', 'length']], - [[0, 0, 0, 1, 1, 1, 2, 2, 2], - [0, 1, 2, 0, 1, 2, 0, 1, 2]]) + midx = pd.MultiIndex( + [["lama", "cow", "falcon"], ["speed", "weight", "length"]], + [[0, 0, 0, 1, 1, 1, 
2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]], + ) return pd.Series([-45, 200, -1.2, 30, -250, 1.5, 320, 1, -0.3], index=midx) @property def pser3(self): - midx = pd.MultiIndex([['koalas', 'cow', 'falcon'], - ['speed', 'weight', 'length']], - [[0, 0, 0, 1, 1, 1, 2, 2, 2], - [1, 1, 2, 0, 0, 2, 2, 2, 1]]) + midx = pd.MultiIndex( + [["koalas", "cow", "falcon"], ["speed", "weight", "length"]], + [[0, 0, 0, 1, 1, 1, 2, 2, 2], [1, 1, 2, 0, 0, 2, 2, 2, 1]], + ) return pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx) @property @@ -141,13 +146,16 @@ def kser3(self): def test_ranges(self): self.assert_eq( (ks.range(10) + ks.range(10)).sort_index(), - (ks.DataFrame({'id': list(range(10))}) - + ks.DataFrame({'id': list(range(10))})).sort_index()) + ( + ks.DataFrame({"id": list(range(10))}) + ks.DataFrame({"id": list(range(10))}) + ).sort_index(), + ) def test_no_matched_index(self): with self.assertRaisesRegex(ValueError, "Index names must be exactly matched"): - ks.DataFrame({'a': [1, 2, 3]}).set_index('a') + \ - ks.DataFrame({'b': [1, 2, 3]}).set_index('b') + ks.DataFrame({"a": [1, 2, 3]}).set_index("a") + ks.DataFrame( + {"b": [1, 2, 3]} + ).set_index("b") def test_arithmetic(self): kdf1 = self.kdf1 @@ -161,24 +169,24 @@ def test_arithmetic(self): # Series self.assert_eq( - (kdf1.a - kdf2.b).sort_index(), - (pdf1.a - pdf2.b).rename("a").sort_index(), almost=True) + (kdf1.a - kdf2.b).sort_index(), (pdf1.a - pdf2.b).rename("a").sort_index(), almost=True + ) self.assert_eq( - (kdf1.a * kdf2.a).sort_index(), - (pdf1.a * pdf2.a).rename("a").sort_index(), almost=True) + (kdf1.a * kdf2.a).sort_index(), (pdf1.a * pdf2.a).rename("a").sort_index(), almost=True + ) self.assert_eq( (kdf1["a"] / kdf2["a"]).sort_index(), - (pdf1["a"] / pdf2["a"]).rename("a").sort_index(), almost=True) + (pdf1["a"] / pdf2["a"]).rename("a").sort_index(), + almost=True, + ) # DataFrame - self.assert_eq( - (kdf1 + kdf2).sort_index(), - (pdf1 + pdf2).sort_index(), almost=True) + self.assert_eq((kdf1 + kdf2).sort_index(), (pdf1 + pdf2).sort_index(), almost=True) # Multi-index columns - columns = pd.MultiIndex.from_tuples([('x', 'a'), ('x', 'b')]) + columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b")]) kdf1.columns = columns kdf2.columns = columns pdf1.columns = columns @@ -186,38 +194,34 @@ def test_arithmetic(self): # Series self.assert_eq( - (kdf1[('x', 'a')] - kdf2[('x', 'b')]).sort_index(), - (pdf1[('x', 'a')] - pdf2[('x', 'b')]).rename(('x', 'a')).sort_index(), almost=True) + (kdf1[("x", "a")] - kdf2[("x", "b")]).sort_index(), + (pdf1[("x", "a")] - pdf2[("x", "b")]).rename(("x", "a")).sort_index(), + almost=True, + ) self.assert_eq( - (kdf1[('x', 'a')] - kdf2['x']['b']).sort_index(), - (pdf1[('x', 'a')] - pdf2['x']['b']).rename(('x', 'a')).sort_index(), almost=True) + (kdf1[("x", "a")] - kdf2["x"]["b"]).sort_index(), + (pdf1[("x", "a")] - pdf2["x"]["b"]).rename(("x", "a")).sort_index(), + almost=True, + ) self.assert_eq( - (kdf1['x']['a'] - kdf2[('x', 'b')]).sort_index(), - (pdf1['x']['a'] - pdf2[('x', 'b')]).rename('a').sort_index(), almost=True) + (kdf1["x"]["a"] - kdf2[("x", "b")]).sort_index(), + (pdf1["x"]["a"] - pdf2[("x", "b")]).rename("a").sort_index(), + almost=True, + ) # DataFrame - self.assert_eq( - (kdf1 + kdf2).sort_index(), - (pdf1 + pdf2).sort_index(), almost=True) + self.assert_eq((kdf1 + kdf2).sort_index(), (pdf1 + pdf2).sort_index(), almost=True) # MultiIndex Series - self.assert_eq( - (kser1 + kser2).sort_index(), - (pser1 + pser2).sort_index(), almost=True) + self.assert_eq((kser1 + 
kser2).sort_index(), (pser1 + pser2).sort_index(), almost=True) - self.assert_eq( - (kser1 - kser2).sort_index(), - (pser1 - pser2).sort_index(), almost=True) + self.assert_eq((kser1 - kser2).sort_index(), (pser1 - pser2).sort_index(), almost=True) - self.assert_eq( - (kser1 * kser2).sort_index(), - (pser1 * pser2).sort_index(), almost=True) + self.assert_eq((kser1 * kser2).sort_index(), (pser1 * pser2).sort_index(), almost=True) - self.assert_eq( - (kser1 / kser2).sort_index(), - (pser1 / pser2).sort_index(), almost=True) + self.assert_eq((kser1 / kser2).sort_index(), (pser1 / pser2).sort_index(), almost=True) def test_arithmetic_chain(self): kdf1 = self.kdf1 @@ -236,100 +240,113 @@ def test_arithmetic_chain(self): # Series self.assert_eq( (kdf1.a - kdf2.b - kdf3.c).sort_index(), - (pdf1.a - pdf2.b - pdf3.c).rename("a").sort_index(), almost=True) + (pdf1.a - pdf2.b - pdf3.c).rename("a").sort_index(), + almost=True, + ) self.assert_eq( (kdf1.a * (kdf2.a * kdf3.c)).sort_index(), - (pdf1.a * (pdf2.a * pdf3.c)).rename("a").sort_index(), almost=True) + (pdf1.a * (pdf2.a * pdf3.c)).rename("a").sort_index(), + almost=True, + ) self.assert_eq( (kdf1["a"] / kdf2["a"] / kdf3["c"]).sort_index(), (pdf1["a"] / pdf2["a"] / pdf3["c"]).rename("a").sort_index(), - almost=True) + almost=True, + ) # DataFrame self.assert_eq( - (kdf1 + kdf2 - kdf3).sort_index(), - (pdf1 + pdf2 - pdf3).sort_index(), almost=True) + (kdf1 + kdf2 - kdf3).sort_index(), (pdf1 + pdf2 - pdf3).sort_index(), almost=True + ) # Multi-index columns - columns = pd.MultiIndex.from_tuples([('x', 'a'), ('x', 'b')]) + columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b")]) kdf1.columns = columns kdf2.columns = columns pdf1.columns = columns pdf2.columns = columns - columns = pd.MultiIndex.from_tuples([('x', 'b'), ('y', 'c')]) + columns = pd.MultiIndex.from_tuples([("x", "b"), ("y", "c")]) kdf3.columns = columns pdf3.columns = columns # Series self.assert_eq( - (kdf1[('x', 'a')] - kdf2[('x', 'b')] - kdf3[('y', 'c')]).sort_index(), - (pdf1[('x', 'a')] - pdf2[('x', 'b')] - pdf3[('y', 'c')]).rename(('x', 'a')) + (kdf1[("x", "a")] - kdf2[("x", "b")] - kdf3[("y", "c")]).sort_index(), + (pdf1[("x", "a")] - pdf2[("x", "b")] - pdf3[("y", "c")]) + .rename(("x", "a")) .sort_index(), - almost=True) + almost=True, + ) self.assert_eq( - (kdf1[('x', 'a')] * (kdf2[('x', 'b')] * kdf3[('y', 'c')])).sort_index(), - (pdf1[('x', 'a')] * (pdf2[('x', 'b')] * pdf3[('y', 'c')])).rename(('x', 'a')) + (kdf1[("x", "a")] * (kdf2[("x", "b")] * kdf3[("y", "c")])).sort_index(), + (pdf1[("x", "a")] * (pdf2[("x", "b")] * pdf3[("y", "c")])) + .rename(("x", "a")) .sort_index(), - almost=True) + almost=True, + ) # DataFrame self.assert_eq( - (kdf1 + kdf2 - kdf3).sort_index(), - (pdf1 + pdf2 - pdf3).sort_index(), almost=True) + (kdf1 + kdf2 - kdf3).sort_index(), (pdf1 + pdf2 - pdf3).sort_index(), almost=True + ) # MultiIndex Series self.assert_eq( - (kser1 + kser2 - kser3).sort_index(), - (pser1 + pser2 - pser3).sort_index(), almost=True) + (kser1 + kser2 - kser3).sort_index(), (pser1 + pser2 - pser3).sort_index(), almost=True + ) self.assert_eq( - (kser1 * kser2 * kser3).sort_index(), - (pser1 * pser2 * pser3).sort_index(), almost=True) + (kser1 * kser2 * kser3).sort_index(), (pser1 * pser2 * pser3).sort_index(), almost=True + ) self.assert_eq( - (kser1 - kser2 / kser3).sort_index(), - (pser1 - pser2 / pser3).sort_index(), almost=True) + (kser1 - kser2 / kser3).sort_index(), (pser1 - pser2 / pser3).sort_index(), almost=True + ) self.assert_eq( - (kser1 + kser2 * 
kser3).sort_index(), - (pser1 + pser2 * pser3).sort_index(), almost=True) + (kser1 + kser2 * kser3).sort_index(), (pser1 + pser2 * pser3).sort_index(), almost=True + ) def test_getitem_boolean_series(self): - pdf1 = pd.DataFrame({'A': [0, 1, 2, 3, 4], 'B': [100, 200, 300, 400, 500]}, - index=[20, 10, 30, 0, 50]) - pdf2 = pd.DataFrame({'A': [0, -1, -2, -3, -4], 'B': [-100, -200, -300, -400, -500]}, - index=[0, 30, 10, 20, 50]) + pdf1 = pd.DataFrame( + {"A": [0, 1, 2, 3, 4], "B": [100, 200, 300, 400, 500]}, index=[20, 10, 30, 0, 50] + ) + pdf2 = pd.DataFrame( + {"A": [0, -1, -2, -3, -4], "B": [-100, -200, -300, -400, -500]}, + index=[0, 30, 10, 20, 50], + ) kdf1 = ks.from_pandas(pdf1) kdf2 = ks.from_pandas(pdf2) - self.assert_eq(pdf1[pdf2.A > -3].sort_index(), - kdf1[kdf2.A > -3].sort_index()) + self.assert_eq(pdf1[pdf2.A > -3].sort_index(), kdf1[kdf2.A > -3].sort_index()) - self.assert_eq(pdf1.A[pdf2.A > -3].sort_index(), - kdf1.A[kdf2.A > -3].sort_index()) + self.assert_eq(pdf1.A[pdf2.A > -3].sort_index(), kdf1.A[kdf2.A > -3].sort_index()) - self.assert_eq((pdf1.A + 1)[pdf2.A > -3].sort_index(), - (kdf1.A + 1)[kdf2.A > -3].sort_index()) + self.assert_eq( + (pdf1.A + 1)[pdf2.A > -3].sort_index(), (kdf1.A + 1)[kdf2.A > -3].sort_index() + ) def test_loc_getitem_boolean_series(self): - pdf1 = pd.DataFrame({'A': [0, 1, 2, 3, 4], 'B': [100, 200, 300, 400, 500]}, - index=[20, 10, 30, 0, 50]) - pdf2 = pd.DataFrame({'A': [0, -1, -2, -3, -4], 'B': [-100, -200, -300, -400, -500]}, - index=[20, 10, 30, 0, 50]) + pdf1 = pd.DataFrame( + {"A": [0, 1, 2, 3, 4], "B": [100, 200, 300, 400, 500]}, index=[20, 10, 30, 0, 50] + ) + pdf2 = pd.DataFrame( + {"A": [0, -1, -2, -3, -4], "B": [-100, -200, -300, -400, -500]}, + index=[20, 10, 30, 0, 50], + ) kdf1 = ks.from_pandas(pdf1) kdf2 = ks.from_pandas(pdf2) - self.assert_eq(pdf1.loc[pdf2.A > -3].sort_index(), - kdf1.loc[kdf2.A > -3].sort_index()) + self.assert_eq(pdf1.loc[pdf2.A > -3].sort_index(), kdf1.loc[kdf2.A > -3].sort_index()) - self.assert_eq(pdf1.A.loc[pdf2.A > -3].sort_index(), - kdf1.A.loc[kdf2.A > -3].sort_index()) + self.assert_eq(pdf1.A.loc[pdf2.A > -3].sort_index(), kdf1.A.loc[kdf2.A > -3].sort_index()) - self.assert_eq((pdf1.A + 1).loc[pdf2.A > -3].sort_index(), - (kdf1.A + 1).loc[kdf2.A > -3].sort_index()) + self.assert_eq( + (pdf1.A + 1).loc[pdf2.A > -3].sort_index(), (kdf1.A + 1).loc[kdf2.A > -3].sort_index() + ) def test_bitwise(self): pser1 = pd.Series([True, False, True, False, np.nan, np.nan, True, False, np.nan]) @@ -340,8 +357,8 @@ def test_bitwise(self): self.assert_eq(pser1 | pser2, (kser1 | kser2).sort_index()) self.assert_eq(pser1 & pser2, (kser1 & kser2).sort_index()) - pser1 = pd.Series([True, False, np.nan], index=list('ABC')) - pser2 = pd.Series([False, True, np.nan], index=list('DEF')) + pser1 = pd.Series([True, False, np.nan], index=list("ABC")) + pser2 = pd.Series([False, True, np.nan], index=list("DEF")) kser1 = ks.from_pandas(pser1) kser2 = ks.from_pandas(pser2) @@ -354,87 +371,83 @@ def test_different_columns(self): pdf1 = self.pdf1 pdf4 = self.pdf4 - self.assert_eq( - (kdf1 + kdf4).sort_index(), - (pdf1 + pdf4).sort_index(), almost=True) + self.assert_eq((kdf1 + kdf4).sort_index(), (pdf1 + pdf4).sort_index(), almost=True) # Multi-index columns - columns = pd.MultiIndex.from_tuples([('x', 'a'), ('x', 'b')]) + columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b")]) kdf1.columns = columns pdf1.columns = columns - columns = pd.MultiIndex.from_tuples([('z', 'e'), ('z', 'f')]) + columns = pd.MultiIndex.from_tuples([("z", 
"e"), ("z", "f")]) kdf4.columns = columns pdf4.columns = columns - self.assert_eq( - (kdf1 + kdf4).sort_index(), - (pdf1 + pdf4).sort_index(), almost=True) + self.assert_eq((kdf1 + kdf4).sort_index(), (pdf1 + pdf4).sort_index(), almost=True) def test_assignment_series(self): kdf = ks.from_pandas(self.pdf1) pdf = self.pdf1 - kdf['a'] = self.kdf2.a - pdf['a'] = self.pdf2.a + kdf["a"] = self.kdf2.a + pdf["a"] = self.pdf2.a self.assert_eq(kdf.sort_index(), pdf.sort_index()) kdf = ks.from_pandas(self.pdf1) pdf = self.pdf1 - kdf['a'] = self.kdf2.b - pdf['a'] = self.pdf2.b + kdf["a"] = self.kdf2.b + pdf["a"] = self.pdf2.b self.assert_eq(kdf.sort_index(), pdf.sort_index()) kdf = ks.from_pandas(self.pdf1) pdf = self.pdf1 - kdf['c'] = self.kdf2.a - pdf['c'] = self.pdf2.a + kdf["c"] = self.kdf2.a + pdf["c"] = self.pdf2.a self.assert_eq(kdf.sort_index(), pdf.sort_index()) # Multi-index columns kdf = ks.from_pandas(self.pdf1) pdf = self.pdf1 - columns = pd.MultiIndex.from_tuples([('x', 'a'), ('x', 'b')]) + columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b")]) kdf.columns = columns pdf.columns = columns - kdf[('y', 'c')] = self.kdf2.a - pdf[('y', 'c')] = self.pdf2.a + kdf[("y", "c")] = self.kdf2.a + pdf[("y", "c")] = self.pdf2.a self.assert_eq(kdf.sort_index(), pdf.sort_index()) def test_assignment_frame(self): kdf = ks.from_pandas(self.pdf1) pdf = self.pdf1 - kdf[['a', 'b']] = self.kdf1 - pdf[['a', 'b']] = self.pdf1 + kdf[["a", "b"]] = self.kdf1 + pdf[["a", "b"]] = self.pdf1 self.assert_eq(kdf.sort_index(), pdf.sort_index()) # 'c' does not exist in `kdf`. kdf = ks.from_pandas(self.pdf1) pdf = self.pdf1 - kdf[['b', 'c']] = self.kdf1 - pdf[['b', 'c']] = self.pdf1 + kdf[["b", "c"]] = self.kdf1 + pdf[["b", "c"]] = self.pdf1 self.assert_eq(kdf.sort_index(), pdf.sort_index()) # 'c' and 'd' do not exist in `kdf`. 
kdf = ks.from_pandas(self.pdf1) pdf = self.pdf1 - kdf[['c', 'd']] = self.kdf1 - pdf[['c', 'd']] = self.pdf1 + kdf[["c", "d"]] = self.kdf1 + pdf[["c", "d"]] = self.pdf1 self.assert_eq(kdf.sort_index(), pdf.sort_index()) # Multi-index columns - columns = pd.MultiIndex.from_tuples([('x', 'a'), ('x', 'b')]) + columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b")]) kdf = ks.from_pandas(self.pdf1) pdf = self.pdf1 kdf.columns = columns pdf.columns = columns - kdf[[('y', 'c'), ('z', 'd')]] = self.kdf1 - pdf[[('y', 'c'), ('z', 'd')]] = self.pdf1 + kdf[[("y", "c"), ("z", "d")]] = self.kdf1 + pdf[[("y", "c"), ("z", "d")]] = self.pdf1 self.assert_eq(kdf.sort_index(), pdf.sort_index()) @@ -444,36 +457,36 @@ def test_assignment_frame(self): pdf1 = self.pdf1 kdf1.columns = columns pdf1.columns = columns - kdf[['c', 'd']] = kdf1 - pdf[['c', 'd']] = pdf1 + kdf[["c", "d"]] = kdf1 + pdf[["c", "d"]] = pdf1 self.assert_eq(kdf.sort_index(), pdf.sort_index()) def test_assignment_series_chain(self): kdf = ks.from_pandas(self.pdf1) pdf = self.pdf1 - kdf['a'] = self.kdf1.a - pdf['a'] = self.pdf1.a + kdf["a"] = self.kdf1.a + pdf["a"] = self.pdf1.a - kdf['a'] = self.kdf2.b - pdf['a'] = self.pdf2.b + kdf["a"] = self.kdf2.b + pdf["a"] = self.pdf2.b - kdf['d'] = self.kdf3.c - pdf['d'] = self.pdf3.c + kdf["d"] = self.kdf3.c + pdf["d"] = self.pdf3.c self.assert_eq(kdf.sort_index(), pdf.sort_index()) def test_assignment_frame_chain(self): kdf = ks.from_pandas(self.pdf1) pdf = self.pdf1 - kdf[['a', 'b']] = self.kdf1 - pdf[['a', 'b']] = self.pdf1 + kdf[["a", "b"]] = self.kdf1 + pdf[["a", "b"]] = self.pdf1 - kdf[['e', 'f']] = self.kdf3 - pdf[['e', 'f']] = self.pdf3 + kdf[["e", "f"]] = self.kdf3 + pdf[["e", "f"]] = self.pdf3 - kdf[['b', 'c']] = self.kdf2 - pdf[['b', 'c']] = self.pdf2 + kdf[["b", "c"]] = self.kdf2 + pdf[["b", "c"]] = self.pdf2 self.assert_eq(kdf.sort_index(), pdf.sort_index()) @@ -485,184 +498,180 @@ def test_multi_index_arithmetic(self): # Series self.assert_eq( - (kdf5.c - kdf6.e).sort_index(), - (pdf5.c - pdf6.e).rename("c").sort_index(), almost=True) + (kdf5.c - kdf6.e).sort_index(), (pdf5.c - pdf6.e).rename("c").sort_index(), almost=True + ) self.assert_eq( (kdf5["c"] / kdf6["e"]).sort_index(), - (pdf5["c"] / pdf6["e"]).rename("c").sort_index(), almost=True) + (pdf5["c"] / pdf6["e"]).rename("c").sort_index(), + almost=True, + ) # DataFrame - self.assert_eq( - (kdf5 + kdf6).sort_index(), - (pdf5 + pdf6).sort_index(), almost=True) + self.assert_eq((kdf5 + kdf6).sort_index(), (pdf5 + pdf6).sort_index(), almost=True) def test_multi_index_assignment_series(self): kdf = ks.from_pandas(self.pdf5) pdf = self.pdf5 - kdf['x'] = self.kdf6.e - pdf['x'] = self.pdf6.e + kdf["x"] = self.kdf6.e + pdf["x"] = self.pdf6.e self.assert_eq(kdf.sort_index(), pdf.sort_index()) kdf = ks.from_pandas(self.pdf5) pdf = self.pdf5 - kdf['e'] = self.kdf6.e - pdf['e'] = self.pdf6.e + kdf["e"] = self.kdf6.e + pdf["e"] = self.pdf6.e self.assert_eq(kdf.sort_index(), pdf.sort_index()) kdf = ks.from_pandas(self.pdf5) pdf = self.pdf5 - kdf['c'] = self.kdf6.e - pdf['c'] = self.pdf6.e + kdf["c"] = self.kdf6.e + pdf["c"] = self.pdf6.e self.assert_eq(kdf.sort_index(), pdf.sort_index()) def test_multi_index_assignment_frame(self): kdf = ks.from_pandas(self.pdf5) pdf = self.pdf5 - kdf[['c']] = self.kdf5 - pdf[['c']] = self.pdf5 + kdf[["c"]] = self.kdf5 + pdf[["c"]] = self.pdf5 self.assert_eq(kdf.sort_index(), pdf.sort_index()) kdf = ks.from_pandas(self.pdf5) pdf = self.pdf5 - kdf[['x']] = self.kdf5 - pdf[['x']] = self.pdf5 + kdf[["x"]] = 
self.kdf5 + pdf[["x"]] = self.pdf5 self.assert_eq(kdf.sort_index(), pdf.sort_index()) kdf = ks.from_pandas(self.pdf6) pdf = self.pdf6 - kdf[['x', 'y']] = self.kdf6 - pdf[['x', 'y']] = self.pdf6 + kdf[["x", "y"]] = self.kdf6 + pdf[["x", "y"]] = self.pdf6 self.assert_eq(kdf.sort_index(), pdf.sort_index()) def test_loc_setitem(self): pdf = pd.DataFrame( [[1, 2], [4, 5], [7, 8]], - index=['cobra', 'viper', 'sidewinder'], - columns=['max_speed', 'shield']) + index=["cobra", "viper", "sidewinder"], + columns=["max_speed", "shield"], + ) kdf = ks.DataFrame(pdf) another_kdf = ks.DataFrame(pdf) - kdf.loc[['viper', 'sidewinder'], ['shield']] = another_kdf.max_speed - pdf.loc[['viper', 'sidewinder'], ['shield']] = pdf.max_speed + kdf.loc[["viper", "sidewinder"], ["shield"]] = another_kdf.max_speed + pdf.loc[["viper", "sidewinder"], ["shield"]] = pdf.max_speed self.assert_eq(kdf.sort_index(), pdf.sort_index()) def test_where(self): - pdf1 = pd.DataFrame({'A': [0, 1, 2, 3, 4], 'B': [100, 200, 300, 400, 500]}) - pdf2 = pd.DataFrame({'A': [0, -1, -2, -3, -4], 'B': [-100, -200, -300, -400, -500]}) + pdf1 = pd.DataFrame({"A": [0, 1, 2, 3, 4], "B": [100, 200, 300, 400, 500]}) + pdf2 = pd.DataFrame({"A": [0, -1, -2, -3, -4], "B": [-100, -200, -300, -400, -500]}) kdf1 = ks.from_pandas(pdf1) kdf2 = ks.from_pandas(pdf2) - self.assert_eq(repr(pdf1.where(pdf2 > 100)), - repr(kdf1.where(kdf2 > 100).sort_index())) + self.assert_eq(repr(pdf1.where(pdf2 > 100)), repr(kdf1.where(kdf2 > 100).sort_index())) - pdf1 = pd.DataFrame({'A': [-1, -2, -3, -4, -5], 'B': [-100, -200, -300, -400, -500]}) - pdf2 = pd.DataFrame({'A': [-10, -20, -30, -40, -50], 'B': [-5, -4, -3, -2, -1]}) + pdf1 = pd.DataFrame({"A": [-1, -2, -3, -4, -5], "B": [-100, -200, -300, -400, -500]}) + pdf2 = pd.DataFrame({"A": [-10, -20, -30, -40, -50], "B": [-5, -4, -3, -2, -1]}) kdf1 = ks.from_pandas(pdf1) kdf2 = ks.from_pandas(pdf2) - self.assert_eq(repr(pdf1.where(pdf2 < -250)), - repr(kdf1.where(kdf2 < -250).sort_index())) + self.assert_eq(repr(pdf1.where(pdf2 < -250)), repr(kdf1.where(kdf2 < -250).sort_index())) # multi-index columns - pdf1 = pd.DataFrame({('X', 'A'): [0, 1, 2, 3, 4], - ('X', 'B'): [100, 200, 300, 400, 500]}) - pdf2 = pd.DataFrame({('X', 'A'): [0, -1, -2, -3, -4], - ('X', 'B'): [-100, -200, -300, -400, -500]}) + pdf1 = pd.DataFrame({("X", "A"): [0, 1, 2, 3, 4], ("X", "B"): [100, 200, 300, 400, 500]}) + pdf2 = pd.DataFrame( + {("X", "A"): [0, -1, -2, -3, -4], ("X", "B"): [-100, -200, -300, -400, -500]} + ) kdf1 = ks.from_pandas(pdf1) kdf2 = ks.from_pandas(pdf2) - self.assert_eq(repr(pdf1.where(pdf2 > 100)), - repr(kdf1.where(kdf2 > 100).sort_index())) + self.assert_eq(repr(pdf1.where(pdf2 > 100)), repr(kdf1.where(kdf2 > 100).sort_index())) def test_mask(self): - pdf1 = pd.DataFrame({'A': [0, 1, 2, 3, 4], 'B': [100, 200, 300, 400, 500]}) - pdf2 = pd.DataFrame({'A': [0, -1, -2, -3, -4], 'B': [-100, -200, -300, -400, -500]}) + pdf1 = pd.DataFrame({"A": [0, 1, 2, 3, 4], "B": [100, 200, 300, 400, 500]}) + pdf2 = pd.DataFrame({"A": [0, -1, -2, -3, -4], "B": [-100, -200, -300, -400, -500]}) kdf1 = ks.from_pandas(pdf1) kdf2 = ks.from_pandas(pdf2) - self.assert_eq(repr(pdf1.mask(pdf2 < 100)), - repr(kdf1.mask(kdf2 < 100).sort_index())) + self.assert_eq(repr(pdf1.mask(pdf2 < 100)), repr(kdf1.mask(kdf2 < 100).sort_index())) - pdf1 = pd.DataFrame({'A': [-1, -2, -3, -4, -5], 'B': [-100, -200, -300, -400, -500]}) - pdf2 = pd.DataFrame({'A': [-10, -20, -30, -40, -50], 'B': [-5, -4, -3, -2, -1]}) + pdf1 = pd.DataFrame({"A": [-1, -2, -3, -4, -5], "B": 
[-100, -200, -300, -400, -500]}) + pdf2 = pd.DataFrame({"A": [-10, -20, -30, -40, -50], "B": [-5, -4, -3, -2, -1]}) kdf1 = ks.from_pandas(pdf1) kdf2 = ks.from_pandas(pdf2) - self.assert_eq(repr(pdf1.mask(pdf2 > -250)), - repr(kdf1.mask(kdf2 > -250).sort_index())) + self.assert_eq(repr(pdf1.mask(pdf2 > -250)), repr(kdf1.mask(kdf2 > -250).sort_index())) # multi-index columns - pdf1 = pd.DataFrame({('X', 'A'): [0, 1, 2, 3, 4], - ('X', 'B'): [100, 200, 300, 400, 500]}) - pdf2 = pd.DataFrame({('X', 'A'): [0, -1, -2, -3, -4], - ('X', 'B'): [-100, -200, -300, -400, -500]}) + pdf1 = pd.DataFrame({("X", "A"): [0, 1, 2, 3, 4], ("X", "B"): [100, 200, 300, 400, 500]}) + pdf2 = pd.DataFrame( + {("X", "A"): [0, -1, -2, -3, -4], ("X", "B"): [-100, -200, -300, -400, -500]} + ) kdf1 = ks.from_pandas(pdf1) kdf2 = ks.from_pandas(pdf2) - self.assert_eq(repr(pdf1.mask(pdf2 < 100)), - repr(kdf1.mask(kdf2 < 100).sort_index())) + self.assert_eq(repr(pdf1.mask(pdf2 < 100)), repr(kdf1.mask(kdf2 < 100).sort_index())) def test_multi_index_column_assignment_frame(self): - pdf = pd.DataFrame({'a': [1, 2, 3, 2], 'b': [4.0, 2.0, 3.0, 1.0]}) - pdf.columns = pd.MultiIndex.from_tuples([('a', 'x'), ('a', 'y')]) + pdf = pd.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]}) + pdf.columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")]) kdf = ks.DataFrame(pdf) - kdf['c'] = ks.Series([10, 20, 30, 20]) - pdf['c'] = pd.Series([10, 20, 30, 20]) + kdf["c"] = ks.Series([10, 20, 30, 20]) + pdf["c"] = pd.Series([10, 20, 30, 20]) - kdf[('d', 'x')] = ks.Series([100, 200, 300, 200], name='1') - pdf[('d', 'x')] = pd.Series([100, 200, 300, 200], name='1') + kdf[("d", "x")] = ks.Series([100, 200, 300, 200], name="1") + pdf[("d", "x")] = pd.Series([100, 200, 300, 200], name="1") - kdf[('d', 'y')] = ks.Series([1000, 2000, 3000, 2000], name=('1', '2')) - pdf[('d', 'y')] = pd.Series([1000, 2000, 3000, 2000], name=('1', '2')) + kdf[("d", "y")] = ks.Series([1000, 2000, 3000, 2000], name=("1", "2")) + pdf[("d", "y")] = pd.Series([1000, 2000, 3000, 2000], name=("1", "2")) - kdf['e'] = ks.Series([10000, 20000, 30000, 20000], name=('1', '2', '3')) - pdf['e'] = pd.Series([10000, 20000, 30000, 20000], name=('1', '2', '3')) + kdf["e"] = ks.Series([10000, 20000, 30000, 20000], name=("1", "2", "3")) + pdf["e"] = pd.Series([10000, 20000, 30000, 20000], name=("1", "2", "3")) - kdf[[('f', 'x'), ('f', 'y')]] = ks.DataFrame({'1': [100000, 200000, 300000, 200000], - '2': [1000000, 2000000, 3000000, 2000000]}) - pdf[[('f', 'x'), ('f', 'y')]] = pd.DataFrame({'1': [100000, 200000, 300000, 200000], - '2': [1000000, 2000000, 3000000, 2000000]}) + kdf[[("f", "x"), ("f", "y")]] = ks.DataFrame( + {"1": [100000, 200000, 300000, 200000], "2": [1000000, 2000000, 3000000, 2000000]} + ) + pdf[[("f", "x"), ("f", "y")]] = pd.DataFrame( + {"1": [100000, 200000, 300000, 200000], "2": [1000000, 2000000, 3000000, 2000000]} + ) self.assert_eq(repr(kdf.sort_index()), repr(pdf)) - with self.assertRaisesRegex(KeyError, 'Key length \\(3\\) exceeds index depth \\(2\\)'): - kdf[('1', '2', '3')] = ks.Series([100, 200, 300, 200]) + with self.assertRaisesRegex(KeyError, "Key length \\(3\\) exceeds index depth \\(2\\)"): + kdf[("1", "2", "3")] = ks.Series([100, 200, 300, 200]) class OpsOnDiffFramesDisabledTest(ReusedSQLTestCase, SQLTestUtils): - @classmethod def setUpClass(cls): super(OpsOnDiffFramesDisabledTest, cls).setUpClass() - set_option('compute.ops_on_diff_frames', False) + set_option("compute.ops_on_diff_frames", False) @classmethod def tearDownClass(cls): - 
reset_option('compute.ops_on_diff_frames') + reset_option("compute.ops_on_diff_frames") super(OpsOnDiffFramesDisabledTest, cls).tearDownClass() @property def pdf1(self): - return pd.DataFrame({ - 'a': [1, 2, 3, 4, 5, 6, 7, 8, 9], - 'b': [4, 5, 6, 3, 2, 1, 0, 0, 0], - }, index=[0, 1, 3, 5, 6, 8, 9, 9, 9]) + return pd.DataFrame( + {"a": [1, 2, 3, 4, 5, 6, 7, 8, 9], "b": [4, 5, 6, 3, 2, 1, 0, 0, 0],}, + index=[0, 1, 3, 5, 6, 8, 9, 9, 9], + ) @property def pdf2(self): - return pd.DataFrame({ - 'a': [9, 8, 7, 6, 5, 4, 3, 2, 1], - 'b': [0, 0, 0, 4, 5, 6, 1, 2, 3], - }, index=list(range(9))) + return pd.DataFrame( + {"a": [9, 8, 7, 6, 5, 4, 3, 2, 1], "b": [0, 0, 0, 4, 5, 6, 1, 2, 3],}, + index=list(range(9)), + ) @property def kdf1(self): @@ -688,53 +697,52 @@ def test_arithmetic(self): def test_assignment(self): with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"): kdf = ks.from_pandas(self.pdf1) - kdf['c'] = self.kdf1.a + kdf["c"] = self.kdf1.a def test_loc_setitem(self): pdf = pd.DataFrame( [[1, 2], [4, 5], [7, 8]], - index=['cobra', 'viper', 'sidewinder'], - columns=['max_speed', 'shield']) + index=["cobra", "viper", "sidewinder"], + columns=["max_speed", "shield"], + ) kdf = ks.DataFrame(pdf) another_kdf = ks.DataFrame(pdf) with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"): - kdf.loc[['viper', 'sidewinder'], ['shield']] = another_kdf.max_speed + kdf.loc[["viper", "sidewinder"], ["shield"]] = another_kdf.max_speed def test_where(self): - pdf1 = pd.DataFrame({'A': [0, 1, 2, 3, 4], 'B': [100, 200, 300, 400, 500]}) - pdf2 = pd.DataFrame({'A': [0, -1, -2, -3, -4], 'B': [-100, -200, -300, -400, -500]}) + pdf1 = pd.DataFrame({"A": [0, 1, 2, 3, 4], "B": [100, 200, 300, 400, 500]}) + pdf2 = pd.DataFrame({"A": [0, -1, -2, -3, -4], "B": [-100, -200, -300, -400, -500]}) kdf1 = ks.from_pandas(pdf1) kdf2 = ks.from_pandas(pdf2) with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"): - self.assert_eq(repr(pdf1.where(pdf2 > 100)), - repr(kdf1.where(kdf2 > 100).sort_index())) + self.assert_eq(repr(pdf1.where(pdf2 > 100)), repr(kdf1.where(kdf2 > 100).sort_index())) - pdf1 = pd.DataFrame({'A': [-1, -2, -3, -4, -5], 'B': [-100, -200, -300, -400, -500]}) - pdf2 = pd.DataFrame({'A': [-10, -20, -30, -40, -50], 'B': [-5, -4, -3, -2, -1]}) + pdf1 = pd.DataFrame({"A": [-1, -2, -3, -4, -5], "B": [-100, -200, -300, -400, -500]}) + pdf2 = pd.DataFrame({"A": [-10, -20, -30, -40, -50], "B": [-5, -4, -3, -2, -1]}) kdf1 = ks.from_pandas(pdf1) kdf2 = ks.from_pandas(pdf2) with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"): - self.assert_eq(repr(pdf1.where(pdf2 < -250)), - repr(kdf1.where(kdf2 < -250).sort_index())) + self.assert_eq( + repr(pdf1.where(pdf2 < -250)), repr(kdf1.where(kdf2 < -250).sort_index()) + ) def test_mask(self): - pdf1 = pd.DataFrame({'A': [0, 1, 2, 3, 4], 'B': [100, 200, 300, 400, 500]}) - pdf2 = pd.DataFrame({'A': [0, -1, -2, -3, -4], 'B': [-100, -200, -300, -400, -500]}) + pdf1 = pd.DataFrame({"A": [0, 1, 2, 3, 4], "B": [100, 200, 300, 400, 500]}) + pdf2 = pd.DataFrame({"A": [0, -1, -2, -3, -4], "B": [-100, -200, -300, -400, -500]}) kdf1 = ks.from_pandas(pdf1) kdf2 = ks.from_pandas(pdf2) with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"): - self.assert_eq(repr(pdf1.mask(pdf2 < 100)), - repr(kdf1.mask(kdf2 < 100).sort_index())) + self.assert_eq(repr(pdf1.mask(pdf2 < 100)), repr(kdf1.mask(kdf2 < 100).sort_index())) - pdf1 = pd.DataFrame({'A': [-1, -2, -3, -4, 
-5], 'B': [-100, -200, -300, -400, -500]}) - pdf2 = pd.DataFrame({'A': [-10, -20, -30, -40, -50], 'B': [-5, -4, -3, -2, -1]}) + pdf1 = pd.DataFrame({"A": [-1, -2, -3, -4, -5], "B": [-100, -200, -300, -400, -500]}) + pdf2 = pd.DataFrame({"A": [-10, -20, -30, -40, -50], "B": [-5, -4, -3, -2, -1]}) kdf1 = ks.from_pandas(pdf1) kdf2 = ks.from_pandas(pdf2) with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"): - self.assert_eq(repr(pdf1.mask(pdf2 > -250)), - repr(kdf1.mask(kdf2 > -250).sort_index())) + self.assert_eq(repr(pdf1.mask(pdf2 > -250)), repr(kdf1.mask(kdf2 > -250).sort_index())) diff --git a/databricks/koalas/tests/test_repr.py b/databricks/koalas/tests/test_repr.py index 319f66b..c21afb8 100644 --- a/databricks/koalas/tests/test_repr.py +++ b/databricks/koalas/tests/test_repr.py @@ -85,8 +85,10 @@ def test_html_repr(self): self.assertEqual(kdf._repr_html_(), kdf.to_pandas()._repr_html_()) def test_repr_float_index(self): - kdf = ks.DataFrame({'a': np.random.rand(ReprTest.max_display_count)}, - index=np.random.rand(ReprTest.max_display_count)) + kdf = ks.DataFrame( + {"a": np.random.rand(ReprTest.max_display_count)}, + index=np.random.rand(ReprTest.max_display_count), + ) self.assertTrue("Showing only the first" not in repr(kdf)) self.assert_eq(repr(kdf), repr(kdf.to_pandas())) self.assertTrue("Showing only the first" not in repr(kdf.a)) @@ -97,8 +99,10 @@ def test_repr_float_index(self): self.assertTrue("Showing only the first" not in kdf._repr_html_()) self.assertEqual(kdf._repr_html_(), kdf.to_pandas()._repr_html_()) - kdf = ks.DataFrame({'a': np.random.rand(ReprTest.max_display_count + 1)}, - index=np.random.rand(ReprTest.max_display_count + 1)) + kdf = ks.DataFrame( + {"a": np.random.rand(ReprTest.max_display_count + 1)}, + index=np.random.rand(ReprTest.max_display_count + 1), + ) self.assertTrue("Showing only the first" in repr(kdf)) self.assertTrue("Showing only the first" in repr(kdf.a)) self.assertTrue("Showing only the first" in repr(kdf.index)) diff --git a/databricks/koalas/tests/test_reshape.py b/databricks/koalas/tests/test_reshape.py index 9a715f2..7becd11 100644 --- a/databricks/koalas/tests/test_reshape.py +++ b/databricks/koalas/tests/test_reshape.py @@ -26,48 +26,70 @@ class ReshapeTest(ReusedSQLTestCase): - def test_get_dummies(self): - for pdf_or_ps in [pd.Series([1, 1, 1, 2, 2, 1, 3, 4]), - # pd.Series([1, 1, 1, 2, 2, 1, 3, 4], dtype='category'), - # pd.Series(pd.Categorical([1, 1, 1, 2, 2, 1, 3, 4], - # categories=[4, 3, 2, 1])), - pd.DataFrame({'a': [1, 2, 3, 4, 4, 3, 2, 1], - # 'b': pd.Categorical(list('abcdabcd')), - 'b': list('abcdabcd')})]: + for pdf_or_ps in [ + pd.Series([1, 1, 1, 2, 2, 1, 3, 4]), + # pd.Series([1, 1, 1, 2, 2, 1, 3, 4], dtype='category'), + # pd.Series(pd.Categorical([1, 1, 1, 2, 2, 1, 3, 4], + # categories=[4, 3, 2, 1])), + pd.DataFrame( + { + "a": [1, 2, 3, 4, 4, 3, 2, 1], + # 'b': pd.Categorical(list('abcdabcd')), + "b": list("abcdabcd"), + } + ), + ]: kdf_or_kser = ks.from_pandas(pdf_or_ps) self.assert_eq(ks.get_dummies(kdf_or_kser), pd.get_dummies(pdf_or_ps), almost=True) kser = ks.Series([1, 1, 1, 2, 2, 1, 3, 4]) with self.assertRaisesRegex( - NotImplementedError, 'get_dummies currently does not support sparse'): + NotImplementedError, "get_dummies currently does not support sparse" + ): ks.get_dummies(kser, sparse=True) def test_get_dummies_object(self): - pdf = pd.DataFrame({'a': [1, 2, 3, 4, 4, 3, 2, 1], - # 'a': pd.Categorical([1, 2, 3, 4, 4, 3, 2, 1]), - 'b': list('abcdabcd'), - # 'c': 
pd.Categorical(list('abcdabcd')), - 'c': list('abcdabcd')}) + pdf = pd.DataFrame( + { + "a": [1, 2, 3, 4, 4, 3, 2, 1], + # 'a': pd.Categorical([1, 2, 3, 4, 4, 3, 2, 1]), + "b": list("abcdabcd"), + # 'c': pd.Categorical(list('abcdabcd')), + "c": list("abcdabcd"), + } + ) kdf = ks.from_pandas(pdf) # Explicitly exclude object columns - self.assert_eq(ks.get_dummies(kdf, columns=['a', 'c']), - pd.get_dummies(pdf, columns=['a', 'c']), almost=True) + self.assert_eq( + ks.get_dummies(kdf, columns=["a", "c"]), + pd.get_dummies(pdf, columns=["a", "c"]), + almost=True, + ) self.assert_eq(ks.get_dummies(kdf), pd.get_dummies(pdf), almost=True) self.assert_eq(ks.get_dummies(kdf.b), pd.get_dummies(pdf.b), almost=True) - self.assert_eq(ks.get_dummies(kdf, columns=['b']), - pd.get_dummies(pdf, columns=['b']), almost=True) + self.assert_eq( + ks.get_dummies(kdf, columns=["b"]), pd.get_dummies(pdf, columns=["b"]), almost=True + ) def test_get_dummies_date_datetime(self): - pdf = pd.DataFrame({'d': [datetime.date(2019, 1, 1), - datetime.date(2019, 1, 2), - datetime.date(2019, 1, 1)], - 'dt': [datetime.datetime(2019, 1, 1, 0, 0, 0), - datetime.datetime(2019, 1, 1, 0, 0, 1), - datetime.datetime(2019, 1, 1, 0, 0, 0)]}) + pdf = pd.DataFrame( + { + "d": [ + datetime.date(2019, 1, 1), + datetime.date(2019, 1, 2), + datetime.date(2019, 1, 1), + ], + "dt": [ + datetime.datetime(2019, 1, 1, 0, 0, 0), + datetime.datetime(2019, 1, 1, 0, 0, 1), + datetime.datetime(2019, 1, 1, 0, 0, 0), + ], + } + ) kdf = ks.from_pandas(pdf) self.assert_eq(ks.get_dummies(kdf), pd.get_dummies(pdf), almost=True) @@ -75,14 +97,14 @@ def test_get_dummies_date_datetime(self): self.assert_eq(ks.get_dummies(kdf.dt), pd.get_dummies(pdf.dt), almost=True) def test_get_dummies_boolean(self): - pdf = pd.DataFrame({'b': [True, False, True]}) + pdf = pd.DataFrame({"b": [True, False, True]}) kdf = ks.from_pandas(pdf) self.assert_eq(ks.get_dummies(kdf), pd.get_dummies(pdf), almost=True) self.assert_eq(ks.get_dummies(kdf.b), pd.get_dummies(pdf.b), almost=True) def test_get_dummies_decimal(self): - pdf = pd.DataFrame({'d': [Decimal(1.0), Decimal(2.0), Decimal(1)]}) + pdf = pd.DataFrame({"d": [Decimal(1.0), Decimal(2.0), Decimal(1)]}) kdf = ks.from_pandas(pdf) self.assert_eq(ks.get_dummies(kdf), pd.get_dummies(pdf), almost=True) @@ -92,11 +114,17 @@ def test_get_dummies_kwargs(self): # pser = pd.Series([1, 1, 1, 2, 2, 1, 3, 4], dtype='category') pser = pd.Series([1, 1, 1, 2, 2, 1, 3, 4]) kser = ks.from_pandas(pser) - self.assert_eq(ks.get_dummies(kser, prefix='X', prefix_sep='-'), - pd.get_dummies(pser, prefix='X', prefix_sep='-'), almost=True) - - self.assert_eq(ks.get_dummies(kser, drop_first=True), - pd.get_dummies(pser, drop_first=True), almost=True) + self.assert_eq( + ks.get_dummies(kser, prefix="X", prefix_sep="-"), + pd.get_dummies(pser, prefix="X", prefix_sep="-"), + almost=True, + ) + + self.assert_eq( + ks.get_dummies(kser, drop_first=True), + pd.get_dummies(pser, drop_first=True), + almost=True, + ) # nan # pser = pd.Series([1, 1, 1, 2, np.nan, 3, np.nan, 5], dtype='category') @@ -105,73 +133,94 @@ def test_get_dummies_kwargs(self): self.assert_eq(ks.get_dummies(kser), pd.get_dummies(pser), almost=True) # dummy_na - self.assert_eq(ks.get_dummies(kser, dummy_na=True), - pd.get_dummies(pser, dummy_na=True), almost=True) + self.assert_eq( + ks.get_dummies(kser, dummy_na=True), pd.get_dummies(pser, dummy_na=True), almost=True + ) def test_get_dummies_prefix(self): - pdf = pd.DataFrame({ - "A": ['a', 'b', 'a'], - "B": ['b', 'a', 'c'], - "D": [0, 0, 
1], - }) + pdf = pd.DataFrame({"A": ["a", "b", "a"], "B": ["b", "a", "c"], "D": [0, 0, 1],}) kdf = ks.from_pandas(pdf) - self.assert_eq(ks.get_dummies(kdf, prefix=['foo', 'bar']), - pd.get_dummies(pdf, prefix=['foo', 'bar']), almost=True) + self.assert_eq( + ks.get_dummies(kdf, prefix=["foo", "bar"]), + pd.get_dummies(pdf, prefix=["foo", "bar"]), + almost=True, + ) - self.assert_eq(ks.get_dummies(kdf, prefix=['foo'], columns=['B']), - pd.get_dummies(pdf, prefix=['foo'], columns=['B']), almost=True) + self.assert_eq( + ks.get_dummies(kdf, prefix=["foo"], columns=["B"]), + pd.get_dummies(pdf, prefix=["foo"], columns=["B"]), + almost=True, + ) with self.assertRaisesRegex(NotImplementedError, "string types"): - ks.get_dummies(kdf, prefix='foo') + ks.get_dummies(kdf, prefix="foo") with self.assertRaisesRegex(ValueError, "Length of 'prefix' \\(1\\) .* \\(2\\)"): - ks.get_dummies(kdf, prefix=['foo']) + ks.get_dummies(kdf, prefix=["foo"]) with self.assertRaisesRegex(ValueError, "Length of 'prefix' \\(2\\) .* \\(1\\)"): - ks.get_dummies(kdf, prefix=['foo', 'bar'], columns=['B']) + ks.get_dummies(kdf, prefix=["foo", "bar"], columns=["B"]) - pser = pd.Series([1, 1, 1, 2, 2, 1, 3, 4], name='A') + pser = pd.Series([1, 1, 1, 2, 2, 1, 3, 4], name="A") kser = ks.from_pandas(pser) - self.assert_eq(ks.get_dummies(kser, prefix='foo'), - pd.get_dummies(pser, prefix='foo'), almost=True) + self.assert_eq( + ks.get_dummies(kser, prefix="foo"), pd.get_dummies(pser, prefix="foo"), almost=True + ) # columns are ignored. - self.assert_eq(ks.get_dummies(kser, prefix=['foo'], columns=['B']), - pd.get_dummies(pser, prefix=['foo'], columns=['B']), almost=True) + self.assert_eq( + ks.get_dummies(kser, prefix=["foo"], columns=["B"]), + pd.get_dummies(pser, prefix=["foo"], columns=["B"]), + almost=True, + ) def test_get_dummies_dtype(self): - pdf = pd.DataFrame({ - # "A": pd.Categorical(['a', 'b', 'a'], categories=['a', 'b', 'c']), - "A": ['a', 'b', 'a'], - "B": [0, 0, 1], - }) + pdf = pd.DataFrame( + { + # "A": pd.Categorical(['a', 'b', 'a'], categories=['a', 'b', 'c']), + "A": ["a", "b", "a"], + "B": [0, 0, 1], + } + ) kdf = ks.from_pandas(pdf) if LooseVersion("0.23.0") <= LooseVersion(pd.__version__): - exp = pd.get_dummies(pdf, dtype='float64') + exp = pd.get_dummies(pdf, dtype="float64") else: exp = pd.get_dummies(pdf) - exp = exp.astype({'A_a': 'float64', 'A_b': 'float64'}) - res = ks.get_dummies(kdf, dtype='float64') + exp = exp.astype({"A_a": "float64", "A_b": "float64"}) + res = ks.get_dummies(kdf, dtype="float64") self.assert_eq(res, exp, almost=True) def test_get_dummies_multiindex_columns(self): - pdf = pd.DataFrame({('x', 'a', '1'): [1, 2, 3, 4, 4, 3, 2, 1], - ('x', 'b', '2'): list('abcdabcd'), - ('y', 'c', '3'): list('abcdabcd')}) + pdf = pd.DataFrame( + { + ("x", "a", "1"): [1, 2, 3, 4, 4, 3, 2, 1], + ("x", "b", "2"): list("abcdabcd"), + ("y", "c", "3"): list("abcdabcd"), + } + ) kdf = ks.from_pandas(pdf) self.assert_eq(ks.get_dummies(kdf), pd.get_dummies(pdf), almost=True) - self.assert_eq(ks.get_dummies(kdf, columns=[('y', 'c', '3'), ('x', 'a', '1')]), - pd.get_dummies(pdf, columns=[('y', 'c', '3'), ('x', 'a', '1')]), almost=True) - self.assert_eq(ks.get_dummies(kdf, columns=['x']), - pd.get_dummies(pdf, columns=['x']), almost=True) - self.assert_eq(ks.get_dummies(kdf, columns=('x', 'a')), - pd.get_dummies(pdf, columns=('x', 'a')), almost=True) - self.assert_eq(ks.get_dummies(kdf, columns=['x']), - pd.get_dummies(pdf, columns=['x']), almost=True) - - self.assertRaises(KeyError, lambda: ks.get_dummies(kdf, 
columns=['z'])) - self.assertRaises(KeyError, lambda: ks.get_dummies(kdf, columns=('x', 'c'))) - self.assertRaises(ValueError, lambda: ks.get_dummies(kdf, columns=[('x',), 'c'])) - self.assertRaises(TypeError, lambda: ks.get_dummies(kdf, columns='x')) + self.assert_eq( + ks.get_dummies(kdf, columns=[("y", "c", "3"), ("x", "a", "1")]), + pd.get_dummies(pdf, columns=[("y", "c", "3"), ("x", "a", "1")]), + almost=True, + ) + self.assert_eq( + ks.get_dummies(kdf, columns=["x"]), pd.get_dummies(pdf, columns=["x"]), almost=True + ) + self.assert_eq( + ks.get_dummies(kdf, columns=("x", "a")), + pd.get_dummies(pdf, columns=("x", "a")), + almost=True, + ) + self.assert_eq( + ks.get_dummies(kdf, columns=["x"]), pd.get_dummies(pdf, columns=["x"]), almost=True + ) + + self.assertRaises(KeyError, lambda: ks.get_dummies(kdf, columns=["z"])) + self.assertRaises(KeyError, lambda: ks.get_dummies(kdf, columns=("x", "c"))) + self.assertRaises(ValueError, lambda: ks.get_dummies(kdf, columns=[("x",), "c"])) + self.assertRaises(TypeError, lambda: ks.get_dummies(kdf, columns="x")) diff --git a/databricks/koalas/tests/test_rolling.py b/databricks/koalas/tests/test_rolling.py index 2d17303..18e35d4 100644 --- a/databricks/koalas/tests/test_rolling.py +++ b/databricks/koalas/tests/test_rolling.py @@ -23,7 +23,6 @@ class RollingTest(ReusedSQLTestCase, TestUtils): - def test_rolling_error(self): with self.assertRaisesRegex(ValueError, "window must be >= 0"): ks.range(10).rolling(window=-1) @@ -31,8 +30,8 @@ def test_rolling_error(self): ks.range(10).rolling(window=1, min_periods=-1) with self.assertRaisesRegex( - TypeError, - "kdf_or_kser must be a series or dataframe; however, got:.*int"): + TypeError, "kdf_or_kser must be a series or dataframe; however, got:.*int" + ): Rolling(1, 2) def _test_rolling_func(self, f): @@ -40,27 +39,24 @@ def _test_rolling_func(self, f): pser = kser.to_pandas() self.assert_eq(repr(getattr(kser.rolling(2), f)()), repr(getattr(pser.rolling(2), f)())) - kdf = ks.DataFrame({'a': [1, 2, 3, 2], 'b': [4.0, 2.0, 3.0, 1.0]}, - index=np.random.rand(4)) + kdf = ks.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]}, index=np.random.rand(4)) pdf = kdf.to_pandas() self.assert_eq(repr(getattr(kdf.rolling(2), f)()), repr(getattr(pdf.rolling(2), f)())) # Multiindex kser = ks.Series( - [1, 2, 3], - index=pd.MultiIndex.from_tuples([('a', 'x'), ('a', 'y'), ('b', 'z')])) + [1, 2, 3], index=pd.MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")]) + ) pser = kser.to_pandas() self.assert_eq(repr(getattr(kser.rolling(2), f)()), repr(getattr(pser.rolling(2), f)())) - kdf = ks.DataFrame({'a': [1, 2, 3, 2], 'b': [4.0, 2.0, 3.0, 1.0]}, - index=np.random.rand(4)) + kdf = ks.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]}, index=np.random.rand(4)) pdf = kdf.to_pandas() self.assert_eq(repr(getattr(kdf.rolling(2), f)()), repr(getattr(pdf.rolling(2), f)())) # Multiindex column - kdf = ks.DataFrame({'a': [1, 2, 3, 2], 'b': [4.0, 2.0, 3.0, 1.0]}, - index=np.random.rand(4)) - kdf.columns = pd.MultiIndex.from_tuples([('a', 'x'), ('a', 'y')]) + kdf = ks.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]}, index=np.random.rand(4)) + kdf.columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")]) pdf = kdf.to_pandas() self.assert_eq(repr(getattr(kdf.rolling(2), f)()), repr(getattr(pdf.rolling(2), f)())) @@ -90,34 +86,39 @@ def _test_groupby_rolling_func(self, f): pser = kser.to_pandas() self.assert_eq( repr(getattr(kser.groupby(kser).rolling(2), f)().sort_index()), - 
repr(getattr(pser.groupby(pser).rolling(2), f)().sort_index())) + repr(getattr(pser.groupby(pser).rolling(2), f)().sort_index()), + ) # Multiindex kser = ks.Series( - [1, 2, 3], - index=pd.MultiIndex.from_tuples([('a', 'x'), ('a', 'y'), ('b', 'z')])) + [1, 2, 3], index=pd.MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")]) + ) pser = kser.to_pandas() self.assert_eq( repr(getattr(kser.groupby(kser).rolling(2), f)().sort_index()), - repr(getattr(pser.groupby(pser).rolling(2), f)())) + repr(getattr(pser.groupby(pser).rolling(2), f)()), + ) - kdf = ks.DataFrame({'a': [1, 2, 3, 2], 'b': [4.0, 2.0, 3.0, 1.0]}) + kdf = ks.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]}) pdf = kdf.to_pandas() self.assert_eq( repr(getattr(kdf.groupby(kdf.a).rolling(2), f)().sort_index()), - repr(getattr(pdf.groupby(pdf.a).rolling(2), f)().sort_index())) + repr(getattr(pdf.groupby(pdf.a).rolling(2), f)().sort_index()), + ) # Multiindex column - kdf = ks.DataFrame({'a': [1, 2, 3, 2], 'b': [4.0, 2.0, 3.0, 1.0]}) - kdf.columns = pd.MultiIndex.from_tuples([('a', 'x'), ('a', 'y')]) + kdf = ks.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]}) + kdf.columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")]) pdf = kdf.to_pandas() self.assert_eq( repr(getattr(kdf.groupby(("a", "x")).rolling(2), f)().sort_index()), - repr(getattr(pdf.groupby(("a", "x")).rolling(2), f)().sort_index())) + repr(getattr(pdf.groupby(("a", "x")).rolling(2), f)().sort_index()), + ) self.assert_eq( repr(getattr(kdf.groupby([("a", "x"), ("a", "y")]).rolling(2), f)().sort_index()), - repr(getattr(pdf.groupby([("a", "x"), ("a", "y")]).rolling(2), f)().sort_index())) + repr(getattr(pdf.groupby([("a", "x"), ("a", "y")]).rolling(2), f)().sort_index()), + ) def test_groupby_rolling_count(self): self._test_groupby_rolling_func("count") diff --git a/databricks/koalas/tests/test_series.py b/databricks/koalas/tests/test_series.py index e8da43a..29ae28c 100644 --- a/databricks/koalas/tests/test_series.py +++ b/databricks/koalas/tests/test_series.py @@ -22,7 +22,8 @@ from datetime import datetime, timedelta import matplotlib -matplotlib.use('agg') + +matplotlib.use("agg") from matplotlib import pyplot as plt import pyspark import numpy as np @@ -37,10 +38,9 @@ class SeriesTest(ReusedSQLTestCase, SQLTestUtils): - @property def pser(self): - return pd.Series([1, 2, 3, 4, 5, 6, 7], name='x') + return pd.Series([1, 2, 3, 4, 5, 6, 7], name="x") @property def kser(self): @@ -55,46 +55,46 @@ def test_series(self): def test_series_tuple_name(self): pser = self.pser - pser.name = ('x', 'a') + pser.name = ("x", "a") kser = ks.from_pandas(pser) self.assert_eq(kser, pser) self.assert_eq(kser.name, pser.name) - pser.name = ('y', 'z') - kser.name = ('y', 'z') + pser.name = ("y", "z") + kser.name = ("y", "z") self.assert_eq(kser, pser) self.assert_eq(kser.name, pser.name) def test_repr_cache_invalidation(self): # If there is any cache, inplace operations should invalidate it. 
- s = ks.range(10)['id'] + s = ks.range(10)["id"] s.__repr__() - s.rename('a', inplace=True) + s.rename("a", inplace=True) self.assertEqual(s.__repr__(), s.rename("a").__repr__()) def test_empty_series(self): - a = pd.Series([], dtype='i1') - b = pd.Series([], dtype='str') + a = pd.Series([], dtype="i1") + b = pd.Series([], dtype="str") self.assert_eq(ks.from_pandas(a), a) self.assertRaises(ValueError, lambda: ks.from_pandas(b)) - with self.sql_conf({'spark.sql.execution.arrow.enabled': False}): + with self.sql_conf({"spark.sql.execution.arrow.enabled": False}): self.assert_eq(ks.from_pandas(a), a) self.assertRaises(ValueError, lambda: ks.from_pandas(b)) def test_all_null_series(self): - a = pd.Series([None, None, None], dtype='float64') - b = pd.Series([None, None, None], dtype='str') + a = pd.Series([None, None, None], dtype="float64") + b = pd.Series([None, None, None], dtype="str") self.assert_eq(ks.from_pandas(a).dtype, a.dtype) self.assertTrue(ks.from_pandas(a).toPandas().isnull().all()) self.assertRaises(ValueError, lambda: ks.from_pandas(b)) - with self.sql_conf({'spark.sql.execution.arrow.enabled': False}): + with self.sql_conf({"spark.sql.execution.arrow.enabled": False}): self.assert_eq(ks.from_pandas(a).dtype, a.dtype) self.assertTrue(ks.from_pandas(a).toPandas().isnull().all()) self.assertRaises(ValueError, lambda: ks.from_pandas(b)) @@ -108,12 +108,12 @@ def test_head_tail(self): # TODO: self.assert_eq(kser.tail(3), pser.tail(3)) def test_rename(self): - pser = pd.Series([1, 2, 3, 4, 5, 6, 7], name='x') + pser = pd.Series([1, 2, 3, 4, 5, 6, 7], name="x") kser = ks.from_pandas(pser) - pser.name = 'renamed' - kser.name = 'renamed' - self.assertEqual(kser.name, 'renamed') + pser.name = "renamed" + kser.name = "renamed" + self.assertEqual(kser.name, "renamed") self.assert_eq(kser, pser) pser.name = None @@ -123,23 +123,23 @@ def test_rename(self): pidx = pser.index kidx = kser.index - pidx.name = 'renamed' - kidx.name = 'renamed' - self.assertEqual(kidx.name, 'renamed') + pidx.name = "renamed" + kidx.name = "renamed" + self.assertEqual(kidx.name, "renamed") self.assert_eq(kidx, pidx) def test_rename_method(self): # Series name - pser = pd.Series([1, 2, 3, 4, 5, 6, 7], name='x') + pser = pd.Series([1, 2, 3, 4, 5, 6, 7], name="x") kser = ks.from_pandas(pser) - self.assert_eq(kser.rename('y'), pser.rename('y')) - self.assertEqual(kser.name, 'x') # no mutation + self.assert_eq(kser.rename("y"), pser.rename("y")) + self.assertEqual(kser.name, "x") # no mutation self.assert_eq(kser.rename(), pser.rename()) - kser.rename('z', inplace=True) - pser.rename('z', inplace=True) - self.assertEqual(kser.name, 'z') + kser.rename("z", inplace=True) + pser.rename("z", inplace=True) + self.assertEqual(kser.name, "z") self.assert_eq(kser, pser) # Series index @@ -163,51 +163,55 @@ def test_rename_method(self): def test_values_property(self): kser = self.kser - msg = ("Koalas does not support the 'values' property. If you want to collect your data " + - "as an NumPy array, use 'to_numpy()' instead.") + msg = ( + "Koalas does not support the 'values' property. If you want to collect your data " + + "as an NumPy array, use 'to_numpy()' instead." 
+ ) with self.assertRaises(NotImplementedError, msg=msg): kser.values def test_or(self): - pdf = pd.DataFrame({ - 'left': [True, False, True, False, np.nan, np.nan, True, False, np.nan], - 'right': [True, False, False, True, True, False, np.nan, np.nan, np.nan] - }) + pdf = pd.DataFrame( + { + "left": [True, False, True, False, np.nan, np.nan, True, False, np.nan], + "right": [True, False, False, True, True, False, np.nan, np.nan, np.nan], + } + ) kdf = ks.from_pandas(pdf) - self.assert_eq(pdf['left'] | pdf['right'], - kdf['left'] | kdf['right']) + self.assert_eq(pdf["left"] | pdf["right"], kdf["left"] | kdf["right"]) def test_and(self): - pdf = pd.DataFrame({ - 'left': [True, False, True, False, np.nan, np.nan, True, False, np.nan], - 'right': [True, False, False, True, True, False, np.nan, np.nan, np.nan] - }) + pdf = pd.DataFrame( + { + "left": [True, False, True, False, np.nan, np.nan, True, False, np.nan], + "right": [True, False, False, True, True, False, np.nan, np.nan, np.nan], + } + ) kdf = ks.from_pandas(pdf) - self.assert_eq(pdf['left'] & pdf['right'], - kdf['left'] & kdf['right']) + self.assert_eq(pdf["left"] & pdf["right"], kdf["left"] & kdf["right"]) def test_to_numpy(self): - pser = pd.Series([1, 2, 3, 4, 5, 6, 7], name='x') + pser = pd.Series([1, 2, 3, 4, 5, 6, 7], name="x") kser = ks.from_pandas(pser) np.testing.assert_equal(kser.to_numpy(), pser.values) def test_isin(self): - pser = pd.Series(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo'], name='animal') + pser = pd.Series(["lama", "cow", "lama", "beetle", "lama", "hippo"], name="animal") kser = ks.from_pandas(pser) - self.assert_eq(kser.isin(['cow', 'lama']), pser.isin(['cow', 'lama'])) - self.assert_eq(kser.isin({'cow'}), pser.isin({'cow'})) + self.assert_eq(kser.isin(["cow", "lama"]), pser.isin(["cow", "lama"])) + self.assert_eq(kser.isin({"cow"}), pser.isin({"cow"})) msg = "only list-like objects are allowed to be passed to isin()" with self.assertRaisesRegex(TypeError, msg): kser.isin(1) def test_fillna(self): - pser = pd.Series([np.nan, 2, 3, 4, np.nan, 6], name='x') + pser = pd.Series([np.nan, 2, 3, 4, np.nan, 6], name="x") kser = ks.from_pandas(pser) self.assert_eq(kser.fillna(0), pser.fillna(0)) @@ -217,7 +221,7 @@ def test_fillna(self): self.assert_eq(kser, pser) def test_dropna(self): - pser = pd.Series([np.nan, 2, 3, 4, np.nan, 6], name='x') + pser = pd.Series([np.nan, 2, 3, 4, np.nan, 6], name="x") kser = ks.from_pandas(pser) @@ -254,147 +258,270 @@ def _test_value_counts(self): self.assertEqual(res.name, exp.name) self.assert_eq(res, exp, almost=True) - self.assert_eq(kser.value_counts(normalize=True), - pser.value_counts(normalize=True), almost=True) - self.assert_eq(kser.value_counts(ascending=True), - pser.value_counts(ascending=True), almost=True) - self.assert_eq(kser.value_counts(normalize=True, dropna=False), - pser.value_counts(normalize=True, dropna=False), almost=True) - self.assert_eq(kser.value_counts(ascending=True, dropna=False), - pser.value_counts(ascending=True, dropna=False), almost=True) - - self.assert_eq(kser.index.value_counts(normalize=True), - pser.index.value_counts(normalize=True), almost=True) - self.assert_eq(kser.index.value_counts(ascending=True), - pser.index.value_counts(ascending=True), almost=True) - self.assert_eq(kser.index.value_counts(normalize=True, dropna=False), - pser.index.value_counts(normalize=True, dropna=False), almost=True) - self.assert_eq(kser.index.value_counts(ascending=True, dropna=False), - pser.index.value_counts(ascending=True, dropna=False), 
almost=True) - - with self.assertRaisesRegex(NotImplementedError, - "value_counts currently does not support bins"): + self.assert_eq( + kser.value_counts(normalize=True), pser.value_counts(normalize=True), almost=True + ) + self.assert_eq( + kser.value_counts(ascending=True), pser.value_counts(ascending=True), almost=True + ) + self.assert_eq( + kser.value_counts(normalize=True, dropna=False), + pser.value_counts(normalize=True, dropna=False), + almost=True, + ) + self.assert_eq( + kser.value_counts(ascending=True, dropna=False), + pser.value_counts(ascending=True, dropna=False), + almost=True, + ) + + self.assert_eq( + kser.index.value_counts(normalize=True), + pser.index.value_counts(normalize=True), + almost=True, + ) + self.assert_eq( + kser.index.value_counts(ascending=True), + pser.index.value_counts(ascending=True), + almost=True, + ) + self.assert_eq( + kser.index.value_counts(normalize=True, dropna=False), + pser.index.value_counts(normalize=True, dropna=False), + almost=True, + ) + self.assert_eq( + kser.index.value_counts(ascending=True, dropna=False), + pser.index.value_counts(ascending=True, dropna=False), + almost=True, + ) + + with self.assertRaisesRegex( + NotImplementedError, "value_counts currently does not support bins" + ): kser.value_counts(bins=3) - pser.name = 'index' - kser.name = 'index' + pser.name = "index" + kser.name = "index" self.assert_eq(kser.value_counts(), pser.value_counts(), almost=True) # Series from DataFrame - pdf = pd.DataFrame({'a': [1, 2, 3], 'b': [None, 1, None]}) + pdf = pd.DataFrame({"a": [1, 2, 3], "b": [None, 1, None]}) kdf = ks.from_pandas(pdf) - self.assert_eq(kdf.a.value_counts(normalize=True), - pdf.a.value_counts(normalize=True), almost=True) - self.assert_eq(kdf.a.value_counts(ascending=True), - pdf.a.value_counts(ascending=True), almost=True) - self.assert_eq(kdf.a.value_counts(normalize=True, dropna=False), - pdf.a.value_counts(normalize=True, dropna=False), almost=True) - self.assert_eq(kdf.a.value_counts(ascending=True, dropna=False), - pdf.a.value_counts(ascending=True, dropna=False), almost=True) - - self.assert_eq(kser.index.value_counts(normalize=True), - pser.index.value_counts(normalize=True), almost=True) - self.assert_eq(kser.index.value_counts(ascending=True), - pser.index.value_counts(ascending=True), almost=True) - self.assert_eq(kser.index.value_counts(normalize=True, dropna=False), - pser.index.value_counts(normalize=True, dropna=False), almost=True) - self.assert_eq(kser.index.value_counts(ascending=True, dropna=False), - pser.index.value_counts(ascending=True, dropna=False), almost=True) + self.assert_eq( + kdf.a.value_counts(normalize=True), pdf.a.value_counts(normalize=True), almost=True + ) + self.assert_eq( + kdf.a.value_counts(ascending=True), pdf.a.value_counts(ascending=True), almost=True + ) + self.assert_eq( + kdf.a.value_counts(normalize=True, dropna=False), + pdf.a.value_counts(normalize=True, dropna=False), + almost=True, + ) + self.assert_eq( + kdf.a.value_counts(ascending=True, dropna=False), + pdf.a.value_counts(ascending=True, dropna=False), + almost=True, + ) + + self.assert_eq( + kser.index.value_counts(normalize=True), + pser.index.value_counts(normalize=True), + almost=True, + ) + self.assert_eq( + kser.index.value_counts(ascending=True), + pser.index.value_counts(ascending=True), + almost=True, + ) + self.assert_eq( + kser.index.value_counts(normalize=True, dropna=False), + pser.index.value_counts(normalize=True, dropna=False), + almost=True, + ) + self.assert_eq( + 
kser.index.value_counts(ascending=True, dropna=False), + pser.index.value_counts(ascending=True, dropna=False), + almost=True, + ) # Series with NaN index pser = pd.Series([1, 2, 3], index=[2, None, 5]) kser = ks.from_pandas(pser) - self.assert_eq(kser.value_counts(normalize=True), - pser.value_counts(normalize=True), almost=True) - self.assert_eq(kser.value_counts(ascending=True), - pser.value_counts(ascending=True), almost=True) - self.assert_eq(kser.value_counts(normalize=True, dropna=False), - pser.value_counts(normalize=True, dropna=False), almost=True) - self.assert_eq(kser.value_counts(ascending=True, dropna=False), - pser.value_counts(ascending=True, dropna=False), almost=True) - - self.assert_eq(kser.index.value_counts(normalize=True), - pser.index.value_counts(normalize=True), almost=True) - self.assert_eq(kser.index.value_counts(ascending=True), - pser.index.value_counts(ascending=True), almost=True) - self.assert_eq(kser.index.value_counts(normalize=True, dropna=False), - pser.index.value_counts(normalize=True, dropna=False), almost=True) - self.assert_eq(kser.index.value_counts(ascending=True, dropna=False), - pser.index.value_counts(ascending=True, dropna=False), almost=True) + self.assert_eq( + kser.value_counts(normalize=True), pser.value_counts(normalize=True), almost=True + ) + self.assert_eq( + kser.value_counts(ascending=True), pser.value_counts(ascending=True), almost=True + ) + self.assert_eq( + kser.value_counts(normalize=True, dropna=False), + pser.value_counts(normalize=True, dropna=False), + almost=True, + ) + self.assert_eq( + kser.value_counts(ascending=True, dropna=False), + pser.value_counts(ascending=True, dropna=False), + almost=True, + ) + + self.assert_eq( + kser.index.value_counts(normalize=True), + pser.index.value_counts(normalize=True), + almost=True, + ) + self.assert_eq( + kser.index.value_counts(ascending=True), + pser.index.value_counts(ascending=True), + almost=True, + ) + self.assert_eq( + kser.index.value_counts(normalize=True, dropna=False), + pser.index.value_counts(normalize=True, dropna=False), + almost=True, + ) + self.assert_eq( + kser.index.value_counts(ascending=True, dropna=False), + pser.index.value_counts(ascending=True, dropna=False), + almost=True, + ) # Series with MultiIndex - pser.index = pd.MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'c')]) + pser.index = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")]) kser = ks.from_pandas(pser) - self.assert_eq(kser.value_counts(normalize=True), - pser.value_counts(normalize=True), almost=True) - self.assert_eq(kser.value_counts(ascending=True), - pser.value_counts(ascending=True), almost=True) - self.assert_eq(kser.value_counts(normalize=True, dropna=False), - pser.value_counts(normalize=True, dropna=False), almost=True) - self.assert_eq(kser.value_counts(ascending=True, dropna=False), - pser.value_counts(ascending=True, dropna=False), almost=True) - - self.assert_eq(kser.index.value_counts(normalize=True), - pser.index.value_counts(normalize=True), almost=True) - self.assert_eq(kser.index.value_counts(ascending=True), - pser.index.value_counts(ascending=True), almost=True) - self.assert_eq(kser.index.value_counts(normalize=True, dropna=False), - pser.index.value_counts(normalize=True, dropna=False), almost=True) - self.assert_eq(kser.index.value_counts(ascending=True, dropna=False), - pser.index.value_counts(ascending=True, dropna=False), almost=True) + self.assert_eq( + kser.value_counts(normalize=True), pser.value_counts(normalize=True), almost=True + ) + 
self.assert_eq( + kser.value_counts(ascending=True), pser.value_counts(ascending=True), almost=True + ) + self.assert_eq( + kser.value_counts(normalize=True, dropna=False), + pser.value_counts(normalize=True, dropna=False), + almost=True, + ) + self.assert_eq( + kser.value_counts(ascending=True, dropna=False), + pser.value_counts(ascending=True, dropna=False), + almost=True, + ) + + self.assert_eq( + kser.index.value_counts(normalize=True), + pser.index.value_counts(normalize=True), + almost=True, + ) + self.assert_eq( + kser.index.value_counts(ascending=True), + pser.index.value_counts(ascending=True), + almost=True, + ) + self.assert_eq( + kser.index.value_counts(normalize=True, dropna=False), + pser.index.value_counts(normalize=True, dropna=False), + almost=True, + ) + self.assert_eq( + kser.index.value_counts(ascending=True, dropna=False), + pser.index.value_counts(ascending=True, dropna=False), + almost=True, + ) # Series with MultiIndex some of index has NaN - pser.index = pd.MultiIndex.from_tuples([('x', 'a'), ('x', None), ('y', 'c')]) + pser.index = pd.MultiIndex.from_tuples([("x", "a"), ("x", None), ("y", "c")]) kser = ks.from_pandas(pser) - self.assert_eq(kser.value_counts(normalize=True), - pser.value_counts(normalize=True), almost=True) - self.assert_eq(kser.value_counts(ascending=True), - pser.value_counts(ascending=True), almost=True) - self.assert_eq(kser.value_counts(normalize=True, dropna=False), - pser.value_counts(normalize=True, dropna=False), almost=True) - self.assert_eq(kser.value_counts(ascending=True, dropna=False), - pser.value_counts(ascending=True, dropna=False), almost=True) - - self.assert_eq(kser.index.value_counts(normalize=True), - pser.index.value_counts(normalize=True), almost=True) - self.assert_eq(kser.index.value_counts(ascending=True), - pser.index.value_counts(ascending=True), almost=True) - self.assert_eq(kser.index.value_counts(normalize=True, dropna=False), - pser.index.value_counts(normalize=True, dropna=False), almost=True) - self.assert_eq(kser.index.value_counts(ascending=True, dropna=False), - pser.index.value_counts(ascending=True, dropna=False), almost=True) + self.assert_eq( + kser.value_counts(normalize=True), pser.value_counts(normalize=True), almost=True + ) + self.assert_eq( + kser.value_counts(ascending=True), pser.value_counts(ascending=True), almost=True + ) + self.assert_eq( + kser.value_counts(normalize=True, dropna=False), + pser.value_counts(normalize=True, dropna=False), + almost=True, + ) + self.assert_eq( + kser.value_counts(ascending=True, dropna=False), + pser.value_counts(ascending=True, dropna=False), + almost=True, + ) + + self.assert_eq( + kser.index.value_counts(normalize=True), + pser.index.value_counts(normalize=True), + almost=True, + ) + self.assert_eq( + kser.index.value_counts(ascending=True), + pser.index.value_counts(ascending=True), + almost=True, + ) + self.assert_eq( + kser.index.value_counts(normalize=True, dropna=False), + pser.index.value_counts(normalize=True, dropna=False), + almost=True, + ) + self.assert_eq( + kser.index.value_counts(ascending=True, dropna=False), + pser.index.value_counts(ascending=True, dropna=False), + almost=True, + ) # Series with MultiIndex some of index is NaN. # This test only available for pandas >= 0.24. 
if LooseVersion(pd.__version__) >= LooseVersion("0.24"): - pser.index = pd.MultiIndex.from_tuples([('x', 'a'), None, ('y', 'c')]) + pser.index = pd.MultiIndex.from_tuples([("x", "a"), None, ("y", "c")]) kser = ks.from_pandas(pser) - self.assert_eq(kser.value_counts(normalize=True), - pser.value_counts(normalize=True), almost=True) - self.assert_eq(kser.value_counts(ascending=True), - pser.value_counts(ascending=True), almost=True) - self.assert_eq(kser.value_counts(normalize=True, dropna=False), - pser.value_counts(normalize=True, dropna=False), almost=True) - self.assert_eq(kser.value_counts(ascending=True, dropna=False), - pser.value_counts(ascending=True, dropna=False), almost=True) - - self.assert_eq(kser.index.value_counts(normalize=True), - pser.index.value_counts(normalize=True), almost=True) - self.assert_eq(kser.index.value_counts(ascending=True), - pser.index.value_counts(ascending=True), almost=True) - self.assert_eq(kser.index.value_counts(normalize=True, dropna=False), - pser.index.value_counts(normalize=True, dropna=False), almost=True) - self.assert_eq(kser.index.value_counts(ascending=True, dropna=False), - pser.index.value_counts(ascending=True, dropna=False), almost=True) + self.assert_eq( + kser.value_counts(normalize=True), pser.value_counts(normalize=True), almost=True + ) + self.assert_eq( + kser.value_counts(ascending=True), pser.value_counts(ascending=True), almost=True + ) + self.assert_eq( + kser.value_counts(normalize=True, dropna=False), + pser.value_counts(normalize=True, dropna=False), + almost=True, + ) + self.assert_eq( + kser.value_counts(ascending=True, dropna=False), + pser.value_counts(ascending=True, dropna=False), + almost=True, + ) + + self.assert_eq( + kser.index.value_counts(normalize=True), + pser.index.value_counts(normalize=True), + almost=True, + ) + self.assert_eq( + kser.index.value_counts(ascending=True), + pser.index.value_counts(ascending=True), + almost=True, + ) + self.assert_eq( + kser.index.value_counts(normalize=True, dropna=False), + pser.index.value_counts(normalize=True, dropna=False), + almost=True, + ) + self.assert_eq( + kser.index.value_counts(ascending=True, dropna=False), + pser.index.value_counts(ascending=True, dropna=False), + almost=True, + ) def test_value_counts(self): - if LooseVersion(pyspark.__version__) < LooseVersion("2.4") and \ - default_session().conf.get("spark.sql.execution.arrow.enabled") == "true": + if ( + LooseVersion(pyspark.__version__) < LooseVersion("2.4") + and default_session().conf.get("spark.sql.execution.arrow.enabled") == "true" + ): default_session().conf.set("spark.sql.execution.arrow.enabled", "false") try: self._test_value_counts() @@ -402,28 +529,29 @@ def test_value_counts(self): default_session().conf.set("spark.sql.execution.arrow.enabled", "true") self.assertRaises( RuntimeError, - lambda: ks.MultiIndex.from_tuples([('x', 'a'), ('x', 'b')]).value_counts()) + lambda: ks.MultiIndex.from_tuples([("x", "a"), ("x", "b")]).value_counts(), + ) else: self._test_value_counts() def test_nsmallest(self): sample_lst = [1, 2, 3, 4, np.nan, 6] - pser = pd.Series(sample_lst, name='x') - kser = ks.Series(sample_lst, name='x') + pser = pd.Series(sample_lst, name="x") + kser = ks.Series(sample_lst, name="x") self.assert_eq(kser.nsmallest(n=3), pser.nsmallest(n=3)) self.assert_eq(kser.nsmallest(), pser.nsmallest()) self.assert_eq((kser + 1).nsmallest(), (pser + 1).nsmallest()) def test_nlargest(self): sample_lst = [1, 2, 3, 4, np.nan, 6] - pser = pd.Series(sample_lst, name='x') - kser = 
ks.Series(sample_lst, name='x') + pser = pd.Series(sample_lst, name="x") + kser = ks.Series(sample_lst, name="x") self.assert_eq(kser.nlargest(n=3), pser.nlargest(n=3)) self.assert_eq(kser.nlargest(), pser.nlargest()) self.assert_eq((kser + 1).nlargest(), (pser + 1).nlargest()) def test_isnull(self): - pser = pd.Series([1, 2, 3, 4, np.nan, 6], name='x') + pser = pd.Series([1, 2, 3, 4, np.nan, 6], name="x") kser = ks.from_pandas(pser) self.assert_eq(kser.notnull(), pser.notnull()) @@ -436,99 +564,109 @@ def test_isnull(self): self.assert_eq(kser.isnull(), pser.isnull()) def test_all(self): - for pser in [pd.Series([True, True], name='x'), - pd.Series([True, False], name='x'), - pd.Series([0, 1], name='x'), - pd.Series([1, 2, 3], name='x'), - pd.Series([True, True, None], name='x'), - pd.Series([True, False, None], name='x'), - pd.Series([], name='x'), - pd.Series([np.nan], name='x')]: + for pser in [ + pd.Series([True, True], name="x"), + pd.Series([True, False], name="x"), + pd.Series([0, 1], name="x"), + pd.Series([1, 2, 3], name="x"), + pd.Series([True, True, None], name="x"), + pd.Series([True, False, None], name="x"), + pd.Series([], name="x"), + pd.Series([np.nan], name="x"), + ]: kser = ks.from_pandas(pser) self.assert_eq(kser.all(), pser.all()) - pser = pd.Series([1, 2, 3, 4], name='x') + pser = pd.Series([1, 2, 3, 4], name="x") kser = ks.from_pandas(pser) self.assert_eq((kser % 2 == 0).all(), (pser % 2 == 0).all()) with self.assertRaisesRegex( - NotImplementedError, 'axis should be either 0 or "index" currently.'): + NotImplementedError, 'axis should be either 0 or "index" currently.' + ): kser.all(axis=1) def test_any(self): - for pser in [pd.Series([False, False], name='x'), - pd.Series([True, False], name='x'), - pd.Series([0, 1], name='x'), - pd.Series([1, 2, 3], name='x'), - pd.Series([True, True, None], name='x'), - pd.Series([True, False, None], name='x'), - pd.Series([], name='x'), - pd.Series([np.nan], name='x')]: + for pser in [ + pd.Series([False, False], name="x"), + pd.Series([True, False], name="x"), + pd.Series([0, 1], name="x"), + pd.Series([1, 2, 3], name="x"), + pd.Series([True, True, None], name="x"), + pd.Series([True, False, None], name="x"), + pd.Series([], name="x"), + pd.Series([np.nan], name="x"), + ]: kser = ks.from_pandas(pser) self.assert_eq(kser.any(), pser.any()) - pser = pd.Series([1, 2, 3, 4], name='x') + pser = pd.Series([1, 2, 3, 4], name="x") kser = ks.from_pandas(pser) self.assert_eq((kser % 2 == 0).any(), (pser % 2 == 0).any()) with self.assertRaisesRegex( - NotImplementedError, 'axis should be either 0 or "index" currently.'): + NotImplementedError, 'axis should be either 0 or "index" currently.' + ): kser.any(axis=1) def test_reset_index_with_default_index_types(self): - pser = pd.Series([1, 2, 3], name='0', index=np.random.rand(3)) + pser = pd.Series([1, 2, 3], name="0", index=np.random.rand(3)) kser = ks.from_pandas(pser) - with ks.option_context('compute.default_index_type', 'sequence'): + with ks.option_context("compute.default_index_type", "sequence"): self.assert_eq(kser.reset_index(), pser.reset_index()) - with ks.option_context('compute.default_index_type', 'distributed-sequence'): + with ks.option_context("compute.default_index_type", "distributed-sequence"): # the order might be changed. 
- self.assert_eq(kser.reset_index().sort_index(), - pser.reset_index()) + self.assert_eq(kser.reset_index().sort_index(), pser.reset_index()) - with ks.option_context('compute.default_index_type', 'distributed'): + with ks.option_context("compute.default_index_type", "distributed"): # the index is different. - self.assert_eq(kser.reset_index().to_pandas().reset_index(drop=True), - pser.reset_index()) + self.assert_eq( + kser.reset_index().to_pandas().reset_index(drop=True), pser.reset_index() + ) def test_sort_values(self): - pser = pd.Series([1, 2, 3, 4, 5, None, 7], name='0') + pser = pd.Series([1, 2, 3, 4, 5, None, 7], name="0") kser = ks.from_pandas(pser) self.assert_eq(repr(kser.sort_values()), repr(pser.sort_values())) - self.assert_eq(repr(kser.sort_values(ascending=False)), - repr(pser.sort_values(ascending=False))) - self.assert_eq(repr(kser.sort_values(na_position='first')), - repr(pser.sort_values(na_position='first'))) - self.assertRaises(ValueError, lambda: kser.sort_values(na_position='invalid')) + self.assert_eq( + repr(kser.sort_values(ascending=False)), repr(pser.sort_values(ascending=False)) + ) + self.assert_eq( + repr(kser.sort_values(na_position="first")), repr(pser.sort_values(na_position="first")) + ) + self.assertRaises(ValueError, lambda: kser.sort_values(na_position="invalid")) self.assert_eq(kser.sort_values(inplace=True), pser.sort_values(inplace=True)) self.assert_eq(repr(kser), repr(pser)) def test_sort_index(self): - pser = pd.Series([2, 1, np.nan], index=['b', 'a', np.nan], name='0') + pser = pd.Series([2, 1, np.nan], index=["b", "a", np.nan], name="0") kser = ks.from_pandas(pser) # Assert invalid parameters self.assertRaises(NotImplementedError, lambda: kser.sort_index(axis=1)) - self.assertRaises(NotImplementedError, lambda: kser.sort_index(kind='mergesort')) - self.assertRaises(ValueError, lambda: kser.sort_index(na_position='invalid')) + self.assertRaises(NotImplementedError, lambda: kser.sort_index(kind="mergesort")) + self.assertRaises(ValueError, lambda: kser.sort_index(na_position="invalid")) # Assert default behavior without parameters self.assert_eq(kser.sort_index(), pser.sort_index(), almost=True) # Assert sorting descending - self.assert_eq(kser.sort_index(ascending=False), - pser.sort_index(ascending=False), almost=True) + self.assert_eq( + kser.sort_index(ascending=False), pser.sort_index(ascending=False), almost=True + ) # Assert sorting NA indices first - self.assert_eq(kser.sort_index(na_position='first'), - pser.sort_index(na_position='first'), almost=True) + self.assert_eq( + kser.sort_index(na_position="first"), pser.sort_index(na_position="first"), almost=True + ) # Assert sorting inplace self.assertEqual(kser.sort_index(inplace=True), pser.sort_index(inplace=True)) self.assert_eq(kser, pser, almost=True) # Assert multi-indices - pser = pd.Series(range(4), index=[['b', 'b', 'a', 'a'], [1, 0, 1, 0]], name='0') + pser = pd.Series(range(4), index=[["b", "b", "a", "a"], [1, 0, 1, 0]], name="0") kser = ks.from_pandas(pser) self.assert_eq(kser.sort_index(), pser.sort_index(), almost=True) self.assert_eq(kser.sort_index(level=[1, 0]), pser.sort_index(level=[1, 0]), almost=True) @@ -536,45 +674,60 @@ def test_sort_index(self): self.assert_eq(kser.reset_index().sort_index(), pser.reset_index().sort_index()) def test_to_datetime(self): - pser = pd.Series(['3/11/2000', '3/12/2000', '3/13/2000'] * 100) + pser = pd.Series(["3/11/2000", "3/12/2000", "3/13/2000"] * 100) kser = ks.from_pandas(pser) - self.assert_eq(pd.to_datetime(pser, 
infer_datetime_format=True), - ks.to_datetime(kser, infer_datetime_format=True)) + self.assert_eq( + pd.to_datetime(pser, infer_datetime_format=True), + ks.to_datetime(kser, infer_datetime_format=True), + ) def test_missing(self): kser = self.kser missing_functions = inspect.getmembers(_MissingPandasLikeSeries, inspect.isfunction) - unsupported_functions = [name for (name, type_) in missing_functions - if type_.__name__ == 'unsupported_function'] + unsupported_functions = [ + name for (name, type_) in missing_functions if type_.__name__ == "unsupported_function" + ] for name in unsupported_functions: with self.assertRaisesRegex( - PandasNotImplementedError, - "method.*Series.*{}.*not implemented( yet\\.|\\. .+)".format(name)): + PandasNotImplementedError, + "method.*Series.*{}.*not implemented( yet\\.|\\. .+)".format(name), + ): getattr(kser, name)() - deprecated_functions = [name for (name, type_) in missing_functions - if type_.__name__ == 'deprecated_function'] + deprecated_functions = [ + name for (name, type_) in missing_functions if type_.__name__ == "deprecated_function" + ] for name in deprecated_functions: - with self.assertRaisesRegex(PandasNotImplementedError, - "method.*Series.*{}.*is deprecated".format(name)): + with self.assertRaisesRegex( + PandasNotImplementedError, "method.*Series.*{}.*is deprecated".format(name) + ): getattr(kser, name)() - missing_properties = inspect.getmembers(_MissingPandasLikeSeries, - lambda o: isinstance(o, property)) - unsupported_properties = [name for (name, type_) in missing_properties - if type_.fget.__name__ == 'unsupported_property'] + missing_properties = inspect.getmembers( + _MissingPandasLikeSeries, lambda o: isinstance(o, property) + ) + unsupported_properties = [ + name + for (name, type_) in missing_properties + if type_.fget.__name__ == "unsupported_property" + ] for name in unsupported_properties: with self.assertRaisesRegex( - PandasNotImplementedError, - "property.*Series.*{}.*not implemented( yet\\.|\\. .+)".format(name)): + PandasNotImplementedError, + "property.*Series.*{}.*not implemented( yet\\.|\\. 
.+)".format(name), + ): getattr(kser, name) - deprecated_properties = [name for (name, type_) in missing_properties - if type_.fget.__name__ == 'deprecated_property'] + deprecated_properties = [ + name + for (name, type_) in missing_properties + if type_.fget.__name__ == "deprecated_property" + ] for name in deprecated_properties: - with self.assertRaisesRegex(PandasNotImplementedError, - "property.*Series.*{}.*is deprecated".format(name)): + with self.assertRaisesRegex( + PandasNotImplementedError, "property.*Series.*{}.*is deprecated".format(name) + ): getattr(kser, name) def test_clip(self): @@ -598,7 +751,7 @@ def test_clip(self): self.assert_eq(kser.clip(1, 3), pser.clip(1, 3)) # Assert behavior on string values - str_kser = ks.Series(['a', 'b', 'c']) + str_kser = ks.Series(["a", "b", "c"]) self.assert_eq(str_kser.clip(1, 3), str_kser) def test_is_unique(self): @@ -628,17 +781,18 @@ def test_to_list(self): self.assertEqual(self.kser.to_list(), self.pser.to_list()) def test_append(self): - pser1 = pd.Series([1, 2, 3], name='0') - pser2 = pd.Series([4, 5, 6], name='0') - pser3 = pd.Series([4, 5, 6], index=[3, 4, 5], name='0') + pser1 = pd.Series([1, 2, 3], name="0") + pser2 = pd.Series([4, 5, 6], name="0") + pser3 = pd.Series([4, 5, 6], index=[3, 4, 5], name="0") kser1 = ks.from_pandas(pser1) kser2 = ks.from_pandas(pser2) kser3 = ks.from_pandas(pser3) self.assert_eq(kser1.append(kser2), pser1.append(pser2)) self.assert_eq(kser1.append(kser3), pser1.append(pser3)) - self.assert_eq(kser1.append(kser2, ignore_index=True), - pser1.append(pser2, ignore_index=True)) + self.assert_eq( + kser1.append(kser2, ignore_index=True), pser1.append(pser2, ignore_index=True) + ) kser1.append(kser3, verify_integrity=True) msg = "Indices have overlapping values" @@ -646,47 +800,49 @@ def test_append(self): kser1.append(kser2, verify_integrity=True) def test_map(self): - pser = pd.Series(['cat', 'dog', None, 'rabbit']) + pser = pd.Series(["cat", "dog", None, "rabbit"]) kser = ks.from_pandas(pser) # Currently Koalas doesn't return NaN as Pandas does. 
self.assertEqual( - repr(kser.map({})), - repr(pser.map({}).replace({pd.np.nan: None}).rename(0))) + repr(kser.map({})), repr(pser.map({}).replace({pd.np.nan: None}).rename(0)) + ) d = defaultdict(lambda: "abc") self.assertTrue("abc" in repr(kser.map(d))) - self.assertEqual( - repr(kser.map(d)), - repr(pser.map(d).rename(0))) + self.assertEqual(repr(kser.map(d)), repr(pser.map(d).rename(0))) def tomorrow(date) -> datetime: return date + timedelta(days=1) pser = pd.Series([datetime(2019, 10, 24)]) kser = ks.from_pandas(pser) - self.assertEqual( - repr(kser.map(tomorrow)), - repr(pser.map(tomorrow).rename(0))) + self.assertEqual(repr(kser.map(tomorrow)), repr(pser.map(tomorrow).rename(0))) def test_add_prefix(self): - pser = pd.Series([1, 2, 3, 4], name='0') + pser = pd.Series([1, 2, 3, 4], name="0") kser = ks.from_pandas(pser) - self.assert_eq(pser.add_prefix('item_'), kser.add_prefix('item_')) + self.assert_eq(pser.add_prefix("item_"), kser.add_prefix("item_")) - pser = pd.Series([1, 2, 3], name='0', - index=pd.MultiIndex.from_tuples([('A', 'X'), ('A', 'Y'), ('B', 'X')])) + pser = pd.Series( + [1, 2, 3], + name="0", + index=pd.MultiIndex.from_tuples([("A", "X"), ("A", "Y"), ("B", "X")]), + ) kser = ks.from_pandas(pser) - self.assert_eq(pser.add_prefix('item_'), kser.add_prefix('item_')) + self.assert_eq(pser.add_prefix("item_"), kser.add_prefix("item_")) def test_add_suffix(self): - pser = pd.Series([1, 2, 3, 4], name='0') + pser = pd.Series([1, 2, 3, 4], name="0") kser = ks.from_pandas(pser) - self.assert_eq(pser.add_suffix('_item'), kser.add_suffix('_item')) + self.assert_eq(pser.add_suffix("_item"), kser.add_suffix("_item")) - pser = pd.Series([1, 2, 3], name='0', - index=pd.MultiIndex.from_tuples([('A', 'X'), ('A', 'Y'), ('B', 'X')])) + pser = pd.Series( + [1, 2, 3], + name="0", + index=pd.MultiIndex.from_tuples([("A", "X"), ("A", "Y"), ("B", "X")]), + ) kser = ks.from_pandas(pser) - self.assert_eq(pser.add_suffix('_item'), kser.add_suffix('_item')) + self.assert_eq(pser.add_suffix("_item"), kser.add_suffix("_item")) def test_pandas_wraps(self): # This test checks the return column name of `isna()`. 
Previously it returned the column @@ -697,20 +853,18 @@ def f(x) -> ks.Series[int]: return 2 * x df = ks.DataFrame({"x": [1, None]}) - self.assert_eq( - f(df["x"]).isna(), - pd.Series([False, True]).rename("f(x)")) + self.assert_eq(f(df["x"]).isna(), pd.Series([False, True]).rename("f(x)")) def test_hist(self): - pdf = pd.DataFrame({ - 'a': [1, 2, 3, 4, 5, 6, 7, 8, 9, 15, 50], - }, index=[0, 1, 3, 5, 6, 8, 9, 9, 9, 10, 10]) + pdf = pd.DataFrame( + {"a": [1, 2, 3, 4, 5, 6, 7, 8, 9, 15, 50],}, index=[0, 1, 3, 5, 6, 8, 9, 9, 9, 10, 10] + ) kdf = ks.from_pandas(pdf) def plot_to_base64(ax): bytes_data = BytesIO() - ax.figure.savefig(bytes_data, format='png') + ax.figure.savefig(bytes_data, format="png") bytes_data.seek(0) b64_data = base64.b64encode(bytes_data.read()) plt.close(ax.figure) @@ -718,9 +872,9 @@ def plot_to_base64(ax): _, ax1 = plt.subplots(1, 1) # Using plot.hist() because pandas changes ticks props when called hist() - ax1 = pdf['a'].plot.hist() + ax1 = pdf["a"].plot.hist() _, ax2 = plt.subplots(1, 1) - ax2 = kdf['a'].hist() + ax2 = kdf["a"].hist() self.assert_eq(plot_to_base64(ax1), plot_to_base64(ax2)) def test_cummin(self): @@ -777,32 +931,31 @@ def test_cumprod(self): def test_median(self): with self.assertRaisesRegex(ValueError, "accuracy must be an integer; however"): - ks.Series([24., 21., 25., 33., 26.]).median(accuracy="a") + ks.Series([24.0, 21.0, 25.0, 33.0, 26.0]).median(accuracy="a") def test_rank(self): - pser = pd.Series([1, 2, 3, 1], name='x') + pser = pd.Series([1, 2, 3, 1], name="x") kser = ks.from_pandas(pser) - self.assertEqual(repr(pser.rank()), - repr(kser.rank().sort_index())) - self.assertEqual(repr(pser.rank()), - repr(kser.rank().sort_index())) - self.assertEqual(repr(pser.rank(ascending=False)), - repr(kser.rank(ascending=False).sort_index())) - self.assertEqual(repr(pser.rank(method='min')), - repr(kser.rank(method='min').sort_index())) - self.assertEqual(repr(pser.rank(method='max')), - repr(kser.rank(method='max').sort_index())) - self.assertEqual(repr(pser.rank(method='first')), - repr(kser.rank(method='first').sort_index())) - self.assertEqual(repr(pser.rank(method='dense')), - repr(kser.rank(method='dense').sort_index())) + self.assertEqual(repr(pser.rank()), repr(kser.rank().sort_index())) + self.assertEqual(repr(pser.rank()), repr(kser.rank().sort_index())) + self.assertEqual( + repr(pser.rank(ascending=False)), repr(kser.rank(ascending=False).sort_index()) + ) + self.assertEqual(repr(pser.rank(method="min")), repr(kser.rank(method="min").sort_index())) + self.assertEqual(repr(pser.rank(method="max")), repr(kser.rank(method="max").sort_index())) + self.assertEqual( + repr(pser.rank(method="first")), repr(kser.rank(method="first").sort_index()) + ) + self.assertEqual( + repr(pser.rank(method="dense")), repr(kser.rank(method="dense").sort_index()) + ) msg = "method must be one of 'average', 'min', 'max', 'first', 'dense'" with self.assertRaisesRegex(ValueError, msg): - kser.rank(method='nothing') + kser.rank(method="nothing") def test_round(self): - pser = pd.Series([0.028208, 0.038683, 0.877076], name='x') + pser = pd.Series([0.028208, 0.038683, 0.877076], name="x") kser = ks.from_pandas(pser) self.assertEqual(repr(pser.round(2)), repr(kser.round(2))) msg = "decimals must be an integer" @@ -811,21 +964,22 @@ def test_round(self): def test_quantile(self): with self.assertRaisesRegex(ValueError, "accuracy must be an integer; however"): - ks.Series([24., 21., 25., 33., 26.]).quantile(accuracy="a") + ks.Series([24.0, 21.0, 25.0, 33.0, 
26.0]).quantile(accuracy="a") with self.assertRaisesRegex(ValueError, "q must be a float of an array of floats;"): - ks.Series([24., 21., 25., 33., 26.]).quantile(q="a") + ks.Series([24.0, 21.0, 25.0, 33.0, 26.0]).quantile(q="a") with self.assertRaisesRegex(ValueError, "q must be a float of an array of floats;"): - ks.Series([24., 21., 25., 33., 26.]).quantile(q=["a"]) + ks.Series([24.0, 21.0, 25.0, 33.0, 26.0]).quantile(q=["a"]) def test_idxmax(self): - pser = pd.Series(data=[1, 4, 5], index=['A', 'B', 'C']) + pser = pd.Series(data=[1, 4, 5], index=["A", "B", "C"]) kser = ks.Series(pser) self.assertEqual(kser.idxmax(), pser.idxmax()) self.assertEqual(kser.idxmax(skipna=False), pser.idxmax(skipna=False)) - index = pd.MultiIndex.from_arrays([ - ['a', 'a', 'b', 'b'], ['c', 'd', 'e', 'f']], names=('first', 'second')) + index = pd.MultiIndex.from_arrays( + [["a", "a", "b", "b"], ["c", "d", "e", "f"]], names=("first", "second") + ) pser = pd.Series(data=[1, 2, 4, 5], index=index) kser = ks.Series(pser) @@ -843,14 +997,15 @@ def test_idxmax(self): self.assertEqual(repr(kser.idxmax(skipna=False)), repr(pser.idxmax(skipna=False))) def test_idxmin(self): - pser = pd.Series(data=[1, 4, 5], index=['A', 'B', 'C']) + pser = pd.Series(data=[1, 4, 5], index=["A", "B", "C"]) kser = ks.Series(pser) self.assertEqual(kser.idxmin(), pser.idxmin()) self.assertEqual(kser.idxmin(skipna=False), pser.idxmin(skipna=False)) - index = pd.MultiIndex.from_arrays([ - ['a', 'a', 'b', 'b'], ['c', 'd', 'e', 'f']], names=('first', 'second')) + index = pd.MultiIndex.from_arrays( + [["a", "a", "b", "b"], ["c", "d", "e", "f"]], names=("first", "second") + ) pser = pd.Series(data=[1, 2, 4, 5], index=index) kser = ks.Series(pser) @@ -868,91 +1023,95 @@ def test_idxmin(self): self.assertEqual(repr(kser.idxmin(skipna=False)), repr(pser.idxmin(skipna=False))) def test_shift(self): - pser = pd.Series([10, 20, 15, 30, 45], name='x') + pser = pd.Series([10, 20, 15, 30, 45], name="x") kser = ks.Series(pser) - if LooseVersion(pd.__version__) < LooseVersion('0.24.2'): - self.assertEqual(repr(kser.shift(periods=2)), - repr(pser.shift(periods=2))) + if LooseVersion(pd.__version__) < LooseVersion("0.24.2"): + self.assertEqual(repr(kser.shift(periods=2)), repr(pser.shift(periods=2))) else: - self.assertEqual(repr(kser.shift(periods=2, fill_value=0)), - repr(pser.shift(periods=2, fill_value=0))) - with self.assertRaisesRegex(ValueError, 'periods should be an int; however'): + self.assertEqual( + repr(kser.shift(periods=2, fill_value=0)), repr(pser.shift(periods=2, fill_value=0)) + ) + with self.assertRaisesRegex(ValueError, "periods should be an int; however"): kser.shift(periods=1.5) def test_astype(self): - pser = pd.Series([10, 20, 15, 30, 45], name='x') + pser = pd.Series([10, 20, 15, 30, 45], name="x") kser = ks.Series(pser) - with self.assertRaisesRegex(ValueError, 'Type int63 not understood'): - kser.astype('int63') + with self.assertRaisesRegex(ValueError, "Type int63 not understood"): + kser.astype("int63") def test_aggregate(self): - pser = pd.Series([10, 20, 15, 30, 45], name='x') + pser = pd.Series([10, 20, 15, 30, 45], name="x") kser = ks.Series(pser) - msg = 'func must be a string or list of strings' + msg = "func must be a string or list of strings" with self.assertRaisesRegex(ValueError, msg): - kser.aggregate({'x': ['min', 'max']}) - msg = ('If the given function is a list, it ' - 'should only contains function names as strings.') + kser.aggregate({"x": ["min", "max"]}) + msg = ( + "If the given function is a list, it " 
"should only contains function names as strings." + ) with self.assertRaisesRegex(ValueError, msg): - kser.aggregate(['min', max]) + kser.aggregate(["min", max]) def test_drop(self): - pser = pd.Series([10, 20, 15, 30, 45], name='x') + pser = pd.Series([10, 20, 15, 30, 45], name="x") kser = ks.Series(pser) msg = "Need to specify at least one of 'labels' or 'index'" with self.assertRaisesRegex(ValueError, msg): kser.drop() # For MultiIndex - midx = pd.MultiIndex([['lama', 'cow', 'falcon'], - ['speed', 'weight', 'length']], - [[0, 0, 0, 1, 1, 1, 2, 2, 2], - [0, 1, 2, 0, 1, 2, 0, 1, 2]]) + midx = pd.MultiIndex( + [["lama", "cow", "falcon"], ["speed", "weight", "length"]], + [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]], + ) kser = ks.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx) msg = "'level' should be less than the number of indexes" with self.assertRaisesRegex(ValueError, msg): - kser.drop(labels='weight', level=2) - msg = ("If the given index is a list, it " - "should only contains names as strings, " - "or a list of tuples that contain " - "index names as strings") + kser.drop(labels="weight", level=2) + msg = ( + "If the given index is a list, it " + "should only contains names as strings, " + "or a list of tuples that contain " + "index names as strings" + ) with self.assertRaisesRegex(ValueError, msg): - kser.drop(['lama', ['cow', 'falcon']]) + kser.drop(["lama", ["cow", "falcon"]]) msg = "'index' type should be one of str, list, tuple" with self.assertRaisesRegex(ValueError, msg): - kser.drop({'lama': 'speed'}) + kser.drop({"lama": "speed"}) msg = "Cannot specify both 'labels' and 'index'" with self.assertRaisesRegex(ValueError, msg): - kser.drop('lama', index='cow') + kser.drop("lama", index="cow") msg = r"'Key length \(2\) exceeds index depth \(3\)'" with self.assertRaisesRegex(KeyError, msg): - kser.drop(('lama', 'speed', 'x')) - self.assert_eq(kser.drop(('lama', 'speed', 'x'), level=1), kser) + kser.drop(("lama", "speed", "x")) + self.assert_eq(kser.drop(("lama", "speed", "x"), level=1), kser) def test_pop(self): - midx = pd.MultiIndex([['lama', 'cow', 'falcon'], - ['speed', 'weight', 'length']], - [[0, 0, 0, 1, 1, 1, 2, 2, 2], - [0, 1, 2, 0, 1, 2, 0, 1, 2]]) - kser = ks.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], - index=midx) + midx = pd.MultiIndex( + [["lama", "cow", "falcon"], ["speed", "weight", "length"]], + [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]], + ) + kser = ks.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx) pser = kser.to_pandas() - self.assert_eq(kser.pop(('lama', 'speed')), pser.pop(('lama', 'speed'))) + self.assert_eq(kser.pop(("lama", "speed")), pser.pop(("lama", "speed"))) msg = "'key' should be string or tuple that contains strings" with self.assertRaisesRegex(ValueError, msg): kser.pop(0) - msg = ("'key' should have index names as only strings " - "or a tuple that contain index names as only strings") + msg = ( + "'key' should have index names as only strings " + "or a tuple that contain index names as only strings" + ) with self.assertRaisesRegex(ValueError, msg): - kser.pop(('lama', 0)) + kser.pop(("lama", 0)) msg = r"'Key length \(3\) exceeds index depth \(2\)'" with self.assertRaisesRegex(KeyError, msg): - kser.pop(('lama', 'speed', 'x')) + kser.pop(("lama", "speed", "x")) def test_replace(self): - pser = pd.Series([10, 20, 15, 30, 45], name='x') + pser = pd.Series([10, 20, 15, 30, 45], name="x") kser = ks.Series(pser) self.assert_eq(kser.replace(), pser.replace()) @@ -966,39 +1125,33 @@ def 
test_replace(self): kser.replace([10, 20, 30], [1, 2]) msg = "replace currently not support for regex" with self.assertRaisesRegex(NotImplementedError, msg): - kser.replace(r'^1.$', regex=True) + kser.replace(r"^1.$", regex=True) def test_xs(self): - midx = pd.MultiIndex([['a', 'b', 'c'], - ['lama', 'cow', 'falcon'], - ['speed', 'weight', 'length']], - [[0, 0, 0, 1, 1, 1, 2, 2, 2], - [0, 0, 0, 1, 1, 1, 2, 2, 2], - [0, 1, 2, 0, 1, 2, 0, 1, 2]]) - kser = ks.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], - index=midx) + midx = pd.MultiIndex( + [["a", "b", "c"], ["lama", "cow", "falcon"], ["speed", "weight", "length"]], + [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]], + ) + kser = ks.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx) pser = kser.to_pandas() - self.assert_eq(kser.xs(('a', 'lama', 'speed')), pser.xs(('a', 'lama', 'speed'))) + self.assert_eq(kser.xs(("a", "lama", "speed")), pser.xs(("a", "lama", "speed"))) def test_duplicates(self): # test on texts - pser = pd.Series(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo'], - name='animal') + pser = pd.Series(["lama", "cow", "lama", "beetle", "lama", "hippo"], name="animal") kser = ks.Series(pser) - self.assert_eq(pser.drop_duplicates().sort_values(), - kser.drop_duplicates().sort_values()) + self.assert_eq(pser.drop_duplicates().sort_values(), kser.drop_duplicates().sort_values()) # test on numbers pser = pd.Series([1, 1, 2, 4, 3]) kser = ks.Series(pser) - self.assert_eq(pser.drop_duplicates().sort_values(), - kser.drop_duplicates().sort_values()) + self.assert_eq(pser.drop_duplicates().sort_values(), kser.drop_duplicates().sort_values()) def test_update(self): - pser = pd.Series([10, 20, 15, 30, 45], name='x') + pser = pd.Series([10, 20, 15, 30, 45], name="x") kser = ks.Series(pser) msg = "'other' must be a Series" @@ -1009,15 +1162,13 @@ def test_where(self): pser1 = pd.Series([0, 1, 2, 3, 4], name=0) kser1 = ks.from_pandas(pser1) - self.assert_eq(repr(pser1.where(pser1 > 3)), - repr(kser1.where(kser1 > 3).sort_index())) + self.assert_eq(repr(pser1.where(pser1 > 3)), repr(kser1.where(kser1 > 3).sort_index())) def test_mask(self): pser1 = pd.Series([0, 1, 2, 3, 4], name=0) kser1 = ks.from_pandas(pser1) - self.assert_eq(repr(pser1.mask(pser1 > 3)), - repr(kser1.mask(kser1 > 3).sort_index())) + self.assert_eq(repr(pser1.mask(pser1 > 3)), repr(kser1.mask(kser1 > 3).sort_index())) def test_truncate(self): pser1 = pd.Series([10, 20, 30, 40, 50, 60, 70], index=[1, 2, 3, 4, 5, 6, 7]) @@ -1044,37 +1195,34 @@ def test_truncate(self): kser.truncate(5, 2) def test_getitem(self): - pser = pd.Series([10, 20, 15, 30, 45], ['A', 'A', 'B', 'C', 'D']) + pser = pd.Series([10, 20, 15, 30, 45], ["A", "A", "B", "C", "D"]) kser = ks.Series(pser) - self.assert_eq(kser['A'], pser['A']) - self.assert_eq(kser['B'], pser['B']) + self.assert_eq(kser["A"], pser["A"]) + self.assert_eq(kser["B"], pser["B"]) self.assert_eq(kser[kser > 15], pser[pser > 15]) # for MultiIndex - midx = pd.MultiIndex([['a', 'b', 'c'], - ['lama', 'cow', 'falcon'], - ['speed', 'weight', 'length']], - [[0, 0, 0, 0, 0, 0, 1, 1, 1], - [0, 0, 0, 1, 1, 1, 2, 2, 2], - [0, 0, 0, 0, 1, 2, 0, 1, 2]]) - pser = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], - name='0', index=midx) + midx = pd.MultiIndex( + [["a", "b", "c"], ["lama", "cow", "falcon"], ["speed", "weight", "length"]], + [[0, 0, 0, 0, 0, 0, 1, 1, 1], [0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 0, 0, 0, 1, 2, 0, 1, 2]], + ) + pser = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], 
name="0", index=midx) kser = ks.Series(pser) - self.assert_eq(kser['a'], pser['a']) - self.assert_eq(kser['a', 'lama'], pser['a', 'lama']) + self.assert_eq(kser["a"], pser["a"]) + self.assert_eq(kser["a", "lama"], pser["a", "lama"]) self.assert_eq(kser[kser > 1.5], pser[pser > 1.5]) msg = r"'Key length \(4\) exceeds index depth \(3\)'" with self.assertRaisesRegex(KeyError, msg): - kser[('a', 'lama', 'speed', 'x')] + kser[("a", "lama", "speed", "x")] def test_keys(self): - midx = pd.MultiIndex([['lama', 'cow', 'falcon'], - ['speed', 'weight', 'length']], - [[0, 0, 0, 1, 1, 1, 2, 2, 2], - [0, 1, 2, 0, 1, 2, 0, 1, 2]]) + midx = pd.MultiIndex( + [["lama", "cow", "falcon"], ["speed", "weight", "length"]], + [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]], + ) kser = ks.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx) pser = kser.to_pandas() @@ -1086,44 +1234,44 @@ def test_index(self): kser = ks.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=idx) pser = kser.to_pandas() - kser.name = 'koalas' - pser.name = 'koalas' + kser.name = "koalas" + pser.name = "koalas" self.assert_eq(kser.index.name, pser.index.name) # for check setting names of MultiIndex properly. - kser.names = ['hello', 'koalas'] - pser.names = ['hello', 'koalas'] + kser.names = ["hello", "koalas"] + pser.names = ["hello", "koalas"] self.assert_eq(kser.index.names, pser.index.names) def test_pct_change(self): kser = ks.Series([90, 91, 85], index=[2, 4, 1]) pser = kser.to_pandas() - self.assert_eq(kser.pct_change(periods=-1), - pser.pct_change(periods=-1), almost=True) - self.assert_eq(kser.pct_change(periods=-100000000), - pser.pct_change(periods=-100000000), almost=True) - self.assert_eq(kser.pct_change(periods=100000000), - pser.pct_change(periods=100000000), almost=True) + self.assert_eq(kser.pct_change(periods=-1), pser.pct_change(periods=-1), almost=True) + self.assert_eq( + kser.pct_change(periods=-100000000), pser.pct_change(periods=-100000000), almost=True + ) + self.assert_eq( + kser.pct_change(periods=100000000), pser.pct_change(periods=100000000), almost=True + ) # for MultiIndex - midx = pd.MultiIndex([['lama', 'cow', 'falcon'], - ['speed', 'weight', 'length']], - [[0, 0, 0, 1, 1, 1, 2, 2, 2], - [0, 1, 2, 0, 1, 2, 0, 1, 2]]) + midx = pd.MultiIndex( + [["lama", "cow", "falcon"], ["speed", "weight", "length"]], + [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]], + ) kser = ks.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx) pser = kser.to_pandas() - self.assert_eq(kser.pct_change(), - pser.pct_change(), almost=True) - self.assert_eq(kser.pct_change(periods=2), - pser.pct_change(periods=2), almost=True) - self.assert_eq(kser.pct_change(periods=-1), - pser.pct_change(periods=-1), almost=True) - self.assert_eq(kser.pct_change(periods=-100000000), - pser.pct_change(periods=-100000000), almost=True) - self.assert_eq(kser.pct_change(periods=100000000), - pser.pct_change(periods=100000000), almost=True) + self.assert_eq(kser.pct_change(), pser.pct_change(), almost=True) + self.assert_eq(kser.pct_change(periods=2), pser.pct_change(periods=2), almost=True) + self.assert_eq(kser.pct_change(periods=-1), pser.pct_change(periods=-1), almost=True) + self.assert_eq( + kser.pct_change(periods=-100000000), pser.pct_change(periods=-100000000), almost=True + ) + self.assert_eq( + kser.pct_change(periods=100000000), pser.pct_change(periods=100000000), almost=True + ) def test_axes(self): kser = ks.Series([90, 91, 85], index=[2, 4, 1]) @@ -1131,10 +1279,10 @@ def test_axes(self): 
self.assert_list_eq(kser.axes, pser.axes) # for MultiIndex - midx = pd.MultiIndex([['lama', 'cow', 'falcon'], - ['speed', 'weight', 'length']], - [[0, 0, 0, 1, 1, 1, 2, 2, 2], - [0, 1, 2, 0, 1, 2, 0, 1, 2]]) + midx = pd.MultiIndex( + [["lama", "cow", "falcon"], ["speed", "weight", "length"]], + [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]], + ) kser = ks.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx) pser = kser.to_pandas() self.assert_list_eq(kser.axes, pser.axes) diff --git a/databricks/koalas/tests/test_series_conversion.py b/databricks/koalas/tests/test_series_conversion.py index 5d12559..4a4d3b1 100644 --- a/databricks/koalas/tests/test_series_conversion.py +++ b/databricks/koalas/tests/test_series_conversion.py @@ -22,10 +22,9 @@ class SeriesConversionTest(ReusedSQLTestCase, SQLTestUtils): - @property def pser(self): - return pd.Series([1, 2, 3, 4, 5, 6, 7], name='x') + return pd.Series([1, 2, 3, 4, 5, 6, 7], name="x") @property def kser(self): @@ -36,10 +35,10 @@ def test_to_clipboard(self): kser = self.kser self.assert_eq(kser.to_clipboard(), pser.to_clipboard()) - self.assert_eq(kser.to_clipboard(excel=False), - pser.to_clipboard(excel=False)) - self.assert_eq(kser.to_clipboard(sep=',', index=False), - pser.to_clipboard(sep=',', index=False)) + self.assert_eq(kser.to_clipboard(excel=False), pser.to_clipboard(excel=False)) + self.assert_eq( + kser.to_clipboard(sep=",", index=False), pser.to_clipboard(sep=",", index=False) + ) def test_to_latex(self): pser = self.pser @@ -49,11 +48,11 @@ def test_to_latex(self): self.assert_eq(kser.to_latex(col_space=2), pser.to_latex(col_space=2)) self.assert_eq(kser.to_latex(header=True), pser.to_latex(header=True)) self.assert_eq(kser.to_latex(index=False), pser.to_latex(index=False)) - self.assert_eq(kser.to_latex(na_rep='-'), pser.to_latex(na_rep='-')) - self.assert_eq(kser.to_latex(float_format='%.1f'), pser.to_latex(float_format='%.1f')) + self.assert_eq(kser.to_latex(na_rep="-"), pser.to_latex(na_rep="-")) + self.assert_eq(kser.to_latex(float_format="%.1f"), pser.to_latex(float_format="%.1f")) self.assert_eq(kser.to_latex(sparsify=False), pser.to_latex(sparsify=False)) self.assert_eq(kser.to_latex(index_names=False), pser.to_latex(index_names=False)) self.assert_eq(kser.to_latex(bold_rows=True), pser.to_latex(bold_rows=True)) # Error in pandas - ValueError: buf is not a file name and encoding is specified. 
# self.assert_eq(kser.to_latex(encoding='ascii'), pser.to_latex(encoding='ascii')) - self.assert_eq(kser.to_latex(decimal=','), pser.to_latex(decimal=',')) + self.assert_eq(kser.to_latex(decimal=","), pser.to_latex(decimal=",")) diff --git a/databricks/koalas/tests/test_series_datetime.py b/databricks/koalas/tests/test_series_datetime.py index 72aebc4..ce147ff 100644 --- a/databricks/koalas/tests/test_series_datetime.py +++ b/databricks/koalas/tests/test_series_datetime.py @@ -25,16 +25,15 @@ class SeriesDateTimeTest(ReusedSQLTestCase, SQLTestUtils): - @property def pdf1(self): - date1 = pd.Series(pd.date_range('2012-1-1 12:45:31', periods=3, freq='M')) - date2 = pd.Series(pd.date_range('2013-3-11 21:45:00', periods=3, freq='W')) + date1 = pd.Series(pd.date_range("2012-1-1 12:45:31", periods=3, freq="M")) + date2 = pd.Series(pd.date_range("2013-3-11 21:45:00", periods=3, freq="W")) return pd.DataFrame(dict(start_date=date1, end_date=date2)) @property def pd_start_date(self): - return self.pdf1['start_date'] + return self.pdf1["start_date"] @property def ks_start_date(self): @@ -42,39 +41,42 @@ def ks_start_date(self): def check_func(self, func): mt.assert_series_equal( - func(self.ks_start_date).to_pandas(), - func(self.pd_start_date), - check_names=False + func(self.ks_start_date).to_pandas(), func(self.pd_start_date), check_names=False ) @unittest.skip( "It fails in certain OSs presumably due to different " - "timezone behaviours inherited from C library.") + "timezone behaviours inherited from C library." + ) def test_subtraction(self): pdf = self.pdf1 kdf = ks.from_pandas(pdf) - kdf['diff_seconds'] = kdf['end_date'] - kdf['start_date'] - 1 + kdf["diff_seconds"] = kdf["end_date"] - kdf["start_date"] - 1 - self.assertEqual(list(kdf['diff_seconds'].toPandas()), [35545499, 33644699, 31571099]) + self.assertEqual(list(kdf["diff_seconds"].toPandas()), [35545499, 33644699, 31571099]) - kdf = ks.from_pandas(pd.DataFrame({ - 'a': pd.date_range('2016-12-31', '2017-01-08', freq='D'), - 'b': pd.Series(range(9))})) - expected_error_message = 'datetime subtraction can only be applied to datetime series.' + kdf = ks.from_pandas( + pd.DataFrame( + {"a": pd.date_range("2016-12-31", "2017-01-08", freq="D"), "b": pd.Series(range(9))} + ) + ) + expected_error_message = "datetime subtraction can only be applied to datetime series." with self.assertRaisesRegex(TypeError, expected_error_message): - kdf['a'] - kdf['b'] + kdf["a"] - kdf["b"] @unittest.skip( "It fails in certain OSs presumably due to different " - "timezone behaviours inherited from C library.") + "timezone behaviours inherited from C library." 
+ ) def test_div(self): pdf = self.pdf1 kdf = ks.from_pandas(pdf) - for u in 'D', 's', 'ms': + for u in "D", "s", "ms": duration = np.timedelta64(1, u) self.assert_eq( - (kdf['end_date'] - kdf['start_date']) / duration, - (pdf['end_date'] - pdf['start_date']) / duration) + (kdf["end_date"] - kdf["start_date"]) / duration, + (pdf["end_date"] - pdf["start_date"]) / duration, + ) @unittest.skip("It is currently failed probably for the same reason in 'test_subtraction'") def test_date(self): @@ -160,39 +162,39 @@ def test_days_in_month(self): @unittest.expectedFailure def test_tz_localize(self): - self.check_func(lambda x: x.dt.tz_localize('America/New_York')) + self.check_func(lambda x: x.dt.tz_localize("America/New_York")) @unittest.expectedFailure def test_tz_convert(self): - self.check_func(lambda x: x.dt.tz_convert('America/New_York')) + self.check_func(lambda x: x.dt.tz_convert("America/New_York")) def test_normalize(self): self.check_func(lambda x: x.dt.normalize()) def test_strftime(self): - self.check_func(lambda x: x.dt.strftime('%Y-%m-%d')) + self.check_func(lambda x: x.dt.strftime("%Y-%m-%d")) def test_round(self): - self.check_func(lambda x: x.dt.round(freq='min')) - self.check_func(lambda x: x.dt.round(freq='H')) + self.check_func(lambda x: x.dt.round(freq="min")) + self.check_func(lambda x: x.dt.round(freq="H")) def test_floor(self): - self.check_func(lambda x: x.dt.floor(freq='min')) - self.check_func(lambda x: x.dt.floor(freq='H')) + self.check_func(lambda x: x.dt.floor(freq="min")) + self.check_func(lambda x: x.dt.floor(freq="H")) def test_ceil(self): - self.check_func(lambda x: x.dt.floor(freq='min')) - self.check_func(lambda x: x.dt.floor(freq='H')) + self.check_func(lambda x: x.dt.floor(freq="min")) + self.check_func(lambda x: x.dt.floor(freq="H")) def test_month_name(self): self.check_func(lambda x: x.dt.month_name()) - self.check_func(lambda x: x.dt.month_name(locale='en_US.UTF-8')) + self.check_func(lambda x: x.dt.month_name(locale="en_US.UTF-8")) def test_day_name(self): self.check_func(lambda x: x.dt.day_name()) - self.check_func(lambda x: x.dt.day_name(locale='en_US.UTF-8')) + self.check_func(lambda x: x.dt.day_name(locale="en_US.UTF-8")) def test_unsupported_type(self): - self.assertRaisesRegex(ValueError, - 'Cannot call DatetimeMethods on type LongType', - lambda: ks.Series([0]).dt) + self.assertRaisesRegex( + ValueError, "Cannot call DatetimeMethods on type LongType", lambda: ks.Series([0]).dt + ) diff --git a/databricks/koalas/tests/test_series_plot.py b/databricks/koalas/tests/test_series_plot.py index df6e9da..3027a66 100644 --- a/databricks/koalas/tests/test_series_plot.py +++ b/databricks/koalas/tests/test_series_plot.py @@ -27,26 +27,25 @@ from databricks.koalas.testing.utils import ReusedSQLTestCase, TestUtils from databricks.koalas.plot import KoalasBoxPlot, KoalasHistPlot -matplotlib.use('agg') +matplotlib.use("agg") class SeriesPlotTest(ReusedSQLTestCase, TestUtils): - @classmethod def setUpClass(cls): super(SeriesPlotTest, cls).setUpClass() - set_option('plotting.max_rows', 1000) + set_option("plotting.max_rows", 1000) @classmethod def tearDownClass(cls): - reset_option('plotting.max_rows') + reset_option("plotting.max_rows") super(SeriesPlotTest, cls).tearDownClass() @property def pdf1(self): - return pd.DataFrame({ - 'a': [1, 2, 3, 4, 5, 6, 7, 8, 9, 15, 50], - }, index=[0, 1, 3, 5, 6, 8, 9, 9, 9, 10, 10]) + return pd.DataFrame( + {"a": [1, 2, 3, 4, 5, 6, 7, 8, 9, 15, 50],}, index=[0, 1, 3, 5, 6, 8, 9, 9, 9, 10, 10] + ) @property def kdf1(self): @@ 
-63,7 +62,7 @@ def pdf2(self): @staticmethod def plot_to_base64(ax): bytes_data = BytesIO() - ax.figure.savefig(bytes_data, format='png') + ax.figure.savefig(bytes_data, format="png") bytes_data.seek(0) b64_data = base64.b64encode(bytes_data.read()) plt.close(ax.figure) @@ -73,15 +72,15 @@ def test_bar_plot(self): pdf = self.pdf1 kdf = self.kdf1 - ax1 = pdf['a'].plot(kind="bar", colormap='Paired') + ax1 = pdf["a"].plot(kind="bar", colormap="Paired") bin1 = self.plot_to_base64(ax1) - ax2 = kdf['a'].plot(kind="bar", colormap='Paired') + ax2 = kdf["a"].plot(kind="bar", colormap="Paired") bin2 = self.plot_to_base64(ax2) self.assertEqual(bin1, bin2) - ax1 = pdf['a'].plot(kind='bar', colormap='Paired') + ax1 = pdf["a"].plot(kind="bar", colormap="Paired") bin1 = self.plot_to_base64(ax1) - ax2 = kdf['a'].plot(kind='bar', colormap='Paired') + ax2 = kdf["a"].plot(kind="bar", colormap="Paired") bin2 = self.plot_to_base64(ax2) self.assertEqual(bin1, bin2) @@ -90,13 +89,20 @@ def test_bar_plot_limited(self): kdf = self.kdf2 _, ax1 = plt.subplots(1, 1) - ax1 = pdf['id'][:1000].plot.bar(colormap='Paired') - ax1.text(1, 1, 'showing top 1000 elements only', size=6, ha='right', va='bottom', - transform=ax1.transAxes) + ax1 = pdf["id"][:1000].plot.bar(colormap="Paired") + ax1.text( + 1, + 1, + "showing top 1000 elements only", + size=6, + ha="right", + va="bottom", + transform=ax1.transAxes, + ) bin1 = self.plot_to_base64(ax1) _, ax2 = plt.subplots(1, 1) - ax2 = kdf['id'].plot.bar(colormap='Paired') + ax2 = kdf["id"].plot.bar(colormap="Paired") bin2 = self.plot_to_base64(ax2) self.assertEqual(bin1, bin2) @@ -105,15 +111,15 @@ def test_pie_plot(self): pdf = self.pdf1 kdf = self.kdf1 - ax1 = pdf['a'].plot.pie(colormap='Paired') + ax1 = pdf["a"].plot.pie(colormap="Paired") bin1 = self.plot_to_base64(ax1) - ax2 = kdf['a'].plot.pie(colormap='Paired') + ax2 = kdf["a"].plot.pie(colormap="Paired") bin2 = self.plot_to_base64(ax2) self.assertEqual(bin1, bin2) - ax1 = pdf['a'].plot(kind='pie', colormap='Paired') + ax1 = pdf["a"].plot(kind="pie", colormap="Paired") bin1 = self.plot_to_base64(ax1) - ax2 = kdf['a'].plot(kind='pie', colormap='Paired') + ax2 = kdf["a"].plot(kind="pie", colormap="Paired") bin2 = self.plot_to_base64(ax2) self.assertEqual(bin1, bin2) @@ -122,13 +128,20 @@ def test_pie_plot_limited(self): kdf = self.kdf2 _, ax1 = plt.subplots(1, 1) - ax1 = pdf['id'][:1000].plot.pie(colormap='Paired') - ax1.text(1, 1, 'showing top 1000 elements only', size=6, ha='right', va='bottom', - transform=ax1.transAxes) + ax1 = pdf["id"][:1000].plot.pie(colormap="Paired") + ax1.text( + 1, + 1, + "showing top 1000 elements only", + size=6, + ha="right", + va="bottom", + transform=ax1.transAxes, + ) bin1 = self.plot_to_base64(ax1) _, ax2 = plt.subplots(1, 1) - ax2 = kdf['id'].plot.pie(colormap='Paired') + ax2 = kdf["id"].plot.pie(colormap="Paired") bin2 = self.plot_to_base64(ax2) self.assertEqual(bin1, bin2) @@ -137,15 +150,15 @@ def test_line_plot(self): pdf = self.pdf1 kdf = self.kdf1 - ax1 = pdf['a'].plot(kind="line", colormap='Paired') + ax1 = pdf["a"].plot(kind="line", colormap="Paired") bin1 = self.plot_to_base64(ax1) - ax2 = kdf['a'].plot(kind="line", colormap='Paired') + ax2 = kdf["a"].plot(kind="line", colormap="Paired") bin2 = self.plot_to_base64(ax2) self.assertEqual(bin1, bin2) - ax1 = pdf['a'].plot.line(colormap='Paired') + ax1 = pdf["a"].plot.line(colormap="Paired") bin1 = self.plot_to_base64(ax1) - ax2 = kdf['a'].plot.line(colormap='Paired') + ax2 = kdf["a"].plot.line(colormap="Paired") bin2 = 
self.plot_to_base64(ax2) self.assertEqual(bin1, bin2) @@ -153,9 +166,9 @@ def test_barh_plot(self): pdf = self.pdf1 kdf = self.kdf1 - ax1 = pdf['a'].plot(kind="barh", colormap='Paired') + ax1 = pdf["a"].plot(kind="barh", colormap="Paired") bin1 = self.plot_to_base64(ax1) - ax2 = kdf['a'].plot(kind="barh", colormap='Paired') + ax2 = kdf["a"].plot(kind="barh", colormap="Paired") bin2 = self.plot_to_base64(ax2) self.assertEqual(bin1, bin2) @@ -164,13 +177,20 @@ def test_barh_plot_limited(self): kdf = self.kdf2 _, ax1 = plt.subplots(1, 1) - ax1 = pdf['id'][:1000].plot.barh(colormap='Paired') - ax1.text(1, 1, 'showing top 1000 elements only', size=6, ha='right', va='bottom', - transform=ax1.transAxes) + ax1 = pdf["id"][:1000].plot.barh(colormap="Paired") + ax1.text( + 1, + 1, + "showing top 1000 elements only", + size=6, + ha="right", + va="bottom", + transform=ax1.transAxes, + ) bin1 = self.plot_to_base64(ax1) _, ax2 = plt.subplots(1, 1) - ax2 = kdf['id'].plot.barh(colormap='Paired') + ax2 = kdf["id"].plot.barh(colormap="Paired") bin2 = self.plot_to_base64(ax2) self.assertEqual(bin1, bin2) @@ -180,78 +200,83 @@ def test_hist_plot(self): kdf = self.kdf1 _, ax1 = plt.subplots(1, 1) - ax1 = pdf['a'].plot.hist() + ax1 = pdf["a"].plot.hist() bin1 = self.plot_to_base64(ax1) _, ax2 = plt.subplots(1, 1) - ax2 = kdf['a'].plot.hist() + ax2 = kdf["a"].plot.hist() bin2 = self.plot_to_base64(ax2) self.assertEqual(bin1, bin2) - ax1 = pdf['a'].plot.hist(bins=15) + ax1 = pdf["a"].plot.hist(bins=15) bin1 = self.plot_to_base64(ax1) - ax2 = kdf['a'].plot.hist(bins=15) + ax2 = kdf["a"].plot.hist(bins=15) bin2 = self.plot_to_base64(ax2) self.assertEqual(bin1, bin2) - ax1 = pdf['a'].plot(kind='hist', bins=15) + ax1 = pdf["a"].plot(kind="hist", bins=15) bin1 = self.plot_to_base64(ax1) - ax2 = kdf['a'].plot(kind='hist', bins=15) + ax2 = kdf["a"].plot(kind="hist", bins=15) bin2 = self.plot_to_base64(ax2) self.assertEqual(bin1, bin2) - ax1 = pdf['a'].plot.hist(bins=3, bottom=[2, 1, 3]) + ax1 = pdf["a"].plot.hist(bins=3, bottom=[2, 1, 3]) bin1 = self.plot_to_base64(ax1) - ax2 = kdf['a'].plot.hist(bins=3, bottom=[2, 1, 3]) + ax2 = kdf["a"].plot.hist(bins=3, bottom=[2, 1, 3]) bin2 = self.plot_to_base64(ax2) self.assertEqual(bin1, bin2) def test_compute_hist(self): kdf = self.kdf1 expected_bins = np.linspace(1, 50, 11) - bins = KoalasHistPlot._get_bins(kdf[['a']].to_spark(), 10) + bins = KoalasHistPlot._get_bins(kdf[["a"]].to_spark(), 10) expected_histogram = np.array([5, 4, 1, 0, 0, 0, 0, 0, 0, 1]) - histogram = KoalasHistPlot._compute_hist(kdf[['a']].to_spark(), bins) + histogram = KoalasHistPlot._compute_hist(kdf[["a"]].to_spark(), bins) self.assert_eq(pd.Series(expected_bins), pd.Series(bins)) self.assert_eq(pd.Series(expected_histogram), histogram) def test_area_plot(self): - pdf = pd.DataFrame({ - 'sales': [3, 2, 3, 9, 10, 6], - 'signups': [5, 5, 6, 12, 14, 13], - 'visits': [20, 42, 28, 62, 81, 50], - }, index=pd.date_range(start='2018/01/01', end='2018/07/01', freq='M')) + pdf = pd.DataFrame( + { + "sales": [3, 2, 3, 9, 10, 6], + "signups": [5, 5, 6, 12, 14, 13], + "visits": [20, 42, 28, 62, 81, 50], + }, + index=pd.date_range(start="2018/01/01", end="2018/07/01", freq="M"), + ) kdf = ks.from_pandas(pdf) - ax1 = pdf['sales'].plot(kind="area", colormap='Paired') + ax1 = pdf["sales"].plot(kind="area", colormap="Paired") bin1 = self.plot_to_base64(ax1) - ax2 = kdf['sales'].plot(kind="area", colormap='Paired') + ax2 = kdf["sales"].plot(kind="area", colormap="Paired") bin2 = self.plot_to_base64(ax2) 
self.assertEqual(bin1, bin2) - ax1 = pdf['sales'].plot.area(colormap='Paired') + ax1 = pdf["sales"].plot.area(colormap="Paired") bin1 = self.plot_to_base64(ax1) - ax2 = kdf['sales'].plot.area(colormap='Paired') + ax2 = kdf["sales"].plot.area(colormap="Paired") bin2 = self.plot_to_base64(ax2) self.assertEqual(bin1, bin2) # just a sanity check for df.col type - ax1 = pdf.sales.plot(kind="area", colormap='Paired') + ax1 = pdf.sales.plot(kind="area", colormap="Paired") bin1 = self.plot_to_base64(ax1) - ax2 = kdf.sales.plot(kind="area", colormap='Paired') + ax2 = kdf.sales.plot(kind="area", colormap="Paired") bin2 = self.plot_to_base64(ax2) self.assertEqual(bin1, bin2) def test_box_plot(self): def check_box_plot(pdf, kdf, *args, **kwargs): _, ax1 = plt.subplots(1, 1) - ax1 = pdf['a'].plot.box(*args, **kwargs) + ax1 = pdf["a"].plot.box(*args, **kwargs) _, ax2 = plt.subplots(1, 1) - ax2 = kdf['a'].plot.box(*args, **kwargs) + ax2 = kdf["a"].plot.box(*args, **kwargs) - diffs = [np.array([0, .5, 0, .5, 0, -.5, 0, -.5, 0, .5]), - np.array([0, .5, 0, 0]), - np.array([0, -.5, 0, 0])] + diffs = [ + np.array([0, 0.5, 0, 0.5, 0, -0.5, 0, -0.5, 0, 0.5]), + np.array([0, 0.5, 0, 0]), + np.array([0, -0.5, 0, 0]), + ] try: for i, (line1, line2) in enumerate(zip(ax1.get_lines(), ax2.get_lines())): @@ -266,47 +291,47 @@ def check_box_plot(pdf, kdf, *args, **kwargs): check_box_plot(self.pdf1, self.kdf1) check_box_plot(self.pdf1, self.kdf1, showfliers=True) - check_box_plot(self.pdf1, self.kdf1, sym='') - check_box_plot(self.pdf1, self.kdf1, sym='.', color='r') - check_box_plot(self.pdf1, self.kdf1, use_index=False, labels=['Test']) + check_box_plot(self.pdf1, self.kdf1, sym="") + check_box_plot(self.pdf1, self.kdf1, sym=".", color="r") + check_box_plot(self.pdf1, self.kdf1, use_index=False, labels=["Test"]) check_box_plot(self.pdf1, self.kdf1, usermedians=[2.0]) check_box_plot(self.pdf1, self.kdf1, conf_intervals=[(1.0, 3.0)]) val = (1, 3) self.assertRaises( - ValueError, - lambda: check_box_plot(self.pdf1, self.kdf1, usermedians=[2.0, 3.0])) + ValueError, lambda: check_box_plot(self.pdf1, self.kdf1, usermedians=[2.0, 3.0]) + ) self.assertRaises( - ValueError, - lambda: check_box_plot(self.pdf1, self.kdf1, conf_intervals=[val, val])) + ValueError, lambda: check_box_plot(self.pdf1, self.kdf1, conf_intervals=[val, val]) + ) self.assertRaises( - ValueError, - lambda: check_box_plot(self.pdf1, self.kdf1, conf_intervals=[(1,)])) + ValueError, lambda: check_box_plot(self.pdf1, self.kdf1, conf_intervals=[(1,)]) + ) def test_box_summary(self): kdf = self.kdf1 pdf = self.pdf1 k = 1.5 - stats, fences = KoalasBoxPlot._compute_stats(kdf['a'], 'a', whis=k, precision=0.01) - outliers = KoalasBoxPlot._outliers(kdf['a'], 'a', *fences) - whiskers = KoalasBoxPlot._calc_whiskers('a', outliers) - fliers = KoalasBoxPlot._get_fliers('a', outliers) + stats, fences = KoalasBoxPlot._compute_stats(kdf["a"], "a", whis=k, precision=0.01) + outliers = KoalasBoxPlot._outliers(kdf["a"], "a", *fences) + whiskers = KoalasBoxPlot._calc_whiskers("a", outliers) + fliers = KoalasBoxPlot._get_fliers("a", outliers) - expected_mean = pdf['a'].mean() - expected_median = pdf['a'].median() - expected_q1 = np.percentile(pdf['a'], 25) - expected_q3 = np.percentile(pdf['a'], 75) - iqr = (expected_q3 - expected_q1) + expected_mean = pdf["a"].mean() + expected_median = pdf["a"].median() + expected_q1 = np.percentile(pdf["a"], 25) + expected_q3 = np.percentile(pdf["a"], 75) + iqr = expected_q3 - expected_q1 expected_fences = (expected_q1 - k * iqr, expected_q3 
+ k * iqr) - pdf['outlier'] = ~pdf['a'].between(fences[0], fences[1]) - expected_whiskers = pdf.query('not outlier')['a'].min(), pdf.query('not outlier')['a'].max() - expected_fliers = pdf.query('outlier')['a'].values - - self.assertEqual(expected_mean, stats['mean']) - self.assertEqual(expected_median, stats['med']) - self.assertEqual(expected_q1, stats['q1'] + .5) - self.assertEqual(expected_q3, stats['q3'] - .5) + pdf["outlier"] = ~pdf["a"].between(fences[0], fences[1]) + expected_whiskers = pdf.query("not outlier")["a"].min(), pdf.query("not outlier")["a"].max() + expected_fliers = pdf.query("outlier")["a"].values + + self.assertEqual(expected_mean, stats["mean"]) + self.assertEqual(expected_median, stats["med"]) + self.assertEqual(expected_q1, stats["q1"] + 0.5) + self.assertEqual(expected_q3, stats["q3"] - 0.5) self.assertEqual(expected_fences[0], fences[0] + 2.0) self.assertEqual(expected_fences[1], fences[1] - 2.0) self.assertEqual(expected_whiskers[0], whiskers[0]) @@ -317,13 +342,13 @@ def test_kde_plot(self): def moving_average(a, n=10): ret = np.cumsum(a, dtype=float) ret[n:] = ret[n:] - ret[:-n] - return ret[n - 1:] / n + return ret[n - 1 :] / n def check_kde_plot(pdf, kdf, *args, **kwargs): _, ax1 = plt.subplots(1, 1) - ax1 = pdf['a'].plot.kde(*args, **kwargs) + ax1 = pdf["a"].plot.kde(*args, **kwargs) _, ax2 = plt.subplots(1, 1) - ax2 = kdf['a'].plot.kde(*args, **kwargs) + ax2 = kdf["a"].plot.kde(*args, **kwargs) try: for i, (line1, line2) in enumerate(zip(ax1.get_lines(), ax2.get_lines())): @@ -336,8 +361,8 @@ def check_kde_plot(pdf, kdf, *args, **kwargs): # Note: Data is from 1 to 50. So, it smooths them by moving average and compares # both. self.assertTrue( - np.allclose(moving_average(actual), - moving_average(expected), rtol=3)) + np.allclose(moving_average(actual), moving_average(expected), rtol=3) + ) finally: ax1.cla() ax2.cla() @@ -346,12 +371,11 @@ def check_kde_plot(pdf, kdf, *args, **kwargs): check_kde_plot(self.pdf1, self.kdf1, ind=[1, 2, 3, 4, 5], bw_method=3.0) def test_empty_hist(self): - pdf = self.pdf1.assign(categorical='A') + pdf = self.pdf1.assign(categorical="A") kdf = ks.from_pandas(pdf) - kser = kdf['categorical'] + kser = kdf["categorical"] - with self.assertRaisesRegex(TypeError, - "Empty 'DataFrame': no numeric data to plot"): + with self.assertRaisesRegex(TypeError, "Empty 'DataFrame': no numeric data to plot"): kser.plot.hist() def test_single_value_hist(self): @@ -359,9 +383,9 @@ def test_single_value_hist(self): kdf = ks.from_pandas(pdf) _, ax1 = plt.subplots(1, 1) - ax1 = pdf['single'].plot.hist() + ax1 = pdf["single"].plot.hist() bin1 = self.plot_to_base64(ax1) _, ax2 = plt.subplots(1, 1) - ax2 = kdf['single'].plot.hist() + ax2 = kdf["single"].plot.hist() bin2 = self.plot_to_base64(ax2) self.assertEqual(bin1, bin2) diff --git a/databricks/koalas/tests/test_series_string.py b/databricks/koalas/tests/test_series_string.py index 8d231b4..117ddbc 100644 --- a/databricks/koalas/tests/test_series_string.py +++ b/databricks/koalas/tests/test_series_string.py @@ -25,88 +25,94 @@ class SeriesStringTest(ReusedSQLTestCase, SQLTestUtils): - @property def pser(self): - return pd.Series(['apples', 'Bananas', 'carrots', '1', '100', '', - '\nleading-whitespace', 'trailing-Whitespace \t', - None, np.NaN]) + return pd.Series( + [ + "apples", + "Bananas", + "carrots", + "1", + "100", + "", + "\nleading-whitespace", + "trailing-Whitespace \t", + None, + np.NaN, + ] + ) def check_func(self, func): self.check_func_on_series(func, self.pser) def 
check_func_on_series(self, func, pser): kser = ks.from_pandas(pser) - mt.assert_series_equal( - func(kser).toPandas(), - func(pser), - check_names=False - ) + mt.assert_series_equal(func(kser).toPandas(), func(pser), check_names=False) def test_string_add_str_num(self): - pdf = pd.DataFrame(dict(col1=['a'], col2=[1])) + pdf = pd.DataFrame(dict(col1=["a"], col2=[1])) kdf = ks.from_pandas(pdf) with self.assertRaises(TypeError): - kdf['col1'] + kdf['col2'] + kdf["col1"] + kdf["col2"] def test_string_add_assign(self): - pdf = pd.DataFrame(dict(col1=['a', 'b', 'c'], col2=['1', '2', '3'])) + pdf = pd.DataFrame(dict(col1=["a", "b", "c"], col2=["1", "2", "3"])) kdf = ks.from_pandas(pdf) - kdf['col1'] += kdf['col2'] - pdf['col1'] += pdf['col2'] - self.assert_eq(kdf['col1'], pdf['col1']) + kdf["col1"] += kdf["col2"] + pdf["col1"] += pdf["col2"] + self.assert_eq(kdf["col1"], pdf["col1"]) def test_string_add_str_str(self): - pdf = pd.DataFrame(dict(col1=['a', 'b', 'c'], col2=['1', '2', '3'])) + pdf = pd.DataFrame(dict(col1=["a", "b", "c"], col2=["1", "2", "3"])) kdf = ks.from_pandas(pdf) - self.assert_eq(kdf['col1'] + kdf['col2'], pdf['col1'] + pdf['col2']) - self.assert_eq(kdf['col2'] + kdf['col1'], pdf['col2'] + pdf['col1']) + self.assert_eq(kdf["col1"] + kdf["col2"], pdf["col1"] + pdf["col2"]) + self.assert_eq(kdf["col2"] + kdf["col1"], pdf["col2"] + pdf["col1"]) def test_string_add_str_lit(self): - pdf = pd.DataFrame(dict(col1=['a', 'b', 'c'])) + pdf = pd.DataFrame(dict(col1=["a", "b", "c"])) kdf = ks.from_pandas(pdf) - self.assert_eq(kdf['col1'] + '_lit', pdf['col1'] + '_lit') - self.assert_eq('_lit' + kdf['col1'], '_lit' + pdf['col1']) + self.assert_eq(kdf["col1"] + "_lit", pdf["col1"] + "_lit") + self.assert_eq("_lit" + kdf["col1"], "_lit" + pdf["col1"]) def test_string_capitalize(self): - self.check_func(lambda x: x.str.capitalize()) + self.check_func(lambda x: x.str.capitalize()) def test_string_title(self): self.check_func(lambda x: x.str.title()) def test_string_lower(self): - self.check_func(lambda x: x.str.lower()) + self.check_func(lambda x: x.str.lower()) def test_string_upper(self): - self.check_func(lambda x: x.str.upper()) + self.check_func(lambda x: x.str.upper()) def test_string_swapcase(self): self.check_func(lambda x: x.str.swapcase()) def test_string_startswith(self): - pattern = 'car' + pattern = "car" self.check_func(lambda x: x.str.startswith(pattern)) self.check_func(lambda x: x.str.startswith(pattern, na=False)) def test_string_endswith(self): - pattern = 's' + pattern = "s" self.check_func(lambda x: x.str.endswith(pattern)) self.check_func(lambda x: x.str.endswith(pattern, na=False)) def test_string_strip(self): self.check_func(lambda x: x.str.strip()) - self.check_func(lambda x: x.str.strip('es\t')) - self.check_func(lambda x: x.str.strip('1')) + self.check_func(lambda x: x.str.strip("es\t")) + self.check_func(lambda x: x.str.strip("1")) def test_string_lstrip(self): self.check_func(lambda x: x.str.lstrip()) - self.check_func(lambda x: x.str.lstrip('\n1le')) - self.check_func(lambda x: x.str.lstrip('s')) + self.check_func(lambda x: x.str.lstrip("\n1le")) + self.check_func(lambda x: x.str.lstrip("s")) def test_string_rstrip(self): self.check_func(lambda x: x.str.rstrip()) - self.check_func(lambda x: x.str.rstrip('\t ec')) - self.check_func(lambda x: x.str.rstrip('0')) + self.check_func(lambda x: x.str.rstrip("\t ec")) + self.check_func(lambda x: x.str.rstrip("0")) def test_string_get(self): self.check_func(lambda x: x.str.get(6)) @@ -147,84 +153,84 @@ def 
test_string_cat(self): def test_string_center(self): self.check_func(lambda x: x.str.center(0)) self.check_func(lambda x: x.str.center(10)) - self.check_func(lambda x: x.str.center(10, 'x')) + self.check_func(lambda x: x.str.center(10, "x")) def test_string_contains(self): - self.check_func(lambda x: x.str.contains('le', regex=False)) - self.check_func(lambda x: x.str.contains('White', case=True, regex=False)) - self.check_func(lambda x: x.str.contains('apples|carrots', regex=True)) - self.check_func(lambda x: x.str.contains('BANANAS', flags=re.IGNORECASE, na=False)) + self.check_func(lambda x: x.str.contains("le", regex=False)) + self.check_func(lambda x: x.str.contains("White", case=True, regex=False)) + self.check_func(lambda x: x.str.contains("apples|carrots", regex=True)) + self.check_func(lambda x: x.str.contains("BANANAS", flags=re.IGNORECASE, na=False)) def test_string_count(self): - self.check_func(lambda x: x.str.count('wh|Wh')) - self.check_func(lambda x: x.str.count('WH', flags=re.IGNORECASE)) + self.check_func(lambda x: x.str.count("wh|Wh")) + self.check_func(lambda x: x.str.count("WH", flags=re.IGNORECASE)) def test_string_decode(self): kser = ks.from_pandas(self.pser) with self.assertRaises(NotImplementedError): - kser.str.decode('utf-8') + kser.str.decode("utf-8") def test_string_encode(self): kser = ks.from_pandas(self.pser) with self.assertRaises(NotImplementedError): - kser.str.encode('utf-8') + kser.str.encode("utf-8") def test_string_extract(self): kser = ks.from_pandas(self.pser) with self.assertRaises(NotImplementedError): - kser.str.extract('pat') + kser.str.extract("pat") def test_string_extractall(self): kser = ks.from_pandas(self.pser) with self.assertRaises(NotImplementedError): - kser.str.extractall('pat') + kser.str.extractall("pat") def test_string_find(self): - self.check_func(lambda x: x.str.find('a')) - self.check_func(lambda x: x.str.find('a', start=3)) - self.check_func(lambda x: x.str.find('a', start=0, end=1)) + self.check_func(lambda x: x.str.find("a")) + self.check_func(lambda x: x.str.find("a", start=3)) + self.check_func(lambda x: x.str.find("a", start=0, end=1)) def test_string_findall(self): - self.check_func(lambda x: x.str.findall('es|as')) - self.check_func(lambda x: x.str.findall('wh.*', flags=re.IGNORECASE)) + self.check_func(lambda x: x.str.findall("es|as")) + self.check_func(lambda x: x.str.findall("wh.*", flags=re.IGNORECASE)) def test_string_index(self): - pser = pd.Series(['tea', 'eat']) - self.check_func_on_series(lambda x: x.str.index('ea'), pser) + pser = pd.Series(["tea", "eat"]) + self.check_func_on_series(lambda x: x.str.index("ea"), pser) with self.assertRaises(Exception): - self.check_func_on_series(lambda x: x.str.index('ea', start=0, end=2), pser) + self.check_func_on_series(lambda x: x.str.index("ea", start=0, end=2), pser) with self.assertRaises(Exception): - self.check_func(lambda x: x.str.index('not-found')) + self.check_func(lambda x: x.str.index("not-found")) def test_string_join(self): - pser = pd.Series([['a', 'b', 'c'], ['xx', 'yy', 'zz']]) + pser = pd.Series([["a", "b", "c"], ["xx", "yy", "zz"]]) self.check_func_on_series(lambda x: x.str.join("-"), pser) self.check_func(lambda x: x.str.join("-")) def test_string_len(self): self.check_func(lambda x: x.str.len()) - pser = pd.Series([['a', 'b', 'c'], ['xx'], []]) + pser = pd.Series([["a", "b", "c"], ["xx"], []]) self.check_func_on_series(lambda x: x.str.len(), pser) def test_string_ljust(self): self.check_func(lambda x: x.str.ljust(0)) self.check_func(lambda x: 
x.str.ljust(10)) - self.check_func(lambda x: x.str.ljust(30, 'x')) + self.check_func(lambda x: x.str.ljust(30, "x")) def test_string_match(self): - self.check_func(lambda x: x.str.match('in')) - self.check_func(lambda x: x.str.match('apples|carrots', na=False)) - self.check_func(lambda x: x.str.match('White', case=True)) - self.check_func(lambda x: x.str.match('BANANAS', flags=re.IGNORECASE)) + self.check_func(lambda x: x.str.match("in")) + self.check_func(lambda x: x.str.match("apples|carrots", na=False)) + self.check_func(lambda x: x.str.match("White", case=True)) + self.check_func(lambda x: x.str.match("BANANAS", flags=re.IGNORECASE)) def test_string_normalize(self): - self.check_func(lambda x: x.str.normalize('NFC')) - self.check_func(lambda x: x.str.normalize('NFKD')) + self.check_func(lambda x: x.str.normalize("NFC")) + self.check_func(lambda x: x.str.normalize("NFKD")) def test_string_pad(self): self.check_func(lambda x: x.str.pad(10)) - self.check_func(lambda x: x.str.pad(10, side='both')) - self.check_func(lambda x: x.str.pad(10, side='right', fillchar='-')) + self.check_func(lambda x: x.str.pad(10, side="both")) + self.check_func(lambda x: x.str.pad(10, side="right", fillchar="-")) def test_string_partition(self): with self.assertRaises(NotImplementedError): @@ -233,38 +239,36 @@ def test_string_partition(self): def test_string_repeat(self): self.check_func(lambda x: x.str.repeat(repeats=3)) with self.assertRaises(ValueError): - self.check_func( - lambda x: x.str.repeat(repeats=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) - ) + self.check_func(lambda x: x.str.repeat(repeats=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) def test_string_replace(self): - self.check_func(lambda x: x.str.replace('a.', 'xx', regex=True)) - self.check_func(lambda x: x.str.replace('a.', 'xx', regex=False)) - self.check_func(lambda x: x.str.replace('ing', '0', flags=re.IGNORECASE)) + self.check_func(lambda x: x.str.replace("a.", "xx", regex=True)) + self.check_func(lambda x: x.str.replace("a.", "xx", regex=False)) + self.check_func(lambda x: x.str.replace("ing", "0", flags=re.IGNORECASE)) # reverse every lowercase word repl = lambda m: m.group(0)[::-1] - self.check_func(lambda x: x.str.replace(r'[a-z]+', repl)) + self.check_func(lambda x: x.str.replace(r"[a-z]+", repl)) # compiled regex with flags - regex_pat = re.compile(r'WHITESPACE', flags=re.IGNORECASE) - self.check_func(lambda x: x.str.replace(regex_pat, '---')) + regex_pat = re.compile(r"WHITESPACE", flags=re.IGNORECASE) + self.check_func(lambda x: x.str.replace(regex_pat, "---")) def test_string_rfind(self): - self.check_func(lambda x: x.str.rfind('a')) - self.check_func(lambda x: x.str.rfind('a', start=3)) - self.check_func(lambda x: x.str.rfind('a', start=0, end=1)) + self.check_func(lambda x: x.str.rfind("a")) + self.check_func(lambda x: x.str.rfind("a", start=3)) + self.check_func(lambda x: x.str.rfind("a", start=0, end=1)) def test_string_rindex(self): - pser = pd.Series(['teatea', 'eateat']) - self.check_func_on_series(lambda x: x.str.rindex('ea'), pser) + pser = pd.Series(["teatea", "eateat"]) + self.check_func_on_series(lambda x: x.str.rindex("ea"), pser) with self.assertRaises(Exception): - self.check_func_on_series(lambda x: x.str.rindex('ea', start=0, end=2), pser) + self.check_func_on_series(lambda x: x.str.rindex("ea", start=0, end=2), pser) with self.assertRaises(Exception): - self.check_func(lambda x: x.str.rindex('not-found')) + self.check_func(lambda x: x.str.rindex("not-found")) def test_string_rjust(self): self.check_func(lambda x: x.str.rjust(0)) 
self.check_func(lambda x: x.str.rjust(10)) - self.check_func(lambda x: x.str.rjust(30, 'x')) + self.check_func(lambda x: x.str.rjust(30, "x")) def test_string_rpartition(self): with self.assertRaises(NotImplementedError): @@ -277,30 +281,30 @@ def test_string_slice(self): self.check_func(lambda x: x.str.slice(start=0, stop=5, step=3)) def test_string_slice_replace(self): - self.check_func(lambda x: x.str.slice_replace(1, repl='X')) - self.check_func(lambda x: x.str.slice_replace(stop=2, repl='X')) - self.check_func(lambda x: x.str.slice_replace(start=1, stop=3, repl='X')) + self.check_func(lambda x: x.str.slice_replace(1, repl="X")) + self.check_func(lambda x: x.str.slice_replace(stop=2, repl="X")) + self.check_func(lambda x: x.str.slice_replace(start=1, stop=3, repl="X")) def test_string_split(self): self.check_func(lambda x: x.str.split()) - self.check_func(lambda x: x.str.split(r'p*')) - pser = pd.Series(['This is a sentence.', 'This-is-a-long-word.']) + self.check_func(lambda x: x.str.split(r"p*")) + pser = pd.Series(["This is a sentence.", "This-is-a-long-word."]) self.check_func_on_series(lambda x: x.str.split(n=2), pser) - self.check_func_on_series(lambda x: x.str.split(pat='-', n=2), pser) + self.check_func_on_series(lambda x: x.str.split(pat="-", n=2), pser) with self.assertRaises(NotImplementedError): self.check_func(lambda x: x.str.split(expand=True)) def test_string_rsplit(self): self.check_func(lambda x: x.str.rsplit()) - self.check_func(lambda x: x.str.rsplit(r'p*')) - pser = pd.Series(['This is a sentence.', 'This-is-a-long-word.']) + self.check_func(lambda x: x.str.rsplit(r"p*")) + pser = pd.Series(["This is a sentence.", "This-is-a-long-word."]) self.check_func_on_series(lambda x: x.str.rsplit(n=2), pser) - self.check_func_on_series(lambda x: x.str.rsplit(pat='-', n=2), pser) + self.check_func_on_series(lambda x: x.str.rsplit(pat="-", n=2), pser) with self.assertRaises(NotImplementedError): self.check_func(lambda x: x.str.rsplit(expand=True)) def test_string_translate(self): - m = str.maketrans({'a': 'X', 'e': 'Y', 'i': None}) + m = str.maketrans({"a": "X", "e": "Y", "i": None}) self.check_func(lambda x: x.str.translate(m)) def test_string_wrap(self): diff --git a/databricks/koalas/tests/test_sql.py b/databricks/koalas/tests/test_sql.py index 3ffe79f..57ed8b2 100644 --- a/databricks/koalas/tests/test_sql.py +++ b/databricks/koalas/tests/test_sql.py @@ -21,18 +21,17 @@ class SQLTest(ReusedSQLTestCase, SQLTestUtils): - def test_error_variable_not_exist(self): - msg = 'The key variable_foo in the SQL statement was not found.*' + msg = "The key variable_foo in the SQL statement was not found.*" with self.assertRaisesRegex(ValueError, msg): - ks.sql('select * from {variable_foo}') + ks.sql("select * from {variable_foo}") def test_error_unsupported_type(self): msg = "Unsupported variable type : {'a': 1}" with self.assertRaisesRegex(ValueError, msg): - some_dict = {'a': 1} - ks.sql('select * from {some_dict}') + some_dict = {"a": 1} + ks.sql("select * from {some_dict}") def test_error_bad_sql(self): with self.assertRaises(ParseException): - ks.sql('this is not valid sql') + ks.sql("this is not valid sql") diff --git a/databricks/koalas/tests/test_stats.py b/databricks/koalas/tests/test_stats.py index fd4eb04..9314a6b 100644 --- a/databricks/koalas/tests/test_stats.py +++ b/databricks/koalas/tests/test_stats.py @@ -25,47 +25,48 @@ class StatsTest(ReusedSQLTestCase, SQLTestUtils): - def _test_stat_functions(self, pdf, kdf): - functions = ['max', 'min', 'mean', 'sum'] + functions = 
["max", "min", "mean", "sum"] for funcname in functions: self.assert_eq(getattr(kdf.A, funcname)(), getattr(pdf.A, funcname)()) self.assert_eq(getattr(kdf, funcname)(), getattr(pdf, funcname)()) - functions = ['std', 'var'] + functions = ["std", "var"] for funcname in functions: self.assert_eq(getattr(kdf.A, funcname)(), getattr(pdf.A, funcname)(), almost=True) self.assert_eq(getattr(kdf, funcname)(), getattr(pdf, funcname)(), almost=True) # NOTE: To test skew, kurt, and median, just make sure they run. # The numbers are different in spark and pandas. - functions = ['skew', 'kurt', 'median'] + functions = ["skew", "kurt", "median"] for funcname in functions: getattr(kdf.A, funcname)() getattr(kdf, funcname)() def test_stat_functions(self): - pdf = pd.DataFrame({'A': [1, 2, 3, 4], - 'B': [1.0, 2.1, 3, 4]}) + pdf = pd.DataFrame({"A": [1, 2, 3, 4], "B": [1.0, 2.1, 3, 4]}) kdf = ks.from_pandas(pdf) self._test_stat_functions(pdf, kdf) def test_stat_functions_multiindex_column(self): - arrays = [np.array(['A', 'A', 'B', 'B']), - np.array(['one', 'two', 'one', 'two'])] - pdf = pd.DataFrame(np.random.randn(3, 4), index=['A', 'B', 'C'], columns=arrays) + arrays = [np.array(["A", "A", "B", "B"]), np.array(["one", "two", "one", "two"])] + pdf = pd.DataFrame(np.random.randn(3, 4), index=["A", "B", "C"], columns=arrays) kdf = ks.from_pandas(pdf) self._test_stat_functions(pdf, kdf) def test_abs(self): - pdf = pd.DataFrame({'A': [1, -2, 3, -4, 5], - 'B': [1., -2, 3, -4, 5], - 'C': [-6., -7, -8, -9, 10], - 'D': ['a', 'b', 'c', 'd', 'e']}) + pdf = pd.DataFrame( + { + "A": [1, -2, 3, -4, 5], + "B": [1.0, -2, 3, -4, 5], + "C": [-6.0, -7, -8, -9, 10], + "D": ["a", "b", "c", "d", "e"], + } + ) kdf = ks.from_pandas(pdf) self.assert_eq(kdf.A.abs(), pdf.A.abs()) self.assert_eq(kdf.B.abs(), pdf.B.abs()) - self.assert_eq(kdf[['B', 'C']].abs(), pdf[['B', 'C']].abs()) + self.assert_eq(kdf[["B", "C"]].abs(), pdf[["B", "C"]].abs()) # self.assert_eq(kdf.select('A', 'B').abs(), pdf[['A', 'B']].abs()) def test_axis_on_dataframe(self): @@ -74,11 +75,15 @@ def test_axis_on_dataframe(self): # Less than 'compute.shortcut_limit' will execute a shortcut # by using collected pandas dataframe directly. # now we set the 'compute.shortcut_limit' as 1000 explicitly - with option_context('compute.shortcut_limit', 1000): - pdf = pd.DataFrame({'A': [1, -2, 3, -4, 5] * 300, - 'B': [1., -2, 3, -4, 5] * 300, - 'C': [-6., -7, -8, -9, 10] * 300, - 'D': [True, False, True, False, False] * 300}) + with option_context("compute.shortcut_limit", 1000): + pdf = pd.DataFrame( + { + "A": [1, -2, 3, -4, 5] * 300, + "B": [1.0, -2, 3, -4, 5] * 300, + "C": [-6.0, -7, -8, -9, 10] * 300, + "D": [True, False, True, False, False] * 300, + } + ) kdf = ks.from_pandas(pdf) self.assert_eq(kdf.count(axis=1), pdf.count(axis=1)) self.assert_eq(kdf.var(axis=1), pdf.var(axis=1)) @@ -93,7 +98,7 @@ def test_axis_on_dataframe(self): def test_corr(self): # Disable arrow execution since corr() is using UDT internally which is not supported. 
-        with self.sql_conf({'spark.sql.execution.arrow.enabled': False}):
+        with self.sql_conf({"spark.sql.execution.arrow.enabled": False}):
             # DataFrame
             # we do not handle NaNs for now
             if LooseVersion(pd.__version__) >= LooseVersion("1.0.0"):
@@ -114,38 +119,41 @@ def test_corr(self):
             self.assertRaises(TypeError, lambda: kser_a.corr(kdf))
 
             # multi-index columns
-            columns = pd.MultiIndex.from_tuples([('X', 'A'), ('X', 'B'), ('Y', 'C'), ('Z', 'D')])
+            columns = pd.MultiIndex.from_tuples([("X", "A"), ("X", "B"), ("Y", "C"), ("Z", "D")])
             pdf.columns = columns
             kdf.columns = columns
             self.assert_eq(kdf.corr(), pdf.corr(), almost=True)
 
             # Series
-            pser_xa = pdf[('X', 'A')]
-            pser_xb = pdf[('X', 'B')]
-            kser_xa = kdf[('X', 'A')]
-            kser_xb = kdf[('X', 'B')]
+            pser_xa = pdf[("X", "A")]
+            pser_xb = pdf[("X", "B")]
+            kser_xa = kdf[("X", "A")]
+            kser_xb = kdf[("X", "B")]
             self.assertAlmostEqual(kser_xa.corr(kser_xb), pser_xa.corr(pser_xb))
 
     def test_cov_corr_meta(self):
         # Disable arrow execution since corr() is using UDT internally which is not supported.
-        with self.sql_conf({'spark.sql.execution.arrow.enabled': False}):
-            pdf = pd.DataFrame({'a': np.array([1, 2, 3], dtype='i1'),
-                                'b': np.array([1, 2, 3], dtype='i2'),
-                                'c': np.array([1, 2, 3], dtype='i4'),
-                                'd': np.array([1, 2, 3]),
-                                'e': np.array([1.0, 2.0, 3.0], dtype='f4'),
-                                'f': np.array([1.0, 2.0, 3.0]),
-                                'g': np.array([True, False, True]),
-                                'h': np.array(list('abc'))},
-                               index=pd.Index([1, 2, 3], name='myindex'))
+        with self.sql_conf({"spark.sql.execution.arrow.enabled": False}):
+            pdf = pd.DataFrame(
+                {
+                    "a": np.array([1, 2, 3], dtype="i1"),
+                    "b": np.array([1, 2, 3], dtype="i2"),
+                    "c": np.array([1, 2, 3], dtype="i4"),
+                    "d": np.array([1, 2, 3]),
+                    "e": np.array([1.0, 2.0, 3.0], dtype="f4"),
+                    "f": np.array([1.0, 2.0, 3.0]),
+                    "g": np.array([True, False, True]),
+                    "h": np.array(list("abc")),
+                },
+                index=pd.Index([1, 2, 3], name="myindex"),
+            )
             kdf = ks.from_pandas(pdf)
             self.assert_eq(kdf.corr(), pdf.corr())
 
     def test_stats_on_boolean_dataframe(self):
-        pdf = pd.DataFrame({'A': [True, False, True],
-                            'B': [False, False, True]})
+        pdf = pd.DataFrame({"A": [True, False, True], "B": [False, False, True]})
         kdf = ks.from_pandas(pdf)
 
         pd.testing.assert_series_equal(kdf.min(), pdf.min())
@@ -171,9 +179,7 @@ def test_stats_on_boolean_series(self):
         self.assertAlmostEqual(kser.std(), pser.std())
 
     def test_some_stats_functions_should_discard_non_numeric_columns_by_default(self):
-        pdf = pd.DataFrame({'i': [0, 1, 2],
-                            'b': [False, False, True],
-                            's': ['x', 'y', 'z']})
+        pdf = pd.DataFrame({"i": [0, 1, 2], "b": [False, False, True], "s": ["x", "y", "z"]})
         kdf = ks.from_pandas(pdf)
 
         # min and max do not discard non-numeric columns by default
@@ -192,9 +198,7 @@ def test_some_stats_functions_should_discard_non_numeric_columns_by_default(self
         self.assertEqual(len(kdf.skew()), len(pdf.skew()))
 
     def test_stats_on_non_numeric_columns_should_be_discarded_if_numeric_only_is_true(self):
-        pdf = pd.DataFrame({'i': [0, 1, 2],
-                            'b': [False, False, True],
-                            's': ['x', 'y', 'z']})
+        pdf = pd.DataFrame({"i": [0, 1, 2], "b": [False, False, True], "s": ["x", "y", "z"]})
         kdf = ks.from_pandas(pdf)
 
         self.assertEqual(len(kdf.sum(numeric_only=True)), len(pdf.sum(numeric_only=True)))
@@ -207,9 +211,7 @@ def test_stats_on_non_numeric_columns_should_be_discarded_if_numeric_only_is_tru
         self.assertEqual(len(kdf.skew(numeric_only=True)), len(pdf.skew(numeric_only=True)))
 
     def test_stats_on_non_numeric_columns_should_not_be_discarded_if_numeric_only_is_false(self):
-        pdf = pd.DataFrame({'i': [0, 1, 2],
-                            'b': [False, False, True],
-                            's': ['x', 'y', 'z']})
+        pdf = pd.DataFrame({"i": [0, 1, 2], "b": [False, False, True], "s": ["x", "y", "z"]})
         kdf = ks.from_pandas(pdf)
 
         # the lengths are the same, but the results are different.
diff --git a/databricks/koalas/tests/test_utils.py b/databricks/koalas/tests/test_utils.py
index c75182c..cad3f96 100644
--- a/databricks/koalas/tests/test_utils.py
+++ b/databricks/koalas/tests/test_utils.py
@@ -17,8 +17,11 @@
 import pandas as pd
 
 from databricks.koalas.testing.utils import ReusedSQLTestCase, SQLTestUtils
-from databricks.koalas.utils import (lazy_property, validate_arguments_and_invoke_function,
-                                     validate_bool_kwarg)
+from databricks.koalas.utils import (
+    lazy_property,
+    validate_arguments_and_invoke_function,
+    validate_bool_kwarg,
+)
 
 some_global_variable = 0
 
@@ -30,21 +33,16 @@ class UtilsTest(ReusedSQLTestCase, SQLTestUtils):
 
     def to_html(self, max_rows=None, unsupported_param=None):
         args = locals()
-        pdf = pd.DataFrame({
-            'a': [1, 2, 3],
-            'b': [4, 5, 6],
-        }, index=[0, 1, 3])
+        pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6],}, index=[0, 1, 3])
         validate_arguments_and_invoke_function(pdf, self.to_html, pd.DataFrame.to_html, args)
 
-    def to_clipboard(self, sep=',', **kwargs):
+    def to_clipboard(self, sep=",", **kwargs):
         args = locals()
-        pdf = pd.DataFrame({
-            'a': [1, 2, 3],
-            'b': [4, 5, 6],
-        }, index=[0, 1, 3])
+        pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6],}, index=[0, 1, 3])
 
-        validate_arguments_and_invoke_function(pdf, self.to_clipboard,
-                                               pd.DataFrame.to_clipboard, args)
+        validate_arguments_and_invoke_function(
+            pdf, self.to_clipboard, pd.DataFrame.to_clipboard, args
+        )
 
     def test_validate_arguments_and_invoke_function(self):
         # This should pass and run fine
@@ -58,7 +56,7 @@ def test_validate_arguments_and_invoke_function(self):
         self.to_html(unsupported_param=1)
 
         # Support for **kwargs
-        self.to_clipboard(sep=',', index=False)
+        self.to_clipboard(sep=",", index=False)
 
     def test_lazy_property(self):
         obj = TestClassForLazyProp()
@@ -78,13 +76,12 @@ def test_validate_bool_kwarg(self):
         # This should fail because we are explicitly setting a non-boolean value
         koalas = "true"
         with self.assertRaisesRegex(
-                ValueError,
-                'For argument "koalas" expected type bool, received type str.'):
+            ValueError, 'For argument "koalas" expected type bool, received type str.'
+ ): validate_bool_kwarg(koalas, "koalas") class TestClassForLazyProp: - def __init__(self): self.some_variable = 0 diff --git a/databricks/koalas/tests/test_window.py b/databricks/koalas/tests/test_window.py index cd2120d..b1b2019 100644 --- a/databricks/koalas/tests/test_window.py +++ b/databricks/koalas/tests/test_window.py @@ -18,233 +18,286 @@ from databricks import koalas as ks from databricks.koalas.exceptions import PandasNotImplementedError -from databricks.koalas.missing.window import _MissingPandasLikeExpanding, \ - _MissingPandasLikeRolling, _MissingPandasLikeExpandingGroupby, \ - _MissingPandasLikeRollingGroupby +from databricks.koalas.missing.window import ( + _MissingPandasLikeExpanding, + _MissingPandasLikeRolling, + _MissingPandasLikeExpandingGroupby, + _MissingPandasLikeRollingGroupby, +) from databricks.koalas.testing.utils import ReusedSQLTestCase, TestUtils class ExpandingRollingTest(ReusedSQLTestCase, TestUtils): def test_missing(self): - kdf = ks.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + kdf = ks.DataFrame({"a": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) # Expanding functions - missing_functions = inspect.getmembers(_MissingPandasLikeExpanding, - inspect.isfunction) - unsupported_functions = [name for (name, type_) in missing_functions - if type_.__name__ == 'unsupported_function'] + missing_functions = inspect.getmembers(_MissingPandasLikeExpanding, inspect.isfunction) + unsupported_functions = [ + name for (name, type_) in missing_functions if type_.__name__ == "unsupported_function" + ] for name in unsupported_functions: with self.assertRaisesRegex( - PandasNotImplementedError, - "method.*Expanding.*{}.*not implemented( yet\\.|\\. .+)".format(name)): + PandasNotImplementedError, + "method.*Expanding.*{}.*not implemented( yet\\.|\\. .+)".format(name), + ): getattr(kdf.expanding(1), name)() # Frame with self.assertRaisesRegex( - PandasNotImplementedError, - "method.*Expanding.*{}.*not implemented( yet\\.|\\. .+)".format(name)): + PandasNotImplementedError, + "method.*Expanding.*{}.*not implemented( yet\\.|\\. .+)".format(name), + ): getattr(kdf.a.expanding(1), name)() # Series - deprecated_functions = [name for (name, type_) in missing_functions - if type_.__name__ == 'deprecated_function'] + deprecated_functions = [ + name for (name, type_) in missing_functions if type_.__name__ == "deprecated_function" + ] for name in deprecated_functions: - with self.assertRaisesRegex(PandasNotImplementedError, - "method.*Expanding.*{}.*is deprecated" - .format(name)): + with self.assertRaisesRegex( + PandasNotImplementedError, "method.*Expanding.*{}.*is deprecated".format(name) + ): getattr(kdf.expanding(1), name)() # Frame - with self.assertRaisesRegex(PandasNotImplementedError, - "method.*Expanding.*{}.*is deprecated" - .format(name)): + with self.assertRaisesRegex( + PandasNotImplementedError, "method.*Expanding.*{}.*is deprecated".format(name) + ): getattr(kdf.a.expanding(1), name)() # Series # Rolling functions - missing_functions = inspect.getmembers(_MissingPandasLikeRolling, - inspect.isfunction) - unsupported_functions = [name for (name, type_) in missing_functions - if type_.__name__ == 'unsupported_function'] + missing_functions = inspect.getmembers(_MissingPandasLikeRolling, inspect.isfunction) + unsupported_functions = [ + name for (name, type_) in missing_functions if type_.__name__ == "unsupported_function" + ] for name in unsupported_functions: with self.assertRaisesRegex( - PandasNotImplementedError, - "method.*Rolling.*{}.*not implemented( yet\\.|\\. 
.+)".format(name)): + PandasNotImplementedError, + "method.*Rolling.*{}.*not implemented( yet\\.|\\. .+)".format(name), + ): getattr(kdf.rolling(1), name)() # Frame with self.assertRaisesRegex( - PandasNotImplementedError, - "method.*Rolling.*{}.*not implemented( yet\\.|\\. .+)".format(name)): + PandasNotImplementedError, + "method.*Rolling.*{}.*not implemented( yet\\.|\\. .+)".format(name), + ): getattr(kdf.a.rolling(1), name)() # Series - deprecated_functions = [name for (name, type_) in missing_functions - if type_.__name__ == 'deprecated_function'] + deprecated_functions = [ + name for (name, type_) in missing_functions if type_.__name__ == "deprecated_function" + ] for name in deprecated_functions: - with self.assertRaisesRegex(PandasNotImplementedError, - "method.*Rolling.*{}.*is deprecated" - .format(name)): + with self.assertRaisesRegex( + PandasNotImplementedError, "method.*Rolling.*{}.*is deprecated".format(name) + ): getattr(kdf.rolling(1), name)() # Frame - with self.assertRaisesRegex(PandasNotImplementedError, - "method.*Rolling.*{}.*is deprecated" - .format(name)): + with self.assertRaisesRegex( + PandasNotImplementedError, "method.*Rolling.*{}.*is deprecated".format(name) + ): getattr(kdf.a.rolling(1), name)() # Series # Expanding properties - missing_properties = inspect.getmembers(_MissingPandasLikeExpanding, - lambda o: isinstance(o, property)) - unsupported_properties = [name for (name, type_) in missing_properties - if type_.fget.__name__ == 'unsupported_property'] + missing_properties = inspect.getmembers( + _MissingPandasLikeExpanding, lambda o: isinstance(o, property) + ) + unsupported_properties = [ + name + for (name, type_) in missing_properties + if type_.fget.__name__ == "unsupported_property" + ] for name in unsupported_properties: with self.assertRaisesRegex( - PandasNotImplementedError, - "property.*Expanding.*{}.*not implemented( yet\\.|\\. .+)".format(name)): + PandasNotImplementedError, + "property.*Expanding.*{}.*not implemented( yet\\.|\\. .+)".format(name), + ): getattr(kdf.expanding(1), name) # Frame with self.assertRaisesRegex( - PandasNotImplementedError, - "property.*Expanding.*{}.*not implemented( yet\\.|\\. .+)".format(name)): + PandasNotImplementedError, + "property.*Expanding.*{}.*not implemented( yet\\.|\\. 
.+)".format(name), + ): getattr(kdf.a.expanding(1), name) # Series - deprecated_properties = [name for (name, type_) in missing_properties - if type_.fget.__name__ == 'deprecated_property'] + deprecated_properties = [ + name + for (name, type_) in missing_properties + if type_.fget.__name__ == "deprecated_property" + ] for name in deprecated_properties: - with self.assertRaisesRegex(PandasNotImplementedError, - "property.*Expanding.*{}.*is deprecated" - .format(name)): + with self.assertRaisesRegex( + PandasNotImplementedError, "property.*Expanding.*{}.*is deprecated".format(name) + ): getattr(kdf.expanding(1), name) # Frame - with self.assertRaisesRegex(PandasNotImplementedError, - "property.*Expanding.*{}.*is deprecated" - .format(name)): + with self.assertRaisesRegex( + PandasNotImplementedError, "property.*Expanding.*{}.*is deprecated".format(name) + ): getattr(kdf.a.expanding(1), name) # Series # Rolling properties - missing_properties = inspect.getmembers(_MissingPandasLikeRolling, - lambda o: isinstance(o, property)) - unsupported_properties = [name for (name, type_) in missing_properties - if type_.fget.__name__ == 'unsupported_property'] + missing_properties = inspect.getmembers( + _MissingPandasLikeRolling, lambda o: isinstance(o, property) + ) + unsupported_properties = [ + name + for (name, type_) in missing_properties + if type_.fget.__name__ == "unsupported_property" + ] for name in unsupported_properties: with self.assertRaisesRegex( - PandasNotImplementedError, - "property.*Rolling.*{}.*not implemented( yet\\.|\\. .+)".format(name)): + PandasNotImplementedError, + "property.*Rolling.*{}.*not implemented( yet\\.|\\. .+)".format(name), + ): getattr(kdf.rolling(1), name)() # Frame with self.assertRaisesRegex( - PandasNotImplementedError, - "property.*Rolling.*{}.*not implemented( yet\\.|\\. .+)".format(name)): + PandasNotImplementedError, + "property.*Rolling.*{}.*not implemented( yet\\.|\\. 
.+)".format(name), + ): getattr(kdf.a.rolling(1), name)() # Series - deprecated_properties = [name for (name, type_) in missing_properties - if type_.fget.__name__ == 'deprecated_property'] + deprecated_properties = [ + name + for (name, type_) in missing_properties + if type_.fget.__name__ == "deprecated_property" + ] for name in deprecated_properties: - with self.assertRaisesRegex(PandasNotImplementedError, - "property.*Rolling.*{}.*is deprecated" - .format(name)): + with self.assertRaisesRegex( + PandasNotImplementedError, "property.*Rolling.*{}.*is deprecated".format(name) + ): getattr(kdf.rolling(1), name)() # Frame - with self.assertRaisesRegex(PandasNotImplementedError, - "property.*Rolling.*{}.*is deprecated" - .format(name)): + with self.assertRaisesRegex( + PandasNotImplementedError, "property.*Rolling.*{}.*is deprecated".format(name) + ): getattr(kdf.a.rolling(1), name)() # Series def test_missing_groupby(self): - kdf = ks.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + kdf = ks.DataFrame({"a": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) # Expanding functions - missing_functions = inspect.getmembers(_MissingPandasLikeExpandingGroupby, - inspect.isfunction) - unsupported_functions = [name for (name, type_) in missing_functions - if type_.__name__ == 'unsupported_function'] + missing_functions = inspect.getmembers( + _MissingPandasLikeExpandingGroupby, inspect.isfunction + ) + unsupported_functions = [ + name for (name, type_) in missing_functions if type_.__name__ == "unsupported_function" + ] for name in unsupported_functions: with self.assertRaisesRegex( - PandasNotImplementedError, - "method.*Expanding.*{}.*not implemented( yet\\.|\\. .+)".format(name)): + PandasNotImplementedError, + "method.*Expanding.*{}.*not implemented( yet\\.|\\. .+)".format(name), + ): getattr(kdf.groupby("a").expanding(1), name)() # Frame with self.assertRaisesRegex( - PandasNotImplementedError, - "method.*Expanding.*{}.*not implemented( yet\\.|\\. .+)".format(name)): + PandasNotImplementedError, + "method.*Expanding.*{}.*not implemented( yet\\.|\\. 
.+)".format(name), + ): getattr(kdf.a.groupby(kdf.a).expanding(1), name)() # Series - deprecated_functions = [name for (name, type_) in missing_functions - if type_.__name__ == 'deprecated_function'] + deprecated_functions = [ + name for (name, type_) in missing_functions if type_.__name__ == "deprecated_function" + ] for name in deprecated_functions: - with self.assertRaisesRegex(PandasNotImplementedError, - "method.*Expanding.*{}.*is deprecated" - .format(name)): + with self.assertRaisesRegex( + PandasNotImplementedError, "method.*Expanding.*{}.*is deprecated".format(name) + ): getattr(kdf.groupby("a").expanding(1), name)() # Frame - with self.assertRaisesRegex(PandasNotImplementedError, - "method.*Expanding.*{}.*is deprecated" - .format(name)): + with self.assertRaisesRegex( + PandasNotImplementedError, "method.*Expanding.*{}.*is deprecated".format(name) + ): getattr(kdf.a.groupby(kdf.a).expanding(1), name)() # Series # Rolling functions - missing_functions = inspect.getmembers(_MissingPandasLikeRollingGroupby, - inspect.isfunction) - unsupported_functions = [name for (name, type_) in missing_functions - if type_.__name__ == 'unsupported_function'] + missing_functions = inspect.getmembers(_MissingPandasLikeRollingGroupby, inspect.isfunction) + unsupported_functions = [ + name for (name, type_) in missing_functions if type_.__name__ == "unsupported_function" + ] for name in unsupported_functions: with self.assertRaisesRegex( - PandasNotImplementedError, - "method.*Rolling.*{}.*not implemented( yet\\.|\\. .+)".format(name)): + PandasNotImplementedError, + "method.*Rolling.*{}.*not implemented( yet\\.|\\. .+)".format(name), + ): getattr(kdf.groupby("a").rolling(1), name)() # Frame with self.assertRaisesRegex( - PandasNotImplementedError, - "method.*Rolling.*{}.*not implemented( yet\\.|\\. .+)".format(name)): + PandasNotImplementedError, + "method.*Rolling.*{}.*not implemented( yet\\.|\\. .+)".format(name), + ): getattr(kdf.a.groupby(kdf.a).rolling(1), name)() # Series - deprecated_functions = [name for (name, type_) in missing_functions - if type_.__name__ == 'deprecated_function'] + deprecated_functions = [ + name for (name, type_) in missing_functions if type_.__name__ == "deprecated_function" + ] for name in deprecated_functions: - with self.assertRaisesRegex(PandasNotImplementedError, - "method.*Rolling.*{}.*is deprecated" - .format(name)): + with self.assertRaisesRegex( + PandasNotImplementedError, "method.*Rolling.*{}.*is deprecated".format(name) + ): getattr(kdf.rolling(1), name)() # Frame - with self.assertRaisesRegex(PandasNotImplementedError, - "method.*Rolling.*{}.*is deprecated" - .format(name)): + with self.assertRaisesRegex( + PandasNotImplementedError, "method.*Rolling.*{}.*is deprecated".format(name) + ): getattr(kdf.a.rolling(1), name)() # Series # Expanding properties - missing_properties = inspect.getmembers(_MissingPandasLikeExpandingGroupby, - lambda o: isinstance(o, property)) - unsupported_properties = [name for (name, type_) in missing_properties - if type_.fget.__name__ == 'unsupported_property'] + missing_properties = inspect.getmembers( + _MissingPandasLikeExpandingGroupby, lambda o: isinstance(o, property) + ) + unsupported_properties = [ + name + for (name, type_) in missing_properties + if type_.fget.__name__ == "unsupported_property" + ] for name in unsupported_properties: with self.assertRaisesRegex( - PandasNotImplementedError, - "property.*Expanding.*{}.*not implemented( yet\\.|\\. 
.+)".format(name)): + PandasNotImplementedError, + "property.*Expanding.*{}.*not implemented( yet\\.|\\. .+)".format(name), + ): getattr(kdf.groupby("a").expanding(1), name)() # Frame with self.assertRaisesRegex( - PandasNotImplementedError, - "property.*Expanding.*{}.*not implemented( yet\\.|\\. .+)".format(name)): + PandasNotImplementedError, + "property.*Expanding.*{}.*not implemented( yet\\.|\\. .+)".format(name), + ): getattr(kdf.a.groupby(kdf.a).expanding(1), name)() # Series - deprecated_properties = [name for (name, type_) in missing_properties - if type_.fget.__name__ == 'deprecated_property'] + deprecated_properties = [ + name + for (name, type_) in missing_properties + if type_.fget.__name__ == "deprecated_property" + ] for name in deprecated_properties: - with self.assertRaisesRegex(PandasNotImplementedError, - "property.*Expanding.*{}.*is deprecated" - .format(name)): + with self.assertRaisesRegex( + PandasNotImplementedError, "property.*Expanding.*{}.*is deprecated".format(name) + ): getattr(kdf.expanding(1), name) # Frame - with self.assertRaisesRegex(PandasNotImplementedError, - "property.*Expanding.*{}.*is deprecated" - .format(name)): + with self.assertRaisesRegex( + PandasNotImplementedError, "property.*Expanding.*{}.*is deprecated".format(name) + ): getattr(kdf.a.expanding(1), name) # Series # Rolling properties - missing_properties = inspect.getmembers(_MissingPandasLikeRollingGroupby, - lambda o: isinstance(o, property)) - unsupported_properties = [name for (name, type_) in missing_properties - if type_.fget.__name__ == 'unsupported_property'] + missing_properties = inspect.getmembers( + _MissingPandasLikeRollingGroupby, lambda o: isinstance(o, property) + ) + unsupported_properties = [ + name + for (name, type_) in missing_properties + if type_.fget.__name__ == "unsupported_property" + ] for name in unsupported_properties: with self.assertRaisesRegex( - PandasNotImplementedError, - "property.*Rolling.*{}.*not implemented( yet\\.|\\. .+)".format(name)): + PandasNotImplementedError, + "property.*Rolling.*{}.*not implemented( yet\\.|\\. .+)".format(name), + ): getattr(kdf.groupby("a").rolling(1), name)() # Frame with self.assertRaisesRegex( - PandasNotImplementedError, - "property.*Rolling.*{}.*not implemented( yet\\.|\\. .+)".format(name)): + PandasNotImplementedError, + "property.*Rolling.*{}.*not implemented( yet\\.|\\. 
.+)".format(name), + ): getattr(kdf.a.groupby(kdf.a).rolling(1), name)() # Series - deprecated_properties = [name for (name, type_) in missing_properties - if type_.fget.__name__ == 'deprecated_property'] + deprecated_properties = [ + name + for (name, type_) in missing_properties + if type_.fget.__name__ == "deprecated_property" + ] for name in deprecated_properties: - with self.assertRaisesRegex(PandasNotImplementedError, - "property.*Rolling.*{}.*is deprecated" - .format(name)): + with self.assertRaisesRegex( + PandasNotImplementedError, "property.*Rolling.*{}.*is deprecated".format(name) + ): getattr(kdf.rolling(1), name)() # Frame - with self.assertRaisesRegex(PandasNotImplementedError, - "property.*Rolling.*{}.*is deprecated" - .format(name)): + with self.assertRaisesRegex( + PandasNotImplementedError, "property.*Rolling.*{}.*is deprecated".format(name) + ): getattr(kdf.a.rolling(1), name)() # Series diff --git a/databricks/koalas/typedef.py b/databricks/koalas/typedef.py index e77b59e..d417295 100644 --- a/databricks/koalas/typedef.py +++ b/databricks/koalas/typedef.py @@ -29,6 +29,7 @@ from pyspark.sql import Column from pyspark.sql.functions import pandas_udf import pyspark.sql.types as types + try: from pyspark.sql.types import to_arrow_type, from_arrow_type except ImportError: @@ -37,8 +38,7 @@ from databricks import koalas as ks # For running doctests and reference resolution in PyCharm. -__all__ = ['pandas_wraps', 'as_spark_type', - 'as_python_type', 'infer_pd_series_spark_type'] +__all__ = ["pandas_wraps", "as_spark_type", "as_python_type", "infer_pd_series_spark_type"] # A column of data, with the data type. @@ -55,8 +55,8 @@ def __init__(self, tpe): # Seems we cannot specify field names. I currently gave some default names # `c0, c1, ... cn`. 
         self.tpe = types.StructType(
-            [types.StructField("c%s" % i, tpe[i])
-             for i in range(len(tpe))])  # type: types.StructType
+            [types.StructField("c%s" % i, tpe[i]) for i in range(len(tpe))]
+        )  # type: types.StructType
 
     def __repr__(self):
         return "_DataFrameType[{}]".format(self.tpe)
@@ -105,24 +105,26 @@ def _to_stype(tpe) -> X:
 
 # First element of the list is the python base type
 _base = {
-    types.StringType(): [str, 'str', 'string'],
+    types.StringType(): [str, "str", "string"],
     types.BinaryType(): [bytes],
-    types.ByteType(): [np.int8, 'int8', 'byte'],
-    types.ShortType(): [np.int16, 'int16', 'short'],
-    types.IntegerType(): [int, 'int', np.int, np.int32],
-    types.LongType(): [np.int64, 'int64', 'long', 'bigint'],
-    types.FloatType(): [float, 'float', np.float],
-    types.DoubleType(): [np.float64, 'float64', 'double'],
+    types.ByteType(): [np.int8, "int8", "byte"],
+    types.ShortType(): [np.int16, "int16", "short"],
+    types.IntegerType(): [int, "int", np.int, np.int32],
+    types.LongType(): [np.int64, "int64", "long", "bigint"],
+    types.FloatType(): [float, "float", np.float],
+    types.DoubleType(): [np.float64, "float64", "double"],
     types.TimestampType(): [datetime.datetime, np.datetime64],
     types.DateType(): [datetime.date],
-    types.BooleanType(): [bool, 'boolean', 'bool', np.bool],
-    types.ArrayType(types.StringType()): []
+    types.BooleanType(): [bool, "boolean", "bool", np.bool],
+    types.ArrayType(types.StringType()): [],
 }
 
 
 def _build_type_dict():
-    return dict([(other_type, spark_type) for (spark_type, l) in _base.items() for other_type in l]
-                + [(spark_type, spark_type) for (spark_type, _) in _base.items()])
+    return dict(
+        [(other_type, spark_type) for (spark_type, l) in _base.items() for other_type in l]
+        + [(spark_type, spark_type) for (spark_type, _) in _base.items()]
+    )
 
 
 def _build_py_type_dict():
@@ -152,7 +154,7 @@ def as_spark_type(tpe) -> types.DataType:
 def spark_type_to_pandas_dtype(spark_type):
     """ Return the given Spark DataType to pandas dtype. """
     if isinstance(spark_type, types.TimestampType):
-        return np.dtype('datetime64[ns]')
+        return np.dtype("datetime64[ns]")
     else:
         return np.dtype(to_arrow_type(spark_type).to_pandas_dtype())
 
@@ -168,7 +170,7 @@ def infer_pd_series_spark_type(s: pd.Series) -> types.DataType:
     :return: the inferred Spark data type
     """
     dt = s.dtype
-    if dt == np.dtype('object'):
+    if dt == np.dtype("object"):
         if len(s) == 0 or s.isnull().all():
             raise ValueError("can not infer schema from empty or null dataset")
         return from_arrow_type(pa.Array.from_pandas(s).type)
@@ -178,7 +180,7 @@ def infer_pd_series_spark_type(s: pd.Series) -> types.DataType:
         return from_arrow_type(pa.from_numpy_dtype(dt))
 
 
-def _make_fun(f: typing.Callable, return_type: types.DataType, *args, **kwargs) -> 'ks.Series':
+def _make_fun(f: typing.Callable, return_type: types.DataType, *args, **kwargs) -> "ks.Series":
     """ This function calls the function f while taking into account some of the
     limitations of the pandas UDF support:
 
@@ -200,6 +202,7 @@ def _make_fun(f: typing.Callable, return_type: types.DataType, *args, **kwargs)
     The function is expected to have the following arguments:
     """
     from databricks.koalas.series import Series
+
# None for columns or the value for non-columns frozen_args = [] # type: typing.List[typing.Any] @@ -210,8 +213,9 @@ def _make_fun(f: typing.Callable, return_type: types.DataType, *args, **kwargs) frozen_args.append(None) col_args.append(arg) elif isinstance(arg, Column): - raise ValueError('A pyspark column was passed as an argument.' - ' Pass a koalas series instead') + raise ValueError( + "A pyspark column was passed as an argument." " Pass a koalas series instead" + ) else: frozen_args.append(arg) col_args.append(None) @@ -224,13 +228,14 @@ def _make_fun(f: typing.Callable, return_type: types.DataType, *args, **kwargs) if isinstance(arg, Series): col_kwargs.append((key, arg)) elif isinstance(arg, Column): - raise ValueError('A pyspark column was passed as an argument.' - ' Pass a koalas series instead') + raise ValueError( + "A pyspark column was passed as an argument." " Pass a koalas series instead" + ) else: frozen_kwargs.append((key, arg)) col_args_idxs = [idx for (idx, c) in enumerate(col_args) if c is not None] - all_indexes = (col_args_idxs + [key for (key, _) in col_kwargs]) # type: ignore + all_indexes = col_args_idxs + [key for (key, _) in col_kwargs] # type: ignore if not all_indexes: # No argument is related to spark # The function is just called through without other considerations. @@ -240,8 +245,9 @@ def _make_fun(f: typing.Callable, return_type: types.DataType, *args, **kwargs) kser = _get_kser(args, kwargs) def clean_fun(*args2): - assert len(args2) == len(all_indexes), \ - "Missing some inputs:{}!={}".format(all_indexes, [str(c) for c in args2]) + assert len(args2) == len(all_indexes), "Missing some inputs:{}!={}".format( + all_indexes, [str(c) for c in args2] + ) full_args = list(frozen_args) full_kwargs = dict(frozen_kwargs) for (arg, idx) in zip(args2, all_indexes): @@ -273,8 +279,10 @@ def clean_fun(*args2): def _get_kser(args, kwargs): from databricks.koalas.series import Series - all_cols = ([arg for arg in args if isinstance(arg, Series)] - + [arg for arg in kwargs.values() if isinstance(arg, Series)]) + + all_cols = [arg for arg in args if isinstance(arg, Series)] + [ + arg for arg in kwargs.values() if isinstance(arg, Series) + ] assert all_cols # TODO: check all the anchors return all_cols[0] @@ -365,17 +373,22 @@ def pandas_wraps(function=None, return_col=None, return_scalar=None): >>> import sys >>> fun(df.col1, arg1=sys.stdout) # doctest: +SKIP """ + def function_wrapper(f): @wraps(f) def wrapper(*args, **kwargs): # Extract the signature arguments from this function. sig_return = _infer_return_type(f, return_col, return_scalar) if not isinstance(sig_return, _Series): - raise ValueError("Expected the return type of this function to be of type column," - " but found type {}".format(sig_return)) + raise ValueError( + "Expected the return type of this function to be of type column," + " but found type {}".format(sig_return) + ) spark_return_type = sig_return.tpe return _make_fun(f, spark_return_type, *args, **kwargs) + return wrapper + if callable(function): return function_wrapper(function) else: @@ -410,7 +423,8 @@ def _infer_return_type(f, return_col=None, return_scalar=None) -> X: if not (return_col or return_sig or return_scalar): raise ValueError( "Missing type information. 
It should either be provided as an argument to " - "pandas_wraps, or as a python typing hint") + "pandas_wraps, or as a python typing hint" + ) if return_col is not None: if isinstance(return_col, ks.Series): return _to_stype(return_col) @@ -418,8 +432,10 @@ def _infer_return_type(f, return_col=None, return_scalar=None) -> X: return _Series(inner) if return_scalar is not None: if isinstance(return_scalar, ks.Series): - raise ValueError("Column return type {}, you should use 'return_col' to specify" - " it.".format(return_scalar)) + raise ValueError( + "Column return type {}, you should use 'return_col' to specify" + " it.".format(return_scalar) + ) inner = as_spark_type(return_scalar) return _Scalar(inner) if return_sig is not None: diff --git a/databricks/koalas/usage_logging/__init__.py b/databricks/koalas/usage_logging/__init__.py index 6666a92..2707f4a 100644 --- a/databricks/koalas/usage_logging/__init__.py +++ b/databricks/koalas/usage_logging/__init__.py @@ -30,14 +30,18 @@ from databricks.koalas.groupby import DataFrameGroupBy, SeriesGroupBy from databricks.koalas.indexes import Index, MultiIndex from databricks.koalas.missing.frame import _MissingPandasLikeDataFrame -from databricks.koalas.missing.groupby import _MissingPandasLikeDataFrameGroupBy, \ - _MissingPandasLikeSeriesGroupBy -from databricks.koalas.missing.indexes import _MissingPandasLikeIndex, \ - _MissingPandasLikeMultiIndex +from databricks.koalas.missing.groupby import ( + _MissingPandasLikeDataFrameGroupBy, + _MissingPandasLikeSeriesGroupBy, +) +from databricks.koalas.missing.indexes import _MissingPandasLikeIndex, _MissingPandasLikeMultiIndex from databricks.koalas.missing.series import _MissingPandasLikeSeries -from databricks.koalas.missing.window import _MissingPandasLikeExpanding, \ - _MissingPandasLikeRolling, _MissingPandasLikeExpandingGroupby, \ - _MissingPandasLikeRollingGroupby +from databricks.koalas.missing.window import ( + _MissingPandasLikeExpanding, + _MissingPandasLikeRolling, + _MissingPandasLikeExpandingGroupby, + _MissingPandasLikeRollingGroupby, +) from databricks.koalas.series import Series from databricks.koalas.strings import StringMethods from databricks.koalas.window import Expanding, ExpandingGroupby, Rolling, RollingGroupby @@ -60,15 +64,27 @@ def attach(logger_module: Union[str, ModuleType]) -> None: if isinstance(logger_module, str): logger_module = importlib.import_module(logger_module) - logger = getattr(logger_module, 'get_logger')() + logger = getattr(logger_module, "get_logger")() modules = [config, namespace] - classes = [DataFrame, Series, Index, MultiIndex, - DataFrameGroupBy, SeriesGroupBy, DatetimeMethods, StringMethods, - Expanding, ExpandingGroupby, Rolling, RollingGroupby] + classes = [ + DataFrame, + Series, + Index, + MultiIndex, + DataFrameGroupBy, + SeriesGroupBy, + DatetimeMethods, + StringMethods, + Expanding, + ExpandingGroupby, + Rolling, + RollingGroupby, + ] try: from databricks.koalas import mlflow + modules.append(mlflow) classes.append(mlflow.PythonModelWrapper) except ImportError: @@ -79,43 +95,57 @@ def attach(logger_module: Union[str, ModuleType]) -> None: # Modules for target_module in modules: - target_name = target_module.__name__.split('.')[-1] - for name in getattr(target_module, '__all__'): + target_name = target_module.__name__.split(".")[-1] + for name in getattr(target_module, "__all__"): func = getattr(target_module, name) if not inspect.isfunction(func): continue setattr(target_module, name, _wrap_function(target_name, name, func, logger)) - 
special_functions = set(['__init__', '__repr__', '__str__', '_repr_html_', '__len__', - '__getitem__', '__setitem__', '__getattr__']) + special_functions = set( + [ + "__init__", + "__repr__", + "__str__", + "_repr_html_", + "__len__", + "__getitem__", + "__setitem__", + "__getattr__", + ] + ) # Classes for target_class in classes: for name, func in inspect.getmembers(target_class, inspect.isfunction): - if name.startswith('_') and name not in special_functions: + if name.startswith("_") and name not in special_functions: continue setattr(target_class, name, _wrap_function(target_class.__name__, name, func, logger)) for name, prop in inspect.getmembers(target_class, lambda o: isinstance(o, property)): - if name.startswith('_'): + if name.startswith("_"): continue setattr(target_class, name, _wrap_property(target_class.__name__, name, prop, logger)) # Missings - for original, missing in \ - [(pd.DataFrame, _MissingPandasLikeDataFrame), - (pd.Series, _MissingPandasLikeSeries), - (pd.Index, _MissingPandasLikeIndex), - (pd.MultiIndex, _MissingPandasLikeMultiIndex), - (pd.core.groupby.DataFrameGroupBy, _MissingPandasLikeDataFrameGroupBy), - (pd.core.groupby.SeriesGroupBy, _MissingPandasLikeSeriesGroupBy), - (pd.core.window.Expanding, _MissingPandasLikeExpanding), - (pd.core.window.Rolling, _MissingPandasLikeRolling), - (pd.core.window.ExpandingGroupby, _MissingPandasLikeExpandingGroupby), - (pd.core.window.RollingGroupby, _MissingPandasLikeRollingGroupby)]: + for original, missing in [ + (pd.DataFrame, _MissingPandasLikeDataFrame), + (pd.Series, _MissingPandasLikeSeries), + (pd.Index, _MissingPandasLikeIndex), + (pd.MultiIndex, _MissingPandasLikeMultiIndex), + (pd.core.groupby.DataFrameGroupBy, _MissingPandasLikeDataFrameGroupBy), + (pd.core.groupby.SeriesGroupBy, _MissingPandasLikeSeriesGroupBy), + (pd.core.window.Expanding, _MissingPandasLikeExpanding), + (pd.core.window.Rolling, _MissingPandasLikeRolling), + (pd.core.window.ExpandingGroupby, _MissingPandasLikeExpandingGroupby), + (pd.core.window.RollingGroupby, _MissingPandasLikeRollingGroupby), + ]: for name, func in inspect.getmembers(missing, inspect.isfunction): - setattr(missing, name, - _wrap_missing_function(original.__name__, name, func, original, logger)) + setattr( + missing, + name, + _wrap_missing_function(original.__name__, name, func, original, logger), + ) for name, prop in inspect.getmembers(missing, lambda o: isinstance(o, property)): setattr(missing, name, _wrap_missing_property(original.__name__, name, prop, logger)) @@ -130,7 +160,7 @@ def _wrap_function(class_name, function_name, func, logger): @functools.wraps(func) def wrapper(*args, **kwargs): - if hasattr(_local, 'logging') and _local.logging: + if hasattr(_local, "logging") and _local.logging: # no need to log since this should be internal call. 
return func(*args, **kwargs) _local.logging = True @@ -139,11 +169,13 @@ def wrapper(*args, **kwargs): try: res = func(*args, **kwargs) logger.log_success( - class_name, function_name, time.perf_counter() - start, signature) + class_name, function_name, time.perf_counter() - start, signature + ) return res except Exception as ex: logger.log_failure( - class_name, function_name, ex, time.perf_counter() - start, signature) + class_name, function_name, ex, time.perf_counter() - start, signature + ) raise finally: _local.logging = False @@ -152,10 +184,9 @@ def wrapper(*args, **kwargs): def _wrap_property(class_name, property_name, prop, logger): - @property def wrapper(self): - if hasattr(_local, 'logging') and _local.logging: + if hasattr(_local, "logging") and _local.logging: # no need to log since this should be internal call. return prop.fget(self) _local.logging = True @@ -163,12 +194,10 @@ def wrapper(self): start = time.perf_counter() try: res = prop.fget(self) - logger.log_success( - class_name, property_name, time.perf_counter() - start) + logger.log_success(class_name, property_name, time.perf_counter() - start) return res except Exception as ex: - logger.log_failure( - class_name, property_name, ex, time.perf_counter() - start) + logger.log_failure(class_name, property_name, ex, time.perf_counter() - start) raise finally: _local.logging = False @@ -188,7 +217,7 @@ def _wrap_missing_function(class_name, function_name, func, original, logger): signature = inspect.signature(getattr(original, function_name)) - is_deprecated = func.__name__ == 'deprecated_function' + is_deprecated = func.__name__ == "deprecated_function" @functools.wraps(func) def wrapper(*args, **kwargs): @@ -202,7 +231,7 @@ def wrapper(*args, **kwargs): def _wrap_missing_property(class_name, property_name, prop, logger): - is_deprecated = prop.fget.__name__ == 'deprecated_property' + is_deprecated = prop.fget.__name__ == "deprecated_property" @property def wrapper(self): diff --git a/databricks/koalas/usage_logging/usage_logger.py b/databricks/koalas/usage_logging/usage_logger.py index 701fe43..97a64c8 100644 --- a/databricks/koalas/usage_logging/usage_logger.py +++ b/databricks/koalas/usage_logging/usage_logger.py @@ -31,8 +31,11 @@ def get_logger() -> Any: def _format_signature(signature): - return ('({})'.format(', '.join([p.name for p in signature.parameters.values()])) - if signature is not None else "") + return ( + "({})".format(", ".join([p.name for p in signature.parameters.values()])) + if signature is not None + else "" + ) class KoalasUsageLogger(object): @@ -47,10 +50,11 @@ class KoalasUsageLogger(object): """ def __init__(self): - self.logger = logging.getLogger('databricks.koalas.usage_logger') + self.logger = logging.getLogger("databricks.koalas.usage_logger") - def log_success(self, class_name: str, name: str, duration: float, - signature: Optional[Signature] = None) -> None: + def log_success( + self, class_name: str, name: str, duration: float, signature: Optional[Signature] = None + ) -> None: """ Log the function or property call is successfully finished. 
@@ -60,15 +64,26 @@ def log_success(self, class_name: str, name: str, duration: float, :param signature: the signature if the target is a function, else None """ if self.logger.isEnabledFor(logging.INFO): - msg = (('A {function} `{class_name}.{name}{signature}` was successfully finished ' - 'after {duration:.3f} ms.') - .format(class_name=class_name, name=name, signature=_format_signature(signature), - duration=duration * 1000, - function='function' if signature is not None else 'property')) + msg = ( + "A {function} `{class_name}.{name}{signature}` was successfully finished " + "after {duration:.3f} ms." + ).format( + class_name=class_name, + name=name, + signature=_format_signature(signature), + duration=duration * 1000, + function="function" if signature is not None else "property", + ) self.logger.info(msg) - def log_failure(self, class_name: str, name: str, ex: Exception, duration: float, - signature: Optional[Signature] = None) -> None: + def log_failure( + self, + class_name: str, + name: str, + ex: Exception, + duration: float, + signature: Optional[Signature] = None, + ) -> None: """ Log the function or property call failed. @@ -79,16 +94,26 @@ def log_failure(self, class_name: str, name: str, ex: Exception, duration: float :param signature: the signature if the target is a function, else None """ if self.logger.isEnabledFor(logging.WARNING): - msg = (('A {function} `{class_name}.{name}{signature}` was failed ' - 'after {duration:.3f} ms: {msg}') - .format(class_name=class_name, name=name, signature=_format_signature(signature), - msg=_exception_message(ex), - duration=duration * 1000, - function='function' if signature is not None else 'property')) + msg = ( + "A {function} `{class_name}.{name}{signature}` was failed " + "after {duration:.3f} ms: {msg}" + ).format( + class_name=class_name, + name=name, + signature=_format_signature(signature), + msg=_exception_message(ex), + duration=duration * 1000, + function="function" if signature is not None else "property", + ) self.logger.warning(msg) - def log_missing(self, class_name: str, name: str, is_deprecated: bool = False, - signature: Optional[Signature] = None) -> None: + def log_missing( + self, + class_name: str, + name: str, + is_deprecated: bool = False, + signature: Optional[Signature] = None, + ) -> None: """ Log the missing or deprecated function or property is called. @@ -98,8 +123,11 @@ def log_missing(self, class_name: str, name: str, is_deprecated: bool = False, :param signature: the original function signature if the target is a function, else None """ if self.logger.isEnabledFor(logging.INFO): - msg = ('A {deprecated} {function} `{class_name}.{name}{signature}` was called.' 
- .format(class_name=class_name, name=name, signature=_format_signature(signature), - function='function' if signature is not None else 'property', - deprecated='deprecated' if is_deprecated else 'missing')) + msg = "A {deprecated} {function} `{class_name}.{name}{signature}` was called.".format( + class_name=class_name, + name=name, + signature=_format_signature(signature), + function="function" if signature is not None else "property", + deprecated="deprecated" if is_deprecated else "missing", + ) self.logger.info(msg) diff --git a/databricks/koalas/utils.py b/databricks/koalas/utils.py index 15e660e..76c59f0 100644 --- a/databricks/koalas/utils.py +++ b/databricks/koalas/utils.py @@ -50,8 +50,9 @@ def combine_frames(this, *args, how="full"): from databricks.koalas.config import get_option if all(isinstance(arg, Series) for arg in args): - assert all(arg._kdf is args[0]._kdf for arg in args), \ - "Currently only one different DataFrame (from given Series) is supported" + assert all( + arg._kdf is args[0]._kdf for arg in args + ), "Currently only one different DataFrame (from given Series) is supported" if this is args[0]._kdf: return # We don't need to combine. All series is in this. that = args[0]._kdf[list(args)] @@ -61,8 +62,7 @@ def combine_frames(this, *args, how="full"): return # We don't need to combine. `this` and `that` are same. that = args[0] else: - raise AssertionError("args should be single DataFrame or " - "single/multiple Series") + raise AssertionError("args should be single DataFrame or " "single/multiple Series") if get_option("compute.ops_on_diff_frames"): this_index_map = this._internal.index_map @@ -86,43 +86,58 @@ def combine_frames(this, *args, how="full"): join_scol = this_scol == that_scol join_scols.append(join_scol) merged_index_scols.append( - F.when( - this_scol.isNotNull(), this_scol - ).otherwise(that_scol).alias(this_column)) + F.when(this_scol.isNotNull(), this_scol).otherwise(that_scol).alias(this_column) + ) else: raise ValueError("Index names must be exactly matched currently.") assert len(join_scols) > 0, "cannot join with no overlapping index names" - joined_df = this._sdf.alias("this").join( - that._sdf.alias("that"), on=join_scols, how=how) + joined_df = this._sdf.alias("this").join(that._sdf.alias("that"), on=join_scols, how=how) joined_df = joined_df.select( - merged_index_scols + - [this[label]._scol.alias("__this_%s" % this._internal.column_name_for(label)) - for label in this._internal.column_labels] + - [that[label]._scol.alias("__that_%s" % that._internal.column_name_for(label)) - for label in that._internal.column_labels]) + merged_index_scols + + [ + this[label]._scol.alias("__this_%s" % this._internal.column_name_for(label)) + for label in this._internal.column_labels + ] + + [ + that[label]._scol.alias("__that_%s" % that._internal.column_name_for(label)) + for label in that._internal.column_labels + ] + ) index_columns = set(this._internal.index_columns) new_data_columns = [c for c in joined_df.columns if c not in index_columns] level = max(this._internal.column_labels_level, that._internal.column_labels_level) - column_labels = ([tuple(['this'] + ([''] * (level - len(label))) + list(label)) - for label in this._internal.column_labels] - + [tuple(['that'] + ([''] * (level - len(label))) + list(label)) - for label in that._internal.column_labels]) - column_label_names = ((([None] * (1 + level - len(this._internal.column_labels_level))) - + this._internal.column_label_names) - if this._internal.column_label_names is not None else None) + 
column_labels = [ + tuple(["this"] + ([""] * (level - len(label))) + list(label)) + for label in this._internal.column_labels + ] + [ + tuple(["that"] + ([""] * (level - len(label))) + list(label)) + for label in that._internal.column_labels + ] + column_label_names = ( + ( + ([None] * (1 + level - len(this._internal.column_labels_level))) + + this._internal.column_label_names + ) + if this._internal.column_label_names is not None + else None + ) return DataFrame( - this._internal.copy(sdf=joined_df, - column_labels=column_labels, - column_scols=[scol_for(joined_df, col) for col in new_data_columns], - column_label_names=column_label_names)) + this._internal.copy( + sdf=joined_df, + column_labels=column_labels, + column_scols=[scol_for(joined_df, col) for col in new_data_columns], + column_label_names=column_label_names, + ) + ) else: raise ValueError( "Cannot combine the series or dataframe because it comes from a different dataframe. " - "In order to allow this operation, enable 'compute.ops_on_diff_frames' option.") + "In order to allow this operation, enable 'compute.ops_on_diff_frames' option." + ) def align_diff_frames(resolve_func, this, that, fillna=True, how="full"): @@ -199,15 +214,16 @@ def align_diff_frames(resolve_func, this, that, fillna=True, how="full"): for combined_label in combined_column_labels: for common_label in common_column_labels: - if combined_label == tuple(['this', *common_label]): + if combined_label == tuple(["this", *common_label]): this_columns_to_apply.append(combined_label) break - elif combined_label == tuple(['that', *common_label]): + elif combined_label == tuple(["that", *common_label]): that_columns_to_apply.append(combined_label) break else: - if how == "left" and \ - combined_label in [tuple(['that', *label]) for label in that_column_labels]: + if how == "left" and combined_label in [ + tuple(["that", *label]) for label in that_column_labels + ]: # In this case, we will drop `that_columns` in `columns_to_keep` but passes # it later to `func`. `func` should resolve it. # Note that adding this into a separate list (`additional_that_columns`) @@ -225,8 +241,9 @@ def align_diff_frames(resolve_func, this, that, fillna=True, how="full"): # Should extract columns to apply and do it in a batch in case # it adds new columns for example. 
if len(this_columns_to_apply) > 0 or len(that_columns_to_apply) > 0: - kser_set, column_labels_applied = \ - zip(*resolve_func(combined, this_columns_to_apply, that_columns_to_apply)) + kser_set, column_labels_applied = zip( + *resolve_func(combined, this_columns_to_apply, that_columns_to_apply) + ) columns_applied = [c._scol for c in kser_set] column_labels_applied = list(column_labels_applied) else: @@ -262,22 +279,27 @@ def align_diff_series(func, this_series, *args, how="full"): cols = [arg for arg in args if isinstance(arg, IndexOpsMixin)] combined = combine_frames(this_series.to_frame(), *cols, how=how) - that_columns = [combined['that'][arg._internal.column_labels[0]]._scol - if isinstance(arg, IndexOpsMixin) else arg for arg in args] + that_columns = [ + combined["that"][arg._internal.column_labels[0]]._scol + if isinstance(arg, IndexOpsMixin) + else arg + for arg in args + ] - scol = func(combined['this'][this_series._internal.column_labels[0]]._scol, - *that_columns) + scol = func(combined["this"][this_series._internal.column_labels[0]]._scol, *that_columns) - return Series(combined._internal.copy(scol=scol, - column_labels=this_series._internal.column_labels), - anchor=combined) + return Series( + combined._internal.copy(scol=scol, column_labels=this_series._internal.column_labels), + anchor=combined, + ) def default_session(conf=None): if conf is None: conf = dict() - if LooseVersion(pyarrow.__version__) >= LooseVersion("0.15") and \ - LooseVersion(pyspark.__version__) < LooseVersion("3.0"): + if LooseVersion(pyarrow.__version__) >= LooseVersion("0.15") and LooseVersion( + pyspark.__version__ + ) < LooseVersion("3.0"): conf["spark.executorEnv.ARROW_PRE_0_15_IPC_FORMAT"] = "1" conf["spark.yarn.appMasterEnv.ARROW_PRE_0_15_IPC_FORMAT"] = "1" conf["spark.mesos.driverEnv.ARROW_PRE_0_15_IPC_FORMAT"] = "1" @@ -291,9 +313,12 @@ def default_session(conf=None): return builder.getOrCreate() -def validate_arguments_and_invoke_function(pobj: Union[pd.DataFrame, pd.Series], - koalas_func: Callable, pandas_func: Callable, - input_args: Dict): +def validate_arguments_and_invoke_function( + pobj: Union[pd.DataFrame, pd.Series], + koalas_func: Callable, + pandas_func: Callable, + input_args: Dict, +): """ Invokes a pandas function. @@ -321,12 +346,12 @@ def validate_arguments_and_invoke_function(pobj: Union[pd.DataFrame, pd.Series], # Makes a copy since whatever passed in is likely created by locals(), and we can't delete # 'self' key from that. args = input_args.copy() - del args['self'] + del args["self"] - if 'kwargs' in args: + if "kwargs" in args: # explode kwargs - kwargs = args['kwargs'] - del args['kwargs'] + kwargs = args["kwargs"] + del args["kwargs"] args = {**args, **kwargs} koalas_params = inspect.signature(koalas_func).parameters @@ -338,10 +363,14 @@ def validate_arguments_and_invoke_function(pobj: Union[pd.DataFrame, pd.Series], del args[param.name] else: raise TypeError( - ("The pandas version [%s] available does not support parameter '%s' " + - "for function '%s'.") % (pd.__version__, param.name, pandas_func.__name__)) - - args['self'] = pobj + ( + "The pandas version [%s] available does not support parameter '%s' " + + "for function '%s'." 
+ ) + % (pd.__version__, param.name, pandas_func.__name__) + ) + + args["self"] = pobj return pandas_func(**args) @@ -351,7 +380,7 @@ def lazy_property(fn): Copied from https://stevenloria.com/lazy-properties/ """ - attr_name = '_lazy_' + fn.__name__ + attr_name = "_lazy_" + fn.__name__ @property @functools.wraps(fn) @@ -365,7 +394,7 @@ def _lazy_property(self): def scol_for(sdf: spark.DataFrame, column_name: str) -> spark.Column: """ Return Spark Column for the given column name. """ - return sdf['`{}`'.format(column_name)] + return sdf["`{}`".format(column_name)] def column_labels_level(column_labels: List[Tuple[str, ...]]) -> int: @@ -400,15 +429,15 @@ def name_like_string(name: Union[str, Tuple]) -> str: name = tuple([str(n) for n in name]) else: name = (str(name),) - return ('(%s)' % ', '.join(name)) if len(name) > 1 else name[0] + return ("(%s)" % ", ".join(name)) if len(name) > 1 else name[0] def validate_axis(axis=0, none_axis=0): """ Check the given axis is valid. """ - if axis not in (0, 1, 'index', 'columns', None): - raise ValueError('No axis named {0}'.format(axis)) + if axis not in (0, 1, "index", "columns", None): + raise ValueError("No axis named {0}".format(axis)) # convert to numeric axis - return {None: none_axis, 'index': 0, 'columns': 1}.get(axis, axis) + return {None: none_axis, "index": 0, "columns": 1}.get(axis, axis) def validate_bool_kwarg(value, arg_name): @@ -422,13 +451,15 @@ def validate_bool_kwarg(value, arg_name): def compare_null_first(left, right, comp): - return ((left.isNotNull() & right.isNotNull() & comp(left, right)) - | (left.isNull() & right.isNotNull())) + return (left.isNotNull() & right.isNotNull() & comp(left, right)) | ( + left.isNull() & right.isNotNull() + ) def compare_null_last(left, right, comp): - return ((left.isNotNull() & right.isNotNull() & comp(left, right)) - | (left.isNotNull() & right.isNull())) + return (left.isNotNull() & right.isNotNull() & comp(left, right)) | ( + left.isNotNull() & right.isNull() + ) def compare_disallow_null(left, right, comp): diff --git a/databricks/koalas/version.py b/databricks/koalas/version.py index e9c50f7..59ea2c0 100644 --- a/databricks/koalas/version.py +++ b/databricks/koalas/version.py @@ -14,4 +14,4 @@ # limitations under the License. # -__version__ = '0.27.0' +__version__ = "0.27.0" diff --git a/databricks/koalas/window.py b/databricks/koalas/window.py index 926a532..e54d32b 100644 --- a/databricks/koalas/window.py +++ b/databricks/koalas/window.py @@ -23,9 +23,12 @@ from databricks.koalas.utils import name_like_string from pyspark.sql import Window from pyspark.sql import functions as F -from databricks.koalas.missing.window import _MissingPandasLikeRolling, \ - _MissingPandasLikeRollingGroupby, _MissingPandasLikeExpanding, \ - _MissingPandasLikeExpandingGroupby +from databricks.koalas.missing.window import ( + _MissingPandasLikeRolling, + _MissingPandasLikeRollingGroupby, + _MissingPandasLikeExpanding, + _MissingPandasLikeExpandingGroupby, +) from databricks import koalas as ks # For running doctests and reference resolution in PyCharm. from databricks.koalas.internal import NATURAL_ORDER_COLUMN_NAME @@ -33,12 +36,12 @@ class _RollingAndExpanding(object): - def __init__(self, window, min_periods): self._window = window # This unbounded Window is later used to handle 'min_periods' for now. 
self._unbounded_window = Window.orderBy(NATURAL_ORDER_COLUMN_NAME).rowsBetween( - Window.unboundedPreceding, Window.currentRow) + Window.unboundedPreceding, Window.currentRow + ) self._min_periods = min_periods def _apply_as_series_or_frame(self, func): @@ -49,28 +52,31 @@ def _apply_as_series_or_frame(self, func): """ raise NotImplementedError( "A class that inherits this class should implement this method " - "to handle the index and columns of output.") + "to handle the index and columns of output." + ) def count(self): def count(scol): return F.count(scol).over(self._window) - if LooseVersion(pd.__version__) >= LooseVersion('1.0.0'): + if LooseVersion(pd.__version__) >= LooseVersion("1.0.0"): if isinstance(self, (Expanding, ExpandingGroupby)): + def count_expanding(scol): return F.when( F.row_number().over(self._unbounded_window) >= self._min_periods, - F.count(scol).over(self._window) + F.count(scol).over(self._window), ).otherwise(F.lit(None)) - return self._apply_as_series_or_frame(count_expanding).astype('float64') - return self._apply_as_series_or_frame(count).astype('float64') + return self._apply_as_series_or_frame(count_expanding).astype("float64") + + return self._apply_as_series_or_frame(count).astype("float64") def sum(self): def sum(scol): return F.when( F.row_number().over(self._unbounded_window) >= self._min_periods, - F.sum(scol).over(self._window) + F.sum(scol).over(self._window), ).otherwise(F.lit(None)) return self._apply_as_series_or_frame(sum) @@ -79,7 +85,7 @@ def min(self): def min(scol): return F.when( F.row_number().over(self._unbounded_window) >= self._min_periods, - F.min(scol).over(self._window) + F.min(scol).over(self._window), ).otherwise(F.lit(None)) return self._apply_as_series_or_frame(min) @@ -88,7 +94,7 @@ def max(self): def max(scol): return F.when( F.row_number().over(self._unbounded_window) >= self._min_periods, - F.max(scol).over(self._window) + F.max(scol).over(self._window), ).otherwise(F.lit(None)) return self._apply_as_series_or_frame(max) @@ -97,7 +103,7 @@ def mean(self): def mean(scol): return F.when( F.row_number().over(self._unbounded_window) >= self._min_periods, - F.mean(scol).over(self._window) + F.mean(scol).over(self._window), ).otherwise(F.lit(None)) return self._apply_as_series_or_frame(mean) @@ -106,7 +112,7 @@ def std(self): def std(scol): return F.when( F.row_number().over(self._unbounded_window) >= self._min_periods, - F.stddev(scol).over(self._window) + F.stddev(scol).over(self._window), ).otherwise(F.lit(None)) return self._apply_as_series_or_frame(std) @@ -115,14 +121,13 @@ def var(self): def var(scol): return F.when( F.row_number().over(self._unbounded_window) >= self._min_periods, - F.variance(scol).over(self._window) + F.variance(scol).over(self._window), ).otherwise(F.lit(None)) return self._apply_as_series_or_frame(var) class Rolling(_RollingAndExpanding): - def __init__(self, kdf_or_kser, window, min_periods=None): from databricks.koalas import DataFrame, Series @@ -141,9 +146,11 @@ def __init__(self, kdf_or_kser, window, min_periods=None): self.kdf_or_kser = kdf_or_kser if not isinstance(kdf_or_kser, (DataFrame, Series)): raise TypeError( - "kdf_or_kser must be a series or dataframe; however, got: %s" % type(kdf_or_kser)) + "kdf_or_kser must be a series or dataframe; however, got: %s" % type(kdf_or_kser) + ) window = Window.orderBy(NATURAL_ORDER_COLUMN_NAME).rowsBetween( - Window.currentRow - (self._window_val - 1), Window.currentRow) + Window.currentRow - (self._window_val - 1), Window.currentRow + ) super(Rolling, 
self).__init__(window, min_periods) @@ -158,7 +165,8 @@ def __getattr__(self, item: str) -> Any: def _apply_as_series_or_frame(self, func): return self.kdf_or_kser._apply_series_op( - lambda kser: kser._with_new_scol(func(kser._scol)).rename(kser.name)) + lambda kser: kser._with_new_scol(func(kser._scol)).rename(kser.name) + ) def count(self): """ @@ -622,7 +630,6 @@ def var(self): class RollingGroupby(Rolling): - def __init__(self, groupby, groupkeys, window, min_periods=None): from databricks.koalas.groupby import SeriesGroupBy from databricks.koalas.groupby import DataFrameGroupBy @@ -634,7 +641,8 @@ def __init__(self, groupby, groupkeys, window, min_periods=None): else: raise TypeError( "groupby must be a SeriesGroupBy or DataFrameGroupBy; " - "however, got: %s" % type(groupby)) + "however, got: %s" % type(groupby) + ) super(RollingGroupby, self).__init__(kdf, window, min_periods) self._groupby = groupby @@ -643,9 +651,11 @@ def __init__(self, groupby, groupkeys, window, min_periods=None): # DataFrame. So, if the given `groupkeys` is a series, they end up with # being a different series. self._window = self._window.partitionBy( - *[F.col(name_like_string(ser.name)) for ser in groupkeys]) + *[F.col(name_like_string(ser.name)) for ser in groupkeys] + ) self._unbounded_window = self._unbounded_window.partitionBy( - *[F.col(name_like_string(ser.name)) for ser in groupkeys]) + *[F.col(name_like_string(ser.name)) for ser in groupkeys] + ) self._groupkeys = groupkeys # Current implementation reuses DataFrameGroupBy implementations for Series as well. self.kdf = self.kdf_or_kser @@ -682,26 +692,25 @@ def _apply_as_series_or_frame(self, func): # given series. This is because, in case of series, we convert it into # DataFrame. So, if the given `groupkeys` is a series, they end up with # being a different series. - F.col( - name_like_string(groupkey.name) - ).alias( + F.col(name_like_string(groupkey.name)).alias( SPARK_INDEX_NAME_FORMAT(len(new_index_scols)) - )) + ) + ) new_index_map.append( - (SPARK_INDEX_NAME_FORMAT(len(new_index_map)), - groupkey._internal.column_labels[0])) + (SPARK_INDEX_NAME_FORMAT(len(new_index_map)), groupkey._internal.column_labels[0]) + ) for new_index_scol, index_name in zip(kdf._internal.index_scols, kdf._internal.index_names): new_index_scols.append( - new_index_scol.alias(SPARK_INDEX_NAME_FORMAT(len(new_index_scols)))) + new_index_scol.alias(SPARK_INDEX_NAME_FORMAT(len(new_index_scols))) + ) new_index_map.append((SPARK_INDEX_NAME_FORMAT(len(new_index_map)), index_name)) applied = [] for column in kdf.columns: applied.append( - kdf[column]._with_new_scol( - func(kdf[column]._scol) - ).rename(kdf[column].name)) + kdf[column]._with_new_scol(func(kdf[column]._scol)).rename(kdf[column].name) + ) # Seems like pandas filters out when grouped key is NA. 
cond = self._groupkeys[0]._scol.isNotNull() @@ -713,7 +722,8 @@ def _apply_as_series_or_frame(self, func): sdf=sdf, index_map=new_index_map, column_labels=[c._internal.column_labels[0] for c in applied], - column_scols=[scol_for(sdf, c._internal.data_columns[0]) for c in applied]) + column_scols=[scol_for(sdf, c._internal.data_columns[0]) for c in applied], + ) ret = DataFrame(internal) if isinstance(self._groupby, SeriesGroupBy): @@ -1036,7 +1046,6 @@ def var(self): class Expanding(_RollingAndExpanding): - def __init__(self, kdf_or_kser, min_periods=1): from databricks.koalas import DataFrame, Series @@ -1045,9 +1054,11 @@ def __init__(self, kdf_or_kser, min_periods=1): self.kdf_or_kser = kdf_or_kser if not isinstance(kdf_or_kser, (DataFrame, Series)): raise TypeError( - "kdf_or_kser must be a series or dataframe; however, got: %s" % type(kdf_or_kser)) + "kdf_or_kser must be a series or dataframe; however, got: %s" % type(kdf_or_kser) + ) window = Window.orderBy(NATURAL_ORDER_COLUMN_NAME).rowsBetween( - Window.unboundedPreceding, Window.currentRow) + Window.unboundedPreceding, Window.currentRow + ) super(Expanding, self).__init__(window, min_periods) def __getattr__(self, item: str) -> Any: @@ -1374,7 +1385,6 @@ def var(self): class ExpandingGroupby(Expanding): - def __init__(self, groupby, groupkeys, min_periods=1): from databricks.koalas.groupby import SeriesGroupBy from databricks.koalas.groupby import DataFrameGroupBy @@ -1386,7 +1396,8 @@ def __init__(self, groupby, groupkeys, min_periods=1): else: raise TypeError( "groupby must be a SeriesGroupBy or DataFrameGroupBy; " - "however, got: %s" % type(groupby)) + "however, got: %s" % type(groupby) + ) super(ExpandingGroupby, self).__init__(kdf, min_periods) self._groupby = groupby @@ -1395,9 +1406,11 @@ def __init__(self, groupby, groupkeys, min_periods=1): # DataFrame. So, if the given `groupkeys` is a series, they end up with # being a different series. self._window = self._window.partitionBy( - *[F.col(name_like_string(ser.name)) for ser in groupkeys]) + *[F.col(name_like_string(ser.name)) for ser in groupkeys] + ) self._unbounded_window = self._window.partitionBy( - *[F.col(name_like_string(ser.name)) for ser in groupkeys]) + *[F.col(name_like_string(ser.name)) for ser in groupkeys] + ) self._groupkeys = groupkeys # Current implementation reuses DataFrameGroupBy implementations for Series as well. self.kdf = self.kdf_or_kser diff --git a/dev/lint-python b/dev/lint-python index ddf343e..5ab35c2 100755 --- a/dev/lint-python +++ b/dev/lint-python @@ -29,6 +29,8 @@ SPHINX_BUILD="sphinx-build" MYPY_BUILD="mypy" +BLACK_BUILD="python -m black" + # Sphinx will import Koalas out of the box. Importing Koalas requires to have # PySpark currently. DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" @@ -216,7 +218,7 @@ function mypy_test { local MYPY_REPORT= local MYPY_STATUS= - # Check that the documentation builds acceptably, skip check if sphinx is not installed. + # Skip check if mypy is not installed. if ! hash "$MYPY_BUILD" 2> /dev/null; then echo "The $MYPY_BUILD command was not found. Skipping mypy checks for now." echo @@ -273,6 +275,34 @@ function sphinx_test { popd &> /dev/null } +function black_test { + local BLACK_REPORT= + local BLACK_STATUS= + + # Skip check if black is not installed. + $BLACK_BUILD 2> /dev/null + if [ $? -ne 0 ]; then + echo "The $BLACK_BUILD command was not found. Skipping black checks for now." + echo + return + fi + + echo "starting black test..." 
+ BLACK_REPORT=$( ($BLACK_BUILD databricks --line-length 100 --check ) 2>&1) + BLACK_STATUS=$? + + if [ "$BLACK_STATUS" -ne 0 ]; then + echo "black checks failed:" + echo "$BLACK_REPORT" + echo "Please run 'dev/reformat' script." + echo "$BLACK_STATUS" + exit "$BLACK_STATUS" + else + echo "black checks passed." + echo + fi +} + SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" SPARK_ROOT_DIR="$(dirname "${SCRIPT_DIR}")" @@ -281,6 +311,7 @@ pushd "$SPARK_ROOT_DIR" &> /dev/null PYTHON_SOURCE="$(find . -name "*.py")" compile_python_test "$PYTHON_SOURCE" +black_test pycodestyle_test "$PYTHON_SOURCE" flake8_test pydocstyle_test diff --git a/dev/reformat b/dev/reformat new file mode 100755 index 0000000..4bb568a --- /dev/null +++ b/dev/reformat @@ -0,0 +1,25 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +BLACK_BUILD="python -m black" +BLACK_VERSION="19.10b0" +$BLACK_BUILD 2> /dev/null +if [ $? -ne 0 ]; then + echo "The '$BLACK_BUILD' command was not found. Please install Black, for example, via 'pip install black==$BLACK_VERSION'." + exit 1 +fi + +$BLACK_BUILD databricks --line-length 100 diff --git a/dev/tox.ini b/dev/tox.ini index 38512ec..feb768b 100644 --- a/dev/tox.ini +++ b/dev/tox.ini @@ -14,7 +14,7 @@ # limitations under the License. [pycodestyle] -ignore=E226,E241,E305,E402,E722,E731,E741,W503,W504 +ignore=E203,E226,E231,E241,E305,E402,E722,E731,E741,W503,W504 max-line-length=100 [pydocstyle] ignore=D100,D101,D102,D103,D104,D105,D106,D107,D200,D201,D202,D203,D204,D205,D206,D207,D208,D209,D210,D211,D212,D213,D214,D215,D300,D301,D302,D400,D401,D402,D403,D404,D405,D406,D407,D408,D409,D410,D411,D412,D413,D414 diff --git a/docs/source/development/contributing.rst b/docs/source/development/contributing.rst index d245083..8f92f12 100644 --- a/docs/source/development/contributing.rst +++ b/docs/source/development/contributing.rst @@ -33,7 +33,7 @@ Step-by-step Guide For Code Contributions 4. Implement the functionality, with test cases providing close to 100% statement coverage. Document the functionality. -5. Run existing and new test cases to make sure they still pass. Also run the linter `dev/lint-python`. +5. Run existing and new test cases to make sure they still pass. Also run `dev/reformat` script to reformat Python files by using `Black `_, and run the linter `dev/lint-python`. 6. Build the docs (`make html` in `docs` directory) and verify the docs related to your change look OK. diff --git a/requirements-dev.txt b/requirements-dev.txt index fd246b1..f973287 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -21,6 +21,9 @@ ipython mypy flake8 +# Code formatter +black==19.10b0 + # Test pytest pytest-cov
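
For illustration only (this note and the snippet below are not part of the patch): the E203 entry added to the pycodestyle ignore list in dev/tox.ini above exists because Black and pycodestyle disagree about slice spacing. A minimal Python sketch, with made-up variable names, of the kind of line involved:

    # Black treats the slice colon as a binary operator and keeps a space
    # before it when an operand is a complex expression. pycodestyle reports
    # that spacing as E203 ("whitespace before ':'"), so E203 has to be
    # ignored once Black owns the formatting.
    values = list(range(10))
    offset = 2
    tail = values[offset + 1 :]  # Black-formatted slice
    print(tail)  # [3, 4, 5, 6, 7, 8, 9]

In the workflow this patch sets up, `dev/reformat` rewrites the sources with `python -m black databricks --line-length 100`, and `dev/lint-python` then runs `black --check` (the new black_test step) before the pycodestyle and flake8 passes.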