Standardize code style using Black #1301

Closed
wants to merge 1 commit into from
4 changes: 4 additions & 0 deletions .github/workflows/master.yml
@@ -52,6 +52,10 @@ jobs:
./dev/download_travis_dependencies.sh
sudo apt-get install xclip
pip install setuptools
# Black only works with Python 3.6+, so it is removed from requirements-dev.txt
# before installing with pip on Python 3.5. This is hacky, but we will drop
# Python 3.5 soon, so it's fine.
sed -i '/black/d' requirements-dev.txt
pip install -r requirements-dev.txt
pip install pandas==$PANDAS_VERSION pyarrow==$PYARROW_VERSION
pip list
4 changes: 4 additions & 0 deletions .travis.yml
@@ -67,8 +67,12 @@ install:

# Test PyPI installation at Python 3.5. This is also because
# one of the dependencies requires Python 3.6 Conda specifically.
# Black only works with Python 3.6+, so it is removed from requirements-dev.txt
# before installing with pip on Python 3.5. This is hacky, but we will drop
# Python 3.5 soon, so it's fine.
- |
if [[ $TRAVIS_PYTHON_VERSION == "3.5" ]]; then
sed -i '/black/d' requirements-dev.txt && \
pip install -r requirements-dev.txt && \
pip install pandas==$PANDAS_VERSION pyarrow==$PYARROW_VERSION && \
pip list;
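
Both CI configs strip Black from requirements-dev.txt with a sed one-liner because Black needs Python 3.6+. A PEP 508 environment marker in requirements-dev.txt (for example `black ; python_version >= "3.6"`) could let pip skip it automatically, though that is outside this change. For reference, a minimal Python sketch of the same filtering step (hypothetical, not part of this PR; handy where `sed -i` is not portable, e.g. BSD sed on macOS):

import sys

# Mirror the `sed -i '/black/d' requirements-dev.txt` step above: drop any
# line mentioning black when running under Python < 3.6.
if sys.version_info < (3, 6):
    with open("requirements-dev.txt") as f:
        kept = [line for line in f if "black" not in line.lower()]
    with open("requirements-dev.txt", "w") as f:
        f.writelines(kept)
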
2 changes: 1 addition & 1 deletion databricks/__init__.py
@@ -15,4 +15,4 @@
#

# https://packaging.python.org/guides/packaging-namespace-packages/#pkgutil-style-namespace-packages
__path__ = __import__('pkgutil').extend_path(__path__, __name__) # type: ignore
__path__ = __import__("pkgutil").extend_path(__path__, __name__) # type: ignore
20 changes: 9 additions & 11 deletions databricks/conftest.py
@@ -32,9 +32,7 @@
from databricks.koalas import utils


shared_conf = {
"spark.sql.shuffle.partitions": "4"
}
shared_conf = {"spark.sql.shuffle.partitions": "4"}
# Initialize Spark session that should be used in doctests or unittests.
# Delta requires Spark 2.4.2+. See
# https://github.com/delta-io/delta#compatibility-with-apache-spark-versions.
@@ -48,7 +46,7 @@
session = utils.default_session(shared_conf)


@pytest.fixture(scope='session', autouse=True)
@pytest.fixture(scope="session", autouse=True)
def session_termination():
yield
# Share one session across all the tests. Repeating starting and stopping sessions and contexts
@@ -58,46 +56,46 @@ def session_termination():

@pytest.fixture(autouse=True)
def add_ks(doctest_namespace):
doctest_namespace['ks'] = koalas
doctest_namespace["ks"] = koalas


@pytest.fixture(autouse=True)
def add_pd(doctest_namespace):
if os.getenv("PANDAS_VERSION", None) is not None:
assert pd.__version__ == os.getenv("PANDAS_VERSION")
doctest_namespace['pd'] = pd
doctest_namespace["pd"] = pd


@pytest.fixture(autouse=True)
def add_pa(doctest_namespace):
if os.getenv("PYARROW_VERSION", None) is not None:
assert pa.__version__ == os.getenv("PYARROW_VERSION")
doctest_namespace['pa'] = pa
doctest_namespace["pa"] = pa


@pytest.fixture(autouse=True)
def add_np(doctest_namespace):
doctest_namespace['np'] = numpy
doctest_namespace["np"] = numpy


@pytest.fixture(autouse=True)
def add_path(doctest_namespace):
path = tempfile.mkdtemp()
atexit.register(lambda: shutil.rmtree(path, ignore_errors=True))
doctest_namespace['path'] = path
doctest_namespace["path"] = path


@pytest.fixture(autouse=True)
def add_db(doctest_namespace):
db_name = "db%s" % str(uuid.uuid4()).replace("-", "")
session.sql("CREATE DATABASE %s" % db_name)
atexit.register(lambda: session.sql("DROP DATABASE IF EXISTS %s CASCADE" % db_name))
doctest_namespace['db'] = db_name
doctest_namespace["db"] = db_name


@pytest.fixture(autouse=os.getenv("KOALAS_USAGE_LOGGER", None) is not None)
def add_caplog(caplog):
with caplog.at_level(logging.INFO, logger='databricks.koalas.usage_logger'):
with caplog.at_level(logging.INFO, logger="databricks.koalas.usage_logger"):
yield


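The fixtures above populate pytest's doctest_namespace, so doctests anywhere in the package can use `ks`, `pd`, `pa`, `np`, `path`, and `db` without importing or defining them. A hypothetical function (a sketch only, not part of this diff) showing how a doctest leans on those injected names:

def double(col):
    """Double every value in a koalas column.

    The `ks` name below is injected by the add_ks fixture in conftest.py.

    >>> kdf = ks.DataFrame({"a": [1, 2, 3]})
    >>> double(kdf["a"]).to_pandas().tolist()
    [2, 4, 6]
    """
    return col * 2
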
71 changes: 53 additions & 18 deletions databricks/koalas/__init__.py
@@ -21,27 +21,33 @@

def assert_pyspark_version():
import logging

pyspark_ver = None
try:
import pyspark
except ImportError:
raise ImportError('Unable to import pyspark - consider doing a pip install with [spark] '
'extra to install pyspark with pip')
raise ImportError(
"Unable to import pyspark - consider doing a pip install with [spark] "
"extra to install pyspark with pip"
)
else:
pyspark_ver = getattr(pyspark, '__version__')
if pyspark_ver is None or pyspark_ver < '2.4':
pyspark_ver = getattr(pyspark, "__version__")
if pyspark_ver is None or pyspark_ver < "2.4":
logging.warning(
'Found pyspark version "{}" installed. pyspark>=2.4.0 is recommended.'
.format(pyspark_ver if pyspark_ver is not None else '<unknown version>'))
'Found pyspark version "{}" installed. pyspark>=2.4.0 is recommended.'.format(
pyspark_ver if pyspark_ver is not None else "<unknown version>"
)
)


assert_pyspark_version()

import pyspark
import pyarrow

if LooseVersion(pyarrow.__version__) >= LooseVersion("0.15") and \
LooseVersion(pyspark.__version__) < LooseVersion("3.0"):
if LooseVersion(pyarrow.__version__) >= LooseVersion("0.15") and LooseVersion(
pyspark.__version__
) < LooseVersion("3.0"):
# This is required to support PyArrow 0.15 in PySpark versions lower than 3.0.
# See SPARK-29367.
os.environ["ARROW_PRE_0_15_IPC_FORMAT"] = "1"
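
For context: the guard above relies on distutils.version.LooseVersion, which compares version strings component-wise rather than lexically (assert_pyspark_version's `pyspark_ver < "2.4"`, by contrast, is a plain string comparison, which is good enough for the versions involved). A small illustrative sketch of the comparison behavior the check depends on:

from distutils.version import LooseVersion

# Component-wise comparison: "0.15.1" sorts after "0.15" and "2.4.4" before "3.0".
assert LooseVersion("0.15.1") >= LooseVersion("0.15")
assert LooseVersion("2.4.4") < LooseVersion("3.0")
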
@@ -53,10 +59,31 @@ def assert_pyspark_version():
from databricks.koalas.config import get_option, set_option, reset_option, options
from databricks.koalas.groupby import NamedAgg

__all__ = ['read_csv', 'read_parquet', 'to_datetime', 'from_pandas',
'get_dummies', 'DataFrame', 'Series', 'Index', 'MultiIndex', 'pandas_wraps',
'sql', 'range', 'concat', 'melt', 'get_option', 'set_option', 'reset_option',
'read_sql_table', 'read_sql_query', 'read_sql', 'options', 'option_context', 'NamedAgg']
__all__ = [
"read_csv",
"read_parquet",
"to_datetime",
"from_pandas",
"get_dummies",
"DataFrame",
"Series",
"Index",
"MultiIndex",
"pandas_wraps",
"sql",
"range",
"concat",
"melt",
"get_option",
"set_option",
"reset_option",
"read_sql_table",
"read_sql_query",
"read_sql",
"options",
"option_context",
"NamedAgg",
]


def _auto_patch():
@@ -68,21 +95,29 @@ def _auto_patch():
if logger_module is not None:
try:
from databricks.koalas import usage_logging

usage_logging.attach(logger_module)
except Exception as e:
from pyspark.util import _exception_message
logger = logging.getLogger('databricks.koalas.usage_logger')
logger.warning('Tried to attach usage logger `{}`, but an exception was raised: {}'
.format(logger_module, _exception_message(e)))

logger = logging.getLogger("databricks.koalas.usage_logger")
logger.warning(
"Tried to attach usage logger `{}`, but an exception was raised: {}".format(
logger_module, _exception_message(e)
)
)

# Autopatching is on by default.
x = os.getenv("SPARK_KOALAS_AUTOPATCH", "true")
if x.lower() in ("true", "1", "enabled"):
logger = logging.getLogger('spark')
logger.info("Patching spark automatically. You can disable it by setting "
"SPARK_KOALAS_AUTOPATCH=false in your environment")
logger = logging.getLogger("spark")
logger.info(
"Patching spark automatically. You can disable it by setting "
"SPARK_KOALAS_AUTOPATCH=false in your environment"
)

from pyspark.sql import dataframe as df

df.DataFrame.to_koalas = DataFrame.to_koalas


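_auto_patch() monkey-patches DataFrame.to_koalas onto pyspark.sql.DataFrame and runs on import unless SPARK_KOALAS_AUTOPATCH is set to false. A minimal usage sketch (assumes a local SparkSession can be created; not part of the diff):

from pyspark.sql import SparkSession

import databricks.koalas as ks  # importing koalas triggers _auto_patch()

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "letter"])

# Thanks to the autopatch, plain PySpark DataFrames gain to_koalas().
kdf = sdf.to_koalas()
print(type(kdf))  # the databricks.koalas DataFrame class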