Skip to content

Commit

Permalink
[SPARK-39881][PYTHON] Fix erroneous check for black and reenable black validation
Browse files Browse the repository at this point in the history

### What changes were proposed in this pull request?

The previously committed check for running black did not actually work and caused code to be committed that does not follow the linter rules. This patch fixes the way we check if black is locally installed and updates the `dev/reformat-python` script. In addition, we run the script to fix existing style issues. Similar to the original PR #32779, this patch only applies the black checks on the pandas code.

The black version is updated in this PR because, on an empty virtualenv, the selected version of click ends up in a conflict due to an underspecified version of click. See psf/black#2964.

### Why are the changes needed?
We have linter rules; we should actually address them.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
Manual testing.

Closes #37305 from grundprinzip/black_format.

Lead-authored-by: Martin Grund <grundprinzip@gmail.com>
Co-authored-by: Martin Grund <martin.grund@databricks.com>
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
  • Loading branch information
2 people authored and HyukjinKwon committed Aug 1, 2022
1 parent 0044300 commit 8c6c7ae
Show file tree
Hide file tree
Showing 29 changed files with 108 additions and 110 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build_and_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -549,7 +549,7 @@ jobs:
# See also https://github.com/sphinx-doc/sphinx/issues/7551.
# Jinja2 3.0.0+ causes error when building with Sphinx.
# See also https://issues.apache.org/jira/browse/SPARK-35375.
python3.9 -m pip install 'flake8==3.9.0' pydata_sphinx_theme 'mypy==0.920' 'pytest-mypy-plugins==1.9.3' numpydoc 'jinja2<3.0.0' 'black==21.12b0'
python3.9 -m pip install 'flake8==3.9.0' pydata_sphinx_theme 'mypy==0.920' 'pytest-mypy-plugins==1.9.3' numpydoc 'jinja2<3.0.0' 'black==22.6.0'
python3.9 -m pip install 'pandas-stubs==1.2.0.53'
- name: Install R linter dependencies and SparkR
run: |
Expand Down
4 changes: 2 additions & 2 deletions dev/lint-python
Original file line number Diff line number Diff line change
Expand Up @@ -210,9 +210,9 @@ function black_test {
local BLACK_STATUS=

# Skip check if black is not installed.
$BLACK_BUILD 2> /dev/null
$PYTHON_EXECUTABLE -c 'import black' &> /dev/null
if [ $? -ne 0 ]; then
echo "The $BLACK_BUILD command was not found. Skipping black checks for now."
echo "The Python library providing 'black' module was not found. Skipping black checks for now."
echo
return
fi
Expand Down
2 changes: 1 addition & 1 deletion dev/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ testpaths = [
[tool.black]
# When changing the version, we have to update
# GitHub workflow version and dev/reformat-python
required-version = "21.12b0"
required-version = "22.6.0"
line-length = 100
target-version = ['py37']
include = '\.pyi?$'
Expand Down
9 changes: 5 additions & 4 deletions dev/reformat-python
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,16 @@
# limitations under the License.

# The current directory of the script.
PYTHON_EXECUTABLE="${PYTHON_EXECUTABLE:-python3}"
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
FWDIR="$( cd "$DIR"/.. && pwd )"
cd "$FWDIR"

BLACK_BUILD="python -m black"
BLACK_VERSION="21.12b0"
$BLACK_BUILD 2> /dev/null
BLACK_BUILD="${PYTHON_EXECUTABLE} -m black"
BLACK_VERSION="22.6.0"
$PYTHON_EXECUTABLE -c 'import black' 2> /dev/null
if [ $? -ne 0 ]; then
echo "The '$BLACK_BUILD' command was not found. Please install Black, for example, via 'pip install black==$BLACK_VERSION'."
echo "The Python library providing the 'black' module was not found. Please install Black, for example, via 'pip install black==$BLACK_VERSION'."
exit 1
fi

Expand Down
2 changes: 1 addition & 1 deletion dev/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -44,4 +44,4 @@ jira
PyGithub

# pandas API on Spark Code formatter.
black
black==22.6.0
5 changes: 1 addition & 4 deletions python/pyspark/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,10 +309,7 @@ def _do_init(
if sys.version_info[:2] < (3, 8):
with warnings.catch_warnings():
warnings.simplefilter("once")
warnings.warn(
"Python 3.7 support is deprecated in Spark 3.4.",
FutureWarning
)
warnings.warn("Python 3.7 support is deprecated in Spark 3.4.", FutureWarning)

# Broadcast's __reduce__ method stores Broadcast instances here.
# This allows other code to determine which Broadcast instances have
Expand Down
6 changes: 3 additions & 3 deletions python/pyspark/ml/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -968,7 +968,7 @@ class _CountVectorizerParams(JavaParams, HasInputCol, HasOutputCol):

def __init__(self, *args: Any):
super(_CountVectorizerParams, self).__init__(*args)
self._setDefault(minTF=1.0, minDF=1.0, maxDF=2 ** 63 - 1, vocabSize=1 << 18, binary=False)
self._setDefault(minTF=1.0, minDF=1.0, maxDF=2**63 - 1, vocabSize=1 << 18, binary=False)

@since("1.6.0")
def getMinTF(self) -> float:
Expand Down Expand Up @@ -1077,7 +1077,7 @@ def __init__(
*,
minTF: float = 1.0,
minDF: float = 1.0,
maxDF: float = 2 ** 63 - 1,
maxDF: float = 2**63 - 1,
vocabSize: int = 1 << 18,
binary: bool = False,
inputCol: Optional[str] = None,
Expand All @@ -1099,7 +1099,7 @@ def setParams(
*,
minTF: float = 1.0,
minDF: float = 1.0,
maxDF: float = 2 ** 63 - 1,
maxDF: float = 2**63 - 1,
vocabSize: int = 1 << 18,
binary: bool = False,
inputCol: Optional[str] = None,
Expand Down
6 changes: 3 additions & 3 deletions python/pyspark/pandas/data_type_ops/boolean_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,11 +153,11 @@ def pow(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
)
if isinstance(right, numbers.Number):
left = transform_boolean_operand_to_numeric(left, spark_type=as_spark_type(type(right)))
return left ** right
return left**right
else:
assert isinstance(right, IndexOpsMixin)
left = transform_boolean_operand_to_numeric(left, spark_type=right.spark.data_type)
return left ** right
return left**right

def radd(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
_sanitize_list_like(right)
Expand Down Expand Up @@ -217,7 +217,7 @@ def rpow(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
_sanitize_list_like(right)
if isinstance(right, numbers.Number) and not isinstance(right, bool):
left = transform_boolean_operand_to_numeric(left, spark_type=as_spark_type(type(right)))
return right ** left
return right**left
else:
raise TypeError(
"Exponentiation can not be applied to %s and the given type." % self.pretty_name
Expand Down
4 changes: 2 additions & 2 deletions python/pyspark/pandas/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -955,14 +955,14 @@ def rmod(self, other: Any) -> "DataFrame":
)

def pow(self, other: Any) -> "DataFrame":
return self ** other
return self**other

pow.__doc__ = _flex_doc_FRAME.format(
desc="Exponential power of series", op_name="**", equiv="dataframe ** other", reverse="rpow"
)

def rpow(self, other: Any) -> "DataFrame":
return other ** self
return other**self

rpow.__doc__ = _flex_doc_FRAME.format(
desc="Exponential power", op_name="**", equiv="other ** dataframe", reverse="pow"
Expand Down
4 changes: 2 additions & 2 deletions python/pyspark/pandas/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -629,7 +629,7 @@ def rmod(self, other: Any) -> "Series":
)

def pow(self, other: Any) -> "Series":
return self ** other
return self**other

pow.__doc__ = _flex_doc_SERIES.format(
desc="Exponential power of series",
Expand All @@ -640,7 +640,7 @@ def pow(self, other: Any) -> "Series":
)

def rpow(self, other: Any) -> "Series":
return other ** self
return other**self

rpow.__doc__ = _flex_doc_SERIES.format(
desc="Reverse Exponential power",
Expand Down
6 changes: 3 additions & 3 deletions python/pyspark/pandas/tests/data_type_ops/test_binary_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,10 +97,10 @@ def test_mod(self):

def test_pow(self):
self.assertRaises(TypeError, lambda: self.psser ** "x")
self.assertRaises(TypeError, lambda: self.psser ** 1)
self.assertRaises(TypeError, lambda: self.psser**1)

for psser in self.pssers:
self.assertRaises(TypeError, lambda: self.psser ** psser)
self.assertRaises(TypeError, lambda: self.psser**psser)

def test_radd(self):
self.assert_eq(b"1" + self.psser, b"1" + self.pser)
Expand Down Expand Up @@ -128,7 +128,7 @@ def test_rmod(self):

def test_rpow(self):
self.assertRaises(TypeError, lambda: "x" ** self.psser)
self.assertRaises(TypeError, lambda: 1 ** self.psser)
self.assertRaises(TypeError, lambda: 1**self.psser)

def test_and(self):
self.assertRaises(TypeError, lambda: self.psser & True)
Expand Down
36 changes: 18 additions & 18 deletions python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,11 +155,11 @@ def test_pow(self):

b_pser, b_psser = pdf["bool"], psdf["bool"]
# float is always returned in pandas-on-Spark
self.assert_eq((b_pser ** 1).astype("float"), b_psser ** 1)
self.assert_eq(b_pser ** 0.1, b_psser ** 0.1)
self.assert_eq((b_pser**1).astype("float"), b_psser**1)
self.assert_eq(b_pser**0.1, b_psser**0.1)
self.assert_eq(b_pser ** b_pser.astype(float), b_psser ** b_psser.astype(float))
self.assertRaises(TypeError, lambda: b_psser ** b_psser)
self.assertRaises(TypeError, lambda: b_psser ** True)
self.assertRaises(TypeError, lambda: b_psser**b_psser)
self.assertRaises(TypeError, lambda: b_psser**True)

self.assert_eq(b_pser % pdf["float"], b_psser % psdf["float"])
for col in self.non_numeric_df_cols:
Expand Down Expand Up @@ -226,10 +226,10 @@ def test_rpow(self):

b_pser, b_psser = pdf["bool"], psdf["bool"]
# float is returned always in pandas-on-Spark
self.assert_eq((1 ** b_pser).astype(float), 1 ** b_psser)
self.assert_eq(0.1 ** b_pser, 0.1 ** b_psser)
self.assert_eq((1**b_pser).astype(float), 1**b_psser)
self.assert_eq(0.1**b_pser, 0.1**b_psser)
self.assertRaises(TypeError, lambda: "x" ** b_psser)
self.assertRaises(TypeError, lambda: True ** b_psser)
self.assertRaises(TypeError, lambda: True**b_psser)
self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) ** b_psser)
self.assertRaises(TypeError, lambda: datetime.datetime(1994, 1, 1) ** b_psser)

Expand Down Expand Up @@ -547,19 +547,19 @@ def test_pow(self):
pser, psser = pdf["this"], psdf["this"]
# float is always returned in pandas-on-Spark
if extension_float_dtypes_available:
self.check_extension((pser ** 1).astype("Float64"), psser ** 1)
self.check_extension((pser ** 0.1).astype("Float64"), psser ** 0.1)
self.check_extension((pser**1).astype("Float64"), psser**1)
self.check_extension((pser**0.1).astype("Float64"), psser**0.1)
self.check_extension(
(pser ** pser.astype(float)).astype("Float64"), psser ** psser.astype(float)
)
else:
self.assert_eq((pser ** 1).astype("float"), psser ** 1)
self.assert_eq((pser ** 0.1).astype("float"), psser ** 0.1)
self.assert_eq((pser**1).astype("float"), psser**1)
self.assert_eq((pser**0.1).astype("float"), psser**0.1)
self.assert_eq(
(pser ** pser.astype(float)).astype("float"), psser ** psser.astype(float)
)
self.assertRaises(TypeError, lambda: psser ** psser)
self.assertRaises(TypeError, lambda: psser ** True)
self.assertRaises(TypeError, lambda: psser**psser)
self.assertRaises(TypeError, lambda: psser**True)

self.assert_eq(
pser ** pdf["float"],
Expand Down Expand Up @@ -648,13 +648,13 @@ def test_rfloordiv(self):
def test_rpow(self):
pser, psser = self.boolean_pdf["this"], self.boolean_psdf["this"]
if extension_float_dtypes_available:
self.check_extension(pd.Series([1, 1, 1], dtype="Float64", name=psser.name), 1 ** psser)
self.check_extension((0.1 ** pser).astype("Float64"), 0.1 ** psser)
self.check_extension(pd.Series([1, 1, 1], dtype="Float64", name=psser.name), 1**psser)
self.check_extension((0.1**pser).astype("Float64"), 0.1**psser)
else:
self.assert_eq(pd.Series([1, 1, 1], dtype="float", name=psser.name), 1 ** psser)
self.assert_eq((0.1 ** pser).astype("float"), 0.1 ** psser)
self.assert_eq(pd.Series([1, 1, 1], dtype="float", name=psser.name), 1**psser)
self.assert_eq((0.1**pser).astype("float"), 0.1**psser)
self.assertRaises(TypeError, lambda: "x" ** psser)
self.assertRaises(TypeError, lambda: True ** psser)
self.assertRaises(TypeError, lambda: True**psser)
self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) ** psser)
self.assertRaises(TypeError, lambda: datetime.datetime(1994, 1, 1) ** psser)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -111,11 +111,11 @@ def test_mod(self):

def test_pow(self):
self.assertRaises(TypeError, lambda: self.psser ** "x")
self.assertRaises(TypeError, lambda: self.psser ** 1)
self.assertRaises(TypeError, lambda: self.psser**1)

with option_context("compute.ops_on_diff_frames", True):
for psser in self.pssers:
self.assertRaises(TypeError, lambda: self.psser ** psser)
self.assertRaises(TypeError, lambda: self.psser**psser)

def test_radd(self):
self.assertRaises(TypeError, lambda: "x" + self.psser)
Expand All @@ -142,7 +142,7 @@ def test_rmod(self):

def test_rpow(self):
self.assertRaises(TypeError, lambda: "x" ** self.psser)
self.assertRaises(TypeError, lambda: 1 ** self.psser)
self.assertRaises(TypeError, lambda: 1**self.psser)

def test_and(self):
self.assertRaises(TypeError, lambda: self.psser & True)
Expand Down
4 changes: 2 additions & 2 deletions python/pyspark/pandas/tests/data_type_ops/test_complex_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ def test_mod(self):

def test_pow(self):
self.assertRaises(TypeError, lambda: self.psser ** "x")
self.assertRaises(TypeError, lambda: self.psser ** 1)
self.assertRaises(TypeError, lambda: self.psser**1)

psdf = self.array_psdf
for col in self.array_df_cols:
Expand Down Expand Up @@ -215,7 +215,7 @@ def test_rmod(self):

def test_rpow(self):
self.assertRaises(TypeError, lambda: "x" ** self.psser)
self.assertRaises(TypeError, lambda: 1 ** self.psser)
self.assertRaises(TypeError, lambda: 1**self.psser)

def test_and(self):
self.assertRaises(TypeError, lambda: self.psser & True)
Expand Down
10 changes: 5 additions & 5 deletions python/pyspark/pandas/tests/data_type_ops/test_date_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,11 +111,11 @@ def test_mod(self):

def test_pow(self):
self.assertRaises(TypeError, lambda: self.psser ** "x")
self.assertRaises(TypeError, lambda: self.psser ** 1)
self.assertRaises(TypeError, lambda: self.psser ** self.some_date)
self.assertRaises(TypeError, lambda: self.psser**1)
self.assertRaises(TypeError, lambda: self.psser**self.some_date)

for psser in self.pssers:
self.assertRaises(TypeError, lambda: self.psser ** psser)
self.assertRaises(TypeError, lambda: self.psser**psser)

def test_radd(self):
self.assertRaises(TypeError, lambda: "x" + self.psser)
Expand Down Expand Up @@ -151,8 +151,8 @@ def test_rmod(self):

def test_rpow(self):
self.assertRaises(TypeError, lambda: "x" ** self.psser)
self.assertRaises(TypeError, lambda: 1 ** self.psser)
self.assertRaises(TypeError, lambda: self.some_date ** self.psser)
self.assertRaises(TypeError, lambda: 1**self.psser)
self.assertRaises(TypeError, lambda: self.some_date**self.psser)

def test_and(self):
self.assertRaises(TypeError, lambda: self.psser & True)
Expand Down
10 changes: 5 additions & 5 deletions python/pyspark/pandas/tests/data_type_ops/test_datetime_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,11 +115,11 @@ def test_mod(self):

def test_pow(self):
self.assertRaises(TypeError, lambda: self.psser ** "x")
self.assertRaises(TypeError, lambda: self.psser ** 1)
self.assertRaises(TypeError, lambda: self.psser ** self.some_datetime)
self.assertRaises(TypeError, lambda: self.psser**1)
self.assertRaises(TypeError, lambda: self.psser**self.some_datetime)

for psser in self.pssers:
self.assertRaises(TypeError, lambda: self.psser ** psser)
self.assertRaises(TypeError, lambda: self.psser**psser)

def test_radd(self):
self.assertRaises(TypeError, lambda: "x" + self.psser)
Expand Down Expand Up @@ -155,8 +155,8 @@ def test_rmod(self):

def test_rpow(self):
self.assertRaises(TypeError, lambda: "x" ** self.psser)
self.assertRaises(TypeError, lambda: 1 ** self.psser)
self.assertRaises(TypeError, lambda: self.some_datetime ** self.psser)
self.assertRaises(TypeError, lambda: 1**self.psser)
self.assertRaises(TypeError, lambda: self.some_datetime**self.psser)

def test_and(self):
self.assertRaises(TypeError, lambda: self.psser & True)
Expand Down
6 changes: 3 additions & 3 deletions python/pyspark/pandas/tests/data_type_ops/test_null_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,10 +75,10 @@ def test_mod(self):

def test_pow(self):
self.assertRaises(TypeError, lambda: self.psser ** "x")
self.assertRaises(TypeError, lambda: self.psser ** 1)
self.assertRaises(TypeError, lambda: self.psser**1)

for psser in self.pssers:
self.assertRaises(TypeError, lambda: self.psser ** psser)
self.assertRaises(TypeError, lambda: self.psser**psser)

def test_radd(self):
self.assertRaises(TypeError, lambda: "x" + self.psser)
Expand All @@ -105,7 +105,7 @@ def test_rmod(self):

def test_rpow(self):
self.assertRaises(TypeError, lambda: "x" ** self.psser)
self.assertRaises(TypeError, lambda: 1 ** self.psser)
self.assertRaises(TypeError, lambda: 1**self.psser)

def test_from_to_pandas(self):
data = [None, None, None]
Expand Down
14 changes: 7 additions & 7 deletions python/pyspark/pandas/tests/data_type_ops/test_num_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,12 +163,12 @@ def test_pow(self):
for col in self.numeric_df_cols:
pser, psser = pdf[col], psdf[col]
if col in ["float", "float_w_nan"]:
self.assert_eq(pser ** pser, psser ** psser)
self.assert_eq(pser**pser, psser**psser)
self.assert_eq(pser ** pser.astype(bool), psser ** psser.astype(bool))
self.assert_eq(pser ** True, psser ** True)
self.assert_eq(pser ** False, psser ** False)
self.assert_eq(pser ** 1, psser ** 1)
self.assert_eq(pser ** 0, psser ** 0)
self.assert_eq(pser**True, psser**True)
self.assert_eq(pser**False, psser**False)
self.assert_eq(pser**1, psser**1)
self.assert_eq(pser**0, psser**0)

for n_col in self.non_numeric_df_cols:
if n_col == "bool":
Expand Down Expand Up @@ -243,8 +243,8 @@ def test_rpow(self):
# self.assert_eq(1 ** pser, 1 ** psser)
# self.assert_eq(0.1 ** pser, 0.1 ** psser)
self.assertRaises(TypeError, lambda: "x" ** psser)
self.assert_eq((True ** pser).astype(float), True ** psser)
self.assert_eq((False ** pser).astype(float), False ** psser)
self.assert_eq((True**pser).astype(float), True**psser)
self.assert_eq((False**pser).astype(float), False**psser)
self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) ** psser)
self.assertRaises(TypeError, lambda: datetime.datetime(1994, 1, 1) ** psser)

Expand Down

0 comments on commit 8c6c7ae

Please sign in to comment.