Stop using Java8 because we no longer support spark < 3.0 (#5234)
* stop using java8

Signed-off-by: harupy <17039389+harupy@users.noreply.github.com>

* remove unused _format_exception

Signed-off-by: harupy <17039389+harupy@users.noreply.github.com>

* use java8 for mleap

Signed-off-by: harupy <17039389+harupy@users.noreply.github.com>

* comment

Signed-off-by: harupy <17039389+harupy@users.noreply.github.com>

* move mleap test

Signed-off-by: harupy <17039389+harupy@users.noreply.github.com>

* try workaround

Signed-off-by: harupy <17039389+harupy@users.noreply.github.com>

* makedirs

Signed-off-by: harupy <17039389+harupy@users.noreply.github.com>

* run setup-java

Signed-off-by: harupy <17039389+harupy@users.noreply.github.com>
harupy committed Jan 7, 2022
1 parent 170f5c3 commit d5b0b55
Showing 6 changed files with 54 additions and 36 deletions.
14 changes: 11 additions & 3 deletions .github/workflows/cross-version-tests.yml
@@ -103,11 +103,19 @@ jobs:
         with:
           repository: ${{ github.event.inputs.repository }}
           ref: ${{ github.event.inputs.ref }}
+      - name: Get Java version
+        id: get-java-version
+        run: |
+          if [ "${{ matrix.package }}" = "mleap" ]
+          then
+            java_version=8
+          else
+            java_version=11
+          fi
+          echo "::set-output name=version::$java_version"
       - uses: actions/setup-java@v2
         with:
-          # GitHub Actions' Ubuntu 20.04 image uses Java 11 (which is incompatible with Spark 2.4.x) by default:
-          # https://github.com/actions/virtual-environments/blob/main/images/linux/Ubuntu2004-README.md#java
-          java-version: 8
+          java-version: ${{ steps.get-java-version.outputs.version }}
           distribution: "adopt"
       - name: Get python version
         id: get-python-version
16 changes: 6 additions & 10 deletions .github/workflows/master.yml
@@ -57,9 +57,7 @@ jobs:
       - uses: actions/checkout@master
       - uses: actions/setup-java@v2
         with:
-          # GitHub Actions' Ubuntu 20.04 image uses Java 11 (which is incompatible with Spark 2.4.x) by default:
-          # https://github.com/actions/virtual-environments/blob/main/images/linux/Ubuntu2004-README.md#java
-          java-version: 8
+          java-version: 11
           distribution: 'adopt'
       - name: Re-configure dynamic linker run-time bindings for adoptopenjdk-8-hotspot-amd64
         run: |
@@ -197,9 +195,7 @@ jobs:
           python-version: 3.6
       - uses: actions/setup-java@v2
         with:
-          # GitHub Actions' Ubuntu 20.04 image uses Java 11 (which is incompatible with Spark 2.4.x) by default:
-          # https://github.com/actions/virtual-environments/blob/main/images/linux/Ubuntu2004-README.md#java
-          java-version: 8
+          java-version: 11
           distribution: 'adopt'
       - uses: ./.github/actions/cache-pip
       - name: Install dependencies
@@ -245,7 +241,7 @@ jobs:
       - name: Set up Java
         uses: actions/setup-java@v2
         with:
-          java-version: 8
+          java-version: 11
           distribution: 'adopt'
       - name: Install dependencies
         run: |
@@ -314,7 +310,7 @@ jobs:
           python-version: 3.6
       - uses: actions/setup-java@v2
         with:
-          java-version: 8
+          java-version: 11
           distribution: 'adopt'
       - uses: ./.github/actions/cache-pip
       - name: Install dependencies
@@ -342,7 +338,7 @@ jobs:
           python-version: 3.6
       - uses: actions/setup-java@v2
         with:
-          java-version: 8
+          java-version: 11
           distribution: 'adopt'
       - name: Install dependencies
         env:
@@ -387,7 +383,7 @@ jobs:
           python-version: 3.6
       - uses: actions/setup-java@v2
         with:
-          java-version: 8
+          java-version: 11
           distribution: 'adopt'
       - name: Install dependencies
         env:
3 changes: 3 additions & 0 deletions mlflow/mleap.py
@@ -211,6 +211,9 @@ def add_to_model(mlflow_model, path, spark_model, sample_input):
     from pyspark.ml.pipeline import PipelineModel
     from pyspark.sql import DataFrame
     import mleap.version
+
+    # This import statement adds `serializeToBundle` and `deserializeFromBundle` to `Transformer`:
+    # https://github.com/combust/mleap/blob/37f6f61634798118e2c2eb820ceeccf9d234b810/python/mleap/pyspark/spark_support.py#L32-L33
     from mleap.pyspark.spark_support import SimpleSparkSerializer  # pylint: disable=unused-import
     from py4j.protocol import Py4JError

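The comment added above explains the otherwise-unused import: `mleap.pyspark.spark_support` patches `Transformer` at import time, so the name `SimpleSparkSerializer` is never referenced directly (hence the `# pylint: disable=unused-import` marker). A minimal sketch of that pattern, assuming pyspark is installed; this is an illustrative simplification, not mleap's actual implementation, which routes both calls through Py4J:

from pyspark.ml import Transformer


def serializeToBundle(self, path, dataset=None):
    # mleap's real version hands `self` to SimpleSparkSerializer, which calls into the JVM.
    raise NotImplementedError


def deserializeFromBundle(path):
    raise NotImplementedError


# Runs as a side effect of `import mleap.pyspark.spark_support`, which is why the import
# alone is enough to make `some_transformer.serializeToBundle(...)` available.
Transformer.serializeToBundle = serializeToBundle
Transformer.deserializeFromBundle = staticmethod(deserializeFromBundle)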
5 changes: 0 additions & 5 deletions mlflow/spark.py
@@ -23,7 +23,6 @@
 import posixpath
 import re
 import shutil
-import traceback
 import uuid
 import yaml

@@ -71,10 +70,6 @@
 _logger = logging.getLogger(__name__)


-def _format_exception(ex):
-    return "".join(traceback.format_exception(type(ex), ex, ex.__traceback__))
-
-
 def get_default_pip_requirements():
     """
     :return: A list of default pip requirements for MLflow Models produced by this flavor.
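`_format_exception` had no remaining callers, which is why the `traceback` import above can be dropped along with it. For reference, a minimal standalone equivalent of what the removed helper produced (note that on Python 3.10+, `traceback.format_exception(ex)` also accepts just the exception object):

import traceback


def format_exception(ex):
    # Full formatted traceback as a single string, matching the removed helper's output.
    return "".join(traceback.format_exception(type(ex), ex, ex.__traceback__))


try:
    1 / 0
except ZeroDivisionError as e:
    print(format_exception(e))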
17 changes: 17 additions & 0 deletions tests/mleap/test_mleap_model_export.py
@@ -189,3 +189,20 @@ def test_mleap_module_model_save_with_invalid_sample_input_type_raises_exception
         mlflow.spark.save_model(
             spark_model=spark_model_iris.model, path=model_path, sample_input=invalid_input
         )
+
+
+@pytest.mark.large
+def test_spark_module_model_save_with_mleap_and_unsupported_transformer_raises_exception(
+    spark_model_iris, model_path
+):
+    class CustomTransformer(JavaModel):
+        def _transform(self, dataset):
+            return dataset
+
+    unsupported_pipeline = Pipeline(stages=[CustomTransformer()])
+    unsupported_model = unsupported_pipeline.fit(spark_model_iris.spark_df)
+
+    with pytest.raises(ValueError, match="CustomTransformer"):
+        mlflow.spark.save_model(
+            spark_model=unsupported_model, path=model_path, sample_input=spark_model_iris.spark_df
+        )
35 changes: 17 additions & 18 deletions tests/spark/test_spark_model_export.py
@@ -9,12 +9,12 @@
 from pyspark.ml.classification import LogisticRegression
 from pyspark.ml.feature import VectorAssembler
 from pyspark.ml.pipeline import Pipeline
-from pyspark.ml.wrapper import JavaModel
 import pytest
 from sklearn import datasets
 import shutil
 from collections import namedtuple
 import yaml
+from packaging.version import Version

 import mlflow
 import mlflow.pyfunc.scoring_server as pyfunc_scoring_server
@@ -62,6 +62,22 @@ def spark_custom_env(tmpdir):
 # other tests.
 @pytest.fixture(scope="session", autouse=True)
 def spark_context():
+    if Version(pyspark.__version__) < Version("3.1"):
+        # A workaround for this issue:
+        # https://stackoverflow.com/questions/62109276/errorjava-lang-unsupportedoperationexception-for-pyspark-pandas-udf-documenta
+        spark_home = (
+            os.environ.get("SPARK_HOME")
+            if "SPARK_HOME" in os.environ
+            else os.path.dirname(pyspark.__file__)
+        )
+        conf_dir = os.path.join(spark_home, "conf")
+        os.makedirs(conf_dir, exist_ok=True)
+        with open(os.path.join(conf_dir, "spark-defaults.conf"), "w") as f:
+            conf = """
+spark.driver.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true"
+spark.executor.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true"
+"""
+            f.write(conf)
     conf = pyspark.SparkConf()
     max_tries = 3
     for num_tries in range(max_tries):
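The fixture writes a spark-defaults.conf file rather than setting these options on `SparkConf`, presumably because Spark documents that `spark.driver.extraJavaOptions` must not be set via `SparkConf` in client mode (the driver JVM has already started by then); the version guard reflects that Spark 3.1+ no longer needs the flag. A hedged alternative sketch, not what this commit does, that passes the same flag through `PYSPARK_SUBMIT_ARGS` for a local-mode context, where executors share the driver JVM:

import os

import pyspark

# Hypothetical alternative: export the flag via PYSPARK_SUBMIT_ARGS before the first
# SparkContext is created; pyspark's launcher forwards these arguments to spark-submit.
os.environ["PYSPARK_SUBMIT_ARGS"] = (
    "--driver-java-options -Dio.netty.tryReflectionSetAccessible=true pyspark-shell"
)

sc = pyspark.SparkContext(master="local[2]", appName="netty-flag-check")
print(sc.version)  # this driver JVM now runs with -Dio.netty.tryReflectionSetAccessible=true
sc.stop()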
@@ -635,23 +651,6 @@ def test_pyspark_version_is_logged_without_dev_suffix(spark_model_iris):
     assert any(x == f"pyspark=={unaffected_version}" for x in pip_deps)


-@pytest.mark.large
-def test_spark_module_model_save_with_mleap_and_unsupported_transformer_raises_exception(
-    spark_model_iris, model_path
-):
-    class CustomTransformer(JavaModel):
-        def _transform(self, dataset):
-            return dataset
-
-    unsupported_pipeline = Pipeline(stages=[CustomTransformer()])
-    unsupported_model = unsupported_pipeline.fit(spark_model_iris.spark_df)
-
-    with pytest.raises(ValueError, match="CustomTransformer"):
-        sparkm.save_model(
-            spark_model=unsupported_model, path=model_path, sample_input=spark_model_iris.spark_df
-        )
-
-
 def test_shutil_copytree_without_file_permissions(tmpdir):
     src_dir = tmpdir.mkdir("src-dir")
     dst_dir = tmpdir.mkdir("dst-dir")
