From 6f1b8f980764da9047cf5af241b246c20da9bd07 Mon Sep 17 00:00:00 2001
From: harupy <17039389+harupy@users.noreply.github.com>
Date: Fri, 7 Jan 2022 19:44:27 +0900
Subject: [PATCH 1/8] stop using java8

Signed-off-by: harupy <17039389+harupy@users.noreply.github.com>
---
 .github/workflows/cross-version-tests.yml |  6 -----
 .github/workflows/master.yml              | 29 -----------------------
 2 files changed, 35 deletions(-)

diff --git a/.github/workflows/cross-version-tests.yml b/.github/workflows/cross-version-tests.yml
index 52441ff0c9af1..dda16343d16aa 100644
--- a/.github/workflows/cross-version-tests.yml
+++ b/.github/workflows/cross-version-tests.yml
@@ -103,12 +103,6 @@ jobs:
         with:
           repository: ${{ github.event.inputs.repository }}
           ref: ${{ github.event.inputs.ref }}
-      - uses: actions/setup-java@v2
-        with:
-          # GitHub Actions' Ubuntu 20.04 image uses Java 11 (which is incompatible with Spark 2.4.x) by default:
-          # https://github.com/actions/virtual-environments/blob/main/images/linux/Ubuntu2004-README.md#java
-          java-version: 8
-          distribution: "adopt"
       - name: Get python version
         id: get-python-version
         run: |
diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml
index f379f318b7bc2..f3481db373631 100644
--- a/.github/workflows/master.yml
+++ b/.github/workflows/master.yml
@@ -55,12 +55,6 @@ jobs:
           # deleting other version(s) of JDK because they are not needed and they might interfere with JNI linker configuration in the 'setup-r' step
           sudo apt-get -y remove --purge default-jdk adoptopenjdk-11-hotspot || :
       - uses: actions/checkout@master
-      - uses: actions/setup-java@v2
-        with:
-          # GitHub Actions' Ubuntu 20.04 image uses Java 11 (which is incompatible with Spark 2.4.x) by default:
-          # https://github.com/actions/virtual-environments/blob/main/images/linux/Ubuntu2004-README.md#java
-          java-version: 8
-          distribution: 'adopt'
       - name: Re-configure dynamic linker run-time bindings for adoptopenjdk-8-hotspot-amd64
         run: |
           sudo mkdir -p /etc/ld.so.conf.d
@@ -195,12 +189,6 @@ jobs:
       - uses: ./.github/actions/setup-python
         with:
           python-version: 3.6
-      - uses: actions/setup-java@v2
-        with:
-          # GitHub Actions' Ubuntu 20.04 image uses Java 11 (which is incompatible with Spark 2.4.x) by default:
-          # https://github.com/actions/virtual-environments/blob/main/images/linux/Ubuntu2004-README.md#java
-          java-version: 8
-          distribution: 'adopt'
       - uses: ./.github/actions/cache-pip
       - name: Install dependencies
         env:
@@ -242,11 +230,6 @@ jobs:
       - uses: ./.github/actions/setup-python
         with:
           python-version: 3.6
-      - name: Set up Java
-        uses: actions/setup-java@v2
-        with:
-          java-version: 8
-          distribution: 'adopt'
       - name: Install dependencies
         run: |
           source ./dev/install-common-deps.sh
@@ -312,10 +295,6 @@ jobs:
       - uses: ./.github/actions/setup-python
         with:
           python-version: 3.6
-      - uses: actions/setup-java@v2
-        with:
-          java-version: 8
-          distribution: 'adopt'
      - uses: ./.github/actions/cache-pip
      - name: Install dependencies
        env:
@@ -340,10 +319,6 @@ jobs:
      - uses: ./.github/actions/setup-python
        with:
          python-version: 3.6
-      - uses: actions/setup-java@v2
-        with:
-          java-version: 8
-          distribution: 'adopt'
      - name: Install dependencies
        env:
          INSTALL_SMALL_PYTHON_DEPS: true
@@ -385,10 +360,6 @@ jobs:
      - uses: ./.github/actions/setup-python
        with:
          python-version: 3.6
-      - uses: actions/setup-java@v2
-        with:
-          java-version: 8
-          distribution: 'adopt'
      - name: Install dependencies
        env:
          INSTALL_LARGE_PYTHON_DEPS: true

From a9bce56d157d3dee281561c54a74fab375dc2c05 Mon Sep 17 00:00:00 2001
From: harupy <17039389+harupy@users.noreply.github.com>
Date: Fri, 7 Jan 2022 19:47:03 +0900
Subject: [PATCH 2/8] remove unused _format_exception

Signed-off-by: harupy <17039389+harupy@users.noreply.github.com>
---
 mlflow/spark.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/mlflow/spark.py b/mlflow/spark.py
index 71ef06cd13335..c96626d2d2ee3 100644
--- a/mlflow/spark.py
+++ b/mlflow/spark.py
@@ -23,7 +23,6 @@
 import posixpath
 import re
 import shutil
-import traceback
 import uuid
 import yaml

@@ -71,10 +70,6 @@
 _logger = logging.getLogger(__name__)


-def _format_exception(ex):
-    return "".join(traceback.format_exception(type(ex), ex, ex.__traceback__))
-
-
 def get_default_pip_requirements():
     """
     :return: A list of default pip requirements for MLflow Models produced by this flavor.

From 6ecdfeb6a17362ad416124599608c332833afad0 Mon Sep 17 00:00:00 2001
From: harupy <17039389+harupy@users.noreply.github.com>
Date: Fri, 7 Jan 2022 19:58:25 +0900
Subject: [PATCH 3/8] use java8 for mleap

Signed-off-by: harupy <17039389+harupy@users.noreply.github.com>
---
 .github/workflows/cross-version-tests.yml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/.github/workflows/cross-version-tests.yml b/.github/workflows/cross-version-tests.yml
index dda16343d16aa..a39dce0973ebb 100644
--- a/.github/workflows/cross-version-tests.yml
+++ b/.github/workflows/cross-version-tests.yml
@@ -103,6 +103,13 @@ jobs:
         with:
           repository: ${{ github.event.inputs.repository }}
           ref: ${{ github.event.inputs.ref }}
+      - uses: actions/setup-java@v2
+        if: ${{ matrix.package == 'mleap' }}
+        with:
+          # GitHub Actions' Ubuntu 20.04 image uses Java 11 (which is incompatible with Spark 2.4.x) by default:
+          # https://github.com/actions/virtual-environments/blob/main/images/linux/Ubuntu2004-README.md#java
+          java-version: 8
+          distribution: "adopt"
       - name: Get python version
         id: get-python-version
         run: |

From 5ce1e8b538d444a7e3b95555dc23aba2df721181 Mon Sep 17 00:00:00 2001
From: harupy <17039389+harupy@users.noreply.github.com>
Date: Fri, 7 Jan 2022 20:03:07 +0900
Subject: [PATCH 4/8] comment

Signed-off-by: harupy <17039389+harupy@users.noreply.github.com>
---
 mlflow/mleap.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mlflow/mleap.py b/mlflow/mleap.py
index 99429be8250cd..c99aecabc3034 100644
--- a/mlflow/mleap.py
+++ b/mlflow/mleap.py
@@ -211,6 +211,9 @@ def add_to_model(mlflow_model, path, spark_model, sample_input):
     from pyspark.ml.pipeline import PipelineModel
     from pyspark.sql import DataFrame
     import mleap.version
+
+    # This import statement adds `serializeToBundle` and `deserializeFromBundle` to `Transformer`:
+    # https://github.com/combust/mleap/blob/37f6f61634798118e2c2eb820ceeccf9d234b810/python/mleap/pyspark/spark_support.py#L32-L33
     from mleap.pyspark.spark_support import SimpleSparkSerializer  # pylint: disable=unused-import
     from py4j.protocol import Py4JError

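A note on the comment introduced in PATCH 4/8: the import it documents is kept purely for its side effect. Importing `mleap.pyspark.spark_support` monkey-patches `serializeToBundle`/`deserializeFromBundle` onto PySpark's `Transformer`, which is why an apparently unused import is preserved and the pylint warning is silenced. The sketch below only illustrates that import-time monkey-patching pattern; it is not mleap's actual code, and every name in it is made up.

    # Illustrative sketch of an import-time monkey-patch (hypothetical names).
    class Transformer:  # stand-in for pyspark.ml.Transformer
        pass


    def _serialize_to_bundle(self, path):
        print(f"would serialize {self!r} to {path}")


    # Module-level code runs at import time, so merely importing the module
    # that contains this line attaches the method to the class.
    Transformer.serializeToBundle = _serialize_to_bundle


    if __name__ == "__main__":
        Transformer().serializeToBundle("/tmp/bundle")  # method now exists
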
From 107de7f260a89a1dbec86a1e4a54ba63b5410e6e Mon Sep 17 00:00:00 2001
From: harupy <17039389+harupy@users.noreply.github.com>
Date: Fri, 7 Jan 2022 20:06:39 +0900
Subject: [PATCH 5/8] move mleap test

Signed-off-by: harupy <17039389+harupy@users.noreply.github.com>
---
 tests/mleap/test_mleap_model_export.py | 17 +++++++++++++++++
 tests/spark/test_spark_model_export.py | 18 ------------------
 2 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/tests/mleap/test_mleap_model_export.py b/tests/mleap/test_mleap_model_export.py
index e4da9050c0096..927cc79ee8a05 100644
--- a/tests/mleap/test_mleap_model_export.py
+++ b/tests/mleap/test_mleap_model_export.py
@@ -189,3 +189,20 @@ def test_mleap_module_model_save_with_invalid_sample_input_type_raises_exception
         mlflow.spark.save_model(
             spark_model=spark_model_iris.model, path=model_path, sample_input=invalid_input
         )
+
+
+@pytest.mark.large
+def test_spark_module_model_save_with_mleap_and_unsupported_transformer_raises_exception(
+    spark_model_iris, model_path
+):
+    class CustomTransformer(JavaModel):
+        def _transform(self, dataset):
+            return dataset
+
+    unsupported_pipeline = Pipeline(stages=[CustomTransformer()])
+    unsupported_model = unsupported_pipeline.fit(spark_model_iris.spark_df)
+
+    with pytest.raises(ValueError, match="CustomTransformer"):
+        mlflow.spark.save_model(
+            spark_model=unsupported_model, path=model_path, sample_input=spark_model_iris.spark_df
+        )
diff --git a/tests/spark/test_spark_model_export.py b/tests/spark/test_spark_model_export.py
index 8b4d95edb8407..08aac52f58c48 100644
--- a/tests/spark/test_spark_model_export.py
+++ b/tests/spark/test_spark_model_export.py
@@ -9,7 +9,6 @@
 from pyspark.ml.classification import LogisticRegression
 from pyspark.ml.feature import VectorAssembler
 from pyspark.ml.pipeline import Pipeline
-from pyspark.ml.wrapper import JavaModel
 import pytest
 from sklearn import datasets
 import shutil
@@ -635,23 +634,6 @@ def test_pyspark_version_is_logged_without_dev_suffix(spark_model_iris):
     assert any(x == f"pyspark=={unaffected_version}" for x in pip_deps)


-@pytest.mark.large
-def test_spark_module_model_save_with_mleap_and_unsupported_transformer_raises_exception(
-    spark_model_iris, model_path
-):
-    class CustomTransformer(JavaModel):
-        def _transform(self, dataset):
-            return dataset
-
-    unsupported_pipeline = Pipeline(stages=[CustomTransformer()])
-    unsupported_model = unsupported_pipeline.fit(spark_model_iris.spark_df)
-
-    with pytest.raises(ValueError, match="CustomTransformer"):
-        sparkm.save_model(
-            spark_model=unsupported_model, path=model_path, sample_input=spark_model_iris.spark_df
-        )
-
-
 def test_shutil_copytree_without_file_permissions(tmpdir):
     src_dir = tmpdir.mkdir("src-dir")
     dst_dir = tmpdir.mkdir("dst-dir")

From 52ad3d290123adea97f06e9400985118345030f6 Mon Sep 17 00:00:00 2001
From: harupy <17039389+harupy@users.noreply.github.com>
Date: Fri, 7 Jan 2022 20:50:17 +0900
Subject: [PATCH 6/8] try workaround

Signed-off-by: harupy <17039389+harupy@users.noreply.github.com>
---
 tests/spark/test_spark_model_export.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/tests/spark/test_spark_model_export.py b/tests/spark/test_spark_model_export.py
index 08aac52f58c48..f52f71622a38f 100644
--- a/tests/spark/test_spark_model_export.py
+++ b/tests/spark/test_spark_model_export.py
@@ -14,6 +14,7 @@
 import shutil
 from collections import namedtuple
 import yaml
+from packaging.version import Version

 import mlflow
 import mlflow.pyfunc.scoring_server as pyfunc_scoring_server
@@ -61,6 +62,16 @@ def spark_custom_env(tmpdir):
 # other tests.
 @pytest.fixture(scope="session", autouse=True)
 def spark_context():
+    if Version(pyspark.__version__) < Version("3.1"):
+        # A workaround for this issue:
+        # https://stackoverflow.com/questions/62109276/errorjava-lang-unsupportedoperationexception-for-pyspark-pandas-udf-documenta
+        conf_path = os.path.join(os.path.dirname(pyspark.__file__), "conf/spark-defaults.conf")
+        with open(conf_path, "w") as f:
+            conf = """
+spark.driver.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true"
+spark.executor.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true"
+"""
+            f.write(conf)
     conf = pyspark.SparkConf()
     max_tries = 3
     for num_tries in range(max_tries):

From e90cdf58aac7a934799aca7b55451c3632bf14f8 Mon Sep 17 00:00:00 2001
From: harupy <17039389+harupy@users.noreply.github.com>
Date: Fri, 7 Jan 2022 21:11:17 +0900
Subject: [PATCH 7/8] makedirs

Signed-off-by: harupy <17039389+harupy@users.noreply.github.com>
---
 tests/spark/test_spark_model_export.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/tests/spark/test_spark_model_export.py b/tests/spark/test_spark_model_export.py
index f52f71622a38f..17c29f6e2770b 100644
--- a/tests/spark/test_spark_model_export.py
+++ b/tests/spark/test_spark_model_export.py
@@ -65,8 +65,14 @@ def spark_context():
     if Version(pyspark.__version__) < Version("3.1"):
         # A workaround for this issue:
         # https://stackoverflow.com/questions/62109276/errorjava-lang-unsupportedoperationexception-for-pyspark-pandas-udf-documenta
-        conf_path = os.path.join(os.path.dirname(pyspark.__file__), "conf/spark-defaults.conf")
-        with open(conf_path, "w") as f:
+        spark_home = (
+            os.environ.get("SPARK_HOME")
+            if "SPARK_HOME" in os.environ
+            else os.path.dirname(pyspark.__file__)
+        )
+        conf_dir = os.path.join(spark_home, "conf")
+        os.makedirs(conf_dir, exist_ok=True)
+        with open(os.path.join(conf_dir, "spark-defaults.conf"), "w") as f:
             conf = """
 spark.driver.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true"
 spark.executor.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true"
 """
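A note on PATCH 6/8 and 7/8: the workaround writes the netty flag into `spark-defaults.conf` rather than setting it on the `SparkConf` built inside the fixture. A plausible reason, which the patches do not state, is that `spark.driver.extraJavaOptions` must be known before the driver JVM starts, so in local/client mode it cannot be applied through `SparkConf` from already-running Python code. The snippet below is only an illustrative way to confirm the option was picked up once a context is running; it assumes a local pyspark installation and is not part of the patch.

    import pyspark

    # Start (or reuse) a local context after spark-defaults.conf has been written.
    conf = pyspark.SparkConf().setMaster("local[2]").setAppName("conf-check")
    sc = pyspark.SparkContext.getOrCreate(conf)
    try:
        # Expect "-Dio.netty.tryReflectionSetAccessible=true" if the file was honored.
        print(sc.getConf().get("spark.driver.extraJavaOptions", "<not set>"))
    finally:
        sc.stop()
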
From 927e97e01b67b1cbdcda403419e636c3409ebb42 Mon Sep 17 00:00:00 2001
From: harupy <17039389+harupy@users.noreply.github.com>
Date: Fri, 7 Jan 2022 21:32:52 +0900
Subject: [PATCH 8/8] run setup-java

Signed-off-by: harupy <17039389+harupy@users.noreply.github.com>
---
 .github/workflows/cross-version-tests.yml | 15 ++++++++++----
 .github/workflows/master.yml              | 25 +++++++++++++++++++++++
 2 files changed, 36 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/cross-version-tests.yml b/.github/workflows/cross-version-tests.yml
index a39dce0973ebb..5dfb6e7c4f716 100644
--- a/.github/workflows/cross-version-tests.yml
+++ b/.github/workflows/cross-version-tests.yml
@@ -103,12 +103,19 @@ jobs:
         with:
           repository: ${{ github.event.inputs.repository }}
           ref: ${{ github.event.inputs.ref }}
+      - name: Get Java version
+        id: get-java-version
+        run: |
+          if [ "${{ matrix.package }}" = "mleap" ]
+          then
+            java_version=8
+          else
+            java_version=11
+          fi
+          echo "::set-output name=version::$java_version"
       - uses: actions/setup-java@v2
-        if: ${{ matrix.package == 'mleap' }}
         with:
-          # GitHub Actions' Ubuntu 20.04 image uses Java 11 (which is incompatible with Spark 2.4.x) by default:
-          # https://github.com/actions/virtual-environments/blob/main/images/linux/Ubuntu2004-README.md#java
-          java-version: 8
+          java-version: ${{ steps.get-java-version.outputs.version }}
           distribution: "adopt"
       - name: Get python version
         id: get-python-version
diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml
index f3481db373631..f0d6a309f4665 100644
--- a/.github/workflows/master.yml
+++ b/.github/workflows/master.yml
@@ -55,6 +55,10 @@ jobs:
           # deleting other version(s) of JDK because they are not needed and they might interfere with JNI linker configuration in the 'setup-r' step
           sudo apt-get -y remove --purge default-jdk adoptopenjdk-11-hotspot || :
       - uses: actions/checkout@master
+      - uses: actions/setup-java@v2
+        with:
+          java-version: 11
+          distribution: 'adopt'
       - name: Re-configure dynamic linker run-time bindings for adoptopenjdk-8-hotspot-amd64
         run: |
           sudo mkdir -p /etc/ld.so.conf.d
@@ -189,6 +193,10 @@ jobs:
       - uses: ./.github/actions/setup-python
         with:
           python-version: 3.6
+      - uses: actions/setup-java@v2
+        with:
+          java-version: 11
+          distribution: 'adopt'
       - uses: ./.github/actions/cache-pip
       - name: Install dependencies
         env:
@@ -230,6 +238,11 @@ jobs:
       - uses: ./.github/actions/setup-python
         with:
           python-version: 3.6
+      - name: Set up Java
+        uses: actions/setup-java@v2
+        with:
+          java-version: 11
+          distribution: 'adopt'
       - name: Install dependencies
         run: |
           source ./dev/install-common-deps.sh
@@ -295,6 +308,10 @@ jobs:
       - uses: ./.github/actions/setup-python
         with:
           python-version: 3.6
+      - uses: actions/setup-java@v2
+        with:
+          java-version: 11
+          distribution: 'adopt'
       - uses: ./.github/actions/cache-pip
       - name: Install dependencies
         env:
@@ -319,6 +336,10 @@ jobs:
       - uses: ./.github/actions/setup-python
         with:
           python-version: 3.6
+      - uses: actions/setup-java@v2
+        with:
+          java-version: 11
+          distribution: 'adopt'
       - name: Install dependencies
         env:
           INSTALL_SMALL_PYTHON_DEPS: true
@@ -360,6 +381,10 @@ jobs:
      - uses: ./.github/actions/setup-python
        with:
          python-version: 3.6
+      - uses: actions/setup-java@v2
+        with:
+          java-version: 11
+          distribution: 'adopt'
      - name: Install dependencies
        env:
          INSTALL_LARGE_PYTHON_DEPS: true