From 24a6cbf3d18b9dfcc32230dc4b63d57f370630d0 Mon Sep 17 00:00:00 2001 From: Bobby Wang Date: Wed, 27 Apr 2022 09:52:03 +0800 Subject: [PATCH] [jvm-package] remove the coalesce in barrier mode Barrier mode doesn't allow coalesce operation, which should fail all tests of xgboost4j-spark-gpu, but it doesn't. That's because the test file is extremely small, which results in only 1 PartitionedFile and thus bypasses the check. --- .../java/ml/dmlc/xgboost4j/gpu/java/BoosterTest.java | 2 +- .../xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala | 11 +++-------- .../xgboost4j/scala/rapids/spark/GpuTestSuite.scala | 12 ++++-------- 3 files changed, 8 insertions(+), 17 deletions(-) diff --git a/jvm-packages/xgboost4j-gpu/src/test/java/ml/dmlc/xgboost4j/gpu/java/BoosterTest.java b/jvm-packages/xgboost4j-gpu/src/test/java/ml/dmlc/xgboost4j/gpu/java/BoosterTest.java index 1a8608f74845..c6109a236ddc 100644 --- a/jvm-packages/xgboost4j-gpu/src/test/java/ml/dmlc/xgboost4j/gpu/java/BoosterTest.java +++ b/jvm-packages/xgboost4j-gpu/src/test/java/ml/dmlc/xgboost4j/gpu/java/BoosterTest.java @@ -69,7 +69,7 @@ public void testBooster() throws XGBoostError { .hasHeader().build(); int maxBin = 16; - int round = 100; + int round = 10; //set params Map paramMap = new HashMap() { { diff --git a/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala b/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala index 5176a9cc0106..756b7b54b161 100644 --- a/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala +++ b/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala @@ -407,14 +407,9 @@ object GpuPreXGBoost extends PreXGBoostProvider { } private def repartitionInputData(dataFrame: DataFrame, nWorkers: Int): DataFrame = { - // We can't check dataFrame.rdd.getNumPartitions == nWorkers here, since 
dataFrame.rdd is - // a lazy variable. If we call it here, we will not directly extract RDD[Table] again, - // instead, we will involve Columnar -> Row -> Columnar and decrease the performance - if (nWorkers == 1) { - dataFrame.coalesce(1) - } else { - dataFrame.repartition(nWorkers) - } + // we can't involve any coalesce operation here, since Barrier mode will check + // the RDD patterns which does not allow coalesce. + dataFrame.repartition(nWorkers) } private def repartitionForGroup( diff --git a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuTestSuite.scala b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuTestSuite.scala index 173ddadb8257..4d82459fa53f 100644 --- a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuTestSuite.scala +++ b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuTestSuite.scala @@ -39,13 +39,8 @@ trait GpuTestSuite extends FunSuite with TmpFolderSuite { def enableCsvConf(): SparkConf = { new SparkConf() - .set(RapidsConf.ENABLE_READ_CSV_DATES.key, "true") - .set(RapidsConf.ENABLE_READ_CSV_BYTES.key, "true") - .set(RapidsConf.ENABLE_READ_CSV_SHORTS.key, "true") - .set(RapidsConf.ENABLE_READ_CSV_INTEGERS.key, "true") - .set(RapidsConf.ENABLE_READ_CSV_LONGS.key, "true") - .set(RapidsConf.ENABLE_READ_CSV_FLOATS.key, "true") - .set(RapidsConf.ENABLE_READ_CSV_DOUBLES.key, "true") + .set("spark.rapids.sql.csv.read.float.enabled", "true") + .set("spark.rapids.sql.csv.read.double.enabled", "true") } def withGpuSparkSession[U](conf: SparkConf = new SparkConf())(f: SparkSession => U): U = { @@ -246,12 +241,13 @@ object SparkSessionHolder extends Logging { Locale.setDefault(Locale.US) val builder = SparkSession.builder() - .master("local[1]") + .master("local[2]") .config("spark.sql.adaptive.enabled", "false") .config("spark.rapids.sql.enabled", "false") .config("spark.rapids.sql.test.enabled", 
"false") .config("spark.plugins", "com.nvidia.spark.SQLPlugin") .config("spark.rapids.memory.gpu.pooling.enabled", "false") // Disable RMM for unit tests. + .config("spark.sql.files.maxPartitionBytes", "1000") .appName("XGBoost4j-Spark-Gpu unit test") builder.getOrCreate()