
[jvm-packages] java.lang.NullPointerException: null at ml.dmlc.xgboost4j.java.Booster.predict #5957

Closed
wangfengchao opened this issue Jul 30, 2020 · 40 comments

Comments

@wangfengchao

A NullPointerException occurs when predicting through the Java API.

java.lang.NullPointerException: null
at ml.dmlc.xgboost4j.java.Booster.predict(Booster.java:309)
at ml.dmlc.xgboost4j.java.Booster.predict(Booster.java:375)
at com.tuhu.predict.predict.BaseModelPredict.predict(BaseModelPredict.java:71)
at com.tuhu.predict.predict.XgboostFindPageModelPredict.predict(XgboostFindPageModelPredict.java:53)
at com.tuhu.predict.service.impl.MlpFindPageFeatureServiceImpl.featureProcess(MlpFindPageFeatureServiceImpl.java:65)
at com.tuhu.predict.api.controller.MlpFindPageController.recommendPredict(MlpFindPageController.java:49)
at com.tuhu.predict.api.controller.MlpFindPageController$$FastClassBySpringCGLIB$$f694b9ff.invoke()
at org.springframework.cglib.proxy.MethodProxy.invoke(MethodProxy.java:204)
at org.springframework.aop.framework.CglibAopProxy$CglibMethodInvocation.invokeJoinpoint(CglibAopProxy.java:746)
at org.springframework.aop.framework.ReflectiveMethodInvocation.proceed(ReflectiveMethodInvocation.java:163)
at org.springframework.aop.framework.adapter.MethodBeforeAdviceInterceptor.invoke(MethodBeforeAdviceInterceptor.java:52)
at org.springframework.aop.framework.ReflectiveMethodInvocation.proceed(ReflectiveMethodInvocation.java:174)
at org.springframework.aop.aspectj.AspectJAfterAdvice.invoke(AspectJAfterAdvice.java:47)
at org.springframework.aop.framework.ReflectiveMethodInvocation.proceed(ReflectiveMethodInvocation.java:174)
at org.springframework.aop.framework.adapter.AfterReturningAdviceInterceptor.invoke(AfterReturningAdviceInterceptor.java:52)
at org.springframework.aop.framework.ReflectiveMethodInvocation.proceed(ReflectiveMethodInvocation.java:174)
at org.springframework.aop.aspectj.AspectJAfterThrowingAdvice.invoke(AspectJAfterThrowingAdvice.java:62)
at org.springframework.aop.framework.ReflectiveMethodInvocation.proceed(ReflectiveMethodInvocation.java:174)
at org.springframework.aop.aspectj.MethodInvocationProceedingJoinPoint.proceed(MethodInvocationProceedingJoinPoint.java:88)
at com.tuhu.springcloud.common.annotation.AbstractControllerLogAspect.doAround(AbstractControllerLogAspect.java:104)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.springframework.aop.aspectj.AbstractAspectJAdvice.invokeAdviceMethodWithGivenArgs(AbstractAspectJAdvice.java:644)
at org.springframework.aop.aspectj.AbstractAspectJAdvice.invokeAdviceMethod(AbstractAspectJAdvice.java:633)
at org.springframework.aop.aspectj.AspectJAroundAdvice.invoke(AspectJAroundAdvice.java:70)
at org.springframework.aop.framework.ReflectiveMethodInvocation.proceed(ReflectiveMethodInvocation.java:174)
at org.springframework.aop.interceptor.ExposeInvocationInterceptor.invoke(ExposeInvocationInterceptor.java:92)
at org.springframework.aop.framework.ReflectiveMethodInvocation.proceed(ReflectiveMethodInvocation.java:185)
at org.springframework.aop.framework.CglibAopProxy$DynamicAdvisedInterceptor.intercept(CglibAopProxy.java:688)
at com.tuhu.predict.api.controller.MlpFindPageController$$EnhancerBySpringCGLIB$$560ed775.recommendPredict()
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.springframework.web.method.support.InvocableHandlerMethod.doInvoke(InvocableHandlerMethod.java:209)
at org.springframework.web.method.support.InvocableHandlerMethod.invokeForRequest(InvocableHandlerMethod.java:136)
at org.springframework.web.servlet.mvc.method.annotation.ServletInvocableHandlerMethod.invokeAndHandle(ServletInvocableHandlerMethod.java:102)
at org.springframework.web.servlet.mvc.method.annotation.RequestMappingHandlerAdapter.invokeHandlerMethod(RequestMappingHandlerAdapter.java:877)
at org.springframework.web.servlet.mvc.method.annotation.RequestMappingHandlerAdapter.handleInternal(RequestMappingHandlerAdapter.java:783)
at org.springframework.web.servlet.mvc.method.AbstractHandlerMethodAdapter.handle(AbstractHandlerMethodAdapter.java:87)
at org.springframework.web.servlet.DispatcherServlet.doDispatch(DispatcherServlet.java:991)
at org.springframework.web.servlet.DispatcherServlet.doService(DispatcherServlet.java:925)
at org.springframework.web.servlet.FrameworkServlet.processRequest(FrameworkServlet.java:974)
at org.springframework.web.servlet.FrameworkServlet.doPost(FrameworkServlet.java:877)
at javax.servlet.http.HttpServlet.service(HttpServlet.java:661)
at org.springframework.web.servlet.FrameworkServlet.service(FrameworkServlet.java:851)
at javax.servlet.http.HttpServlet.service(HttpServlet.java:742)
at org.apache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:231)
at org.apache.catalina.core.ApplicationFilterChain.doFilter(ApplicationFilterChain.java:166)
at org.apache.tomcat.websocket.server.WsFilter.doFilter(WsFilter.java:52)
at org.apache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:193)
at org.apache.catalina.core.ApplicationFilterChain.doFilter(ApplicationFilterChain.java:166)
at com.tuhu.soter.starter.filter.SoterDefaultFilter.doFilter(SoterDefaultFilter.java:79)
at org.apache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:193)
at org.apache.catalina.core.ApplicationFilterChain.doFilter(ApplicationFilterChain.java:166)
at com.tuhu.boot.logback.filter.LogFilter.doFilter(LogFilter.java:54)
at org.apache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:193)
at org.apache.catalina.core.ApplicationFilterChain.doFilter(ApplicationFilterChain.java:166)
at org.springframework.boot.actuate.metrics.web.servlet.WebMvcMetricsFilter.filterAndRecordMetrics(WebMvcMetricsFilter.java:158)
at org.springframework.boot.actuate.metrics.web.servlet.WebMvcMetricsFilter.filterAndRecordMetrics(WebMvcMetricsFilter.java:126)
at org.springframework.boot.actuate.metrics.web.servlet.WebMvcMetricsFilter.doFilterInternal(WebMvcMetricsFilter.java:111)
at org.springframework.web.filter.OncePerRequestFilter.doFilter(OncePerRequestFilter.java:107)
at org.apache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:193)
at org.apache.catalina.core.ApplicationFilterChain.doFilter(ApplicationFilterChain.java:166)
at org.springframework.boot.actuate.web.trace.servlet.HttpTraceFilter.doFilterInternal(HttpTraceFilter.java:90)
at org.springframework.web.filter.OncePerRequestFilter.doFilter(OncePerRequestFilter.java:107)
at org.apache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:193)
at org.apache.catalina.core.ApplicationFilterChain.doFilter(ApplicationFilterChain.java:166)
at com.tuhu.boot.common.filter.HeartbeatFilter.doFilter(HeartbeatFilter.java:42)
at org.apache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:193)
at org.apache.catalina.core.ApplicationFilterChain.doFilter(ApplicationFilterChain.java:166)
at com.tuhu.boot.common.filter.MDCFilter.doFilter(MDCFilter.java:47)
at org.apache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:193)
at org.apache.catalina.core.ApplicationFilterChain.doFilter(ApplicationFilterChain.java:166)
at org.springframework.web.filter.RequestContextFilter.doFilterInternal(RequestContextFilter.java:99)
at org.springframework.web.filter.OncePerRequestFilter.doFilter(OncePerRequestFilter.java:107)
at org.apache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:193)
at org.apache.catalina.core.ApplicationFilterChain.doFilter(ApplicationFilterChain.java:166)
at org.springframework.web.filter.HttpPutFormContentFilter.doFilterInternal(HttpPutFormContentFilter.java:109)
at org.springframework.web.filter.OncePerRequestFilter.doFilter(OncePerRequestFilter.java:107)
at org.apache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:193)
at org.apache.catalina.core.ApplicationFilterChain.doFilter(ApplicationFilterChain.java:166)
at org.springframework.web.filter.HiddenHttpMethodFilter.doFilterInternal(HiddenHttpMethodFilter.java:93)
at org.springframework.web.filter.OncePerRequestFilter.doFilter(OncePerRequestFilter.java:107)
at org.apache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:193)
at org.apache.catalina.core.ApplicationFilterChain.doFilter(ApplicationFilterChain.java:166)
at org.springframework.web.filter.CharacterEncodingFilter.doFilterInternal(CharacterEncodingFilter.java:200)
at org.springframework.web.filter.OncePerRequestFilter.doFilter(OncePerRequestFilter.java:107)
at org.apache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:193)
at org.apache.catalina.core.ApplicationFilterChain.doFilter(ApplicationFilterChain.java:166)
at org.apache.catalina.core.StandardWrapperValve.invoke(StandardWrapperValve.java:198)
at org.apache.catalina.core.StandardContextValve.invoke(StandardContextValve.java:96)
at org.apache.catalina.authenticator.AuthenticatorBase.invoke(AuthenticatorBase.java:496)
at org.apache.catalina.core.StandardHostValve.invoke(StandardHostValve.java:140)
at org.apache.catalina.valves.ErrorReportValve.invoke(ErrorReportValve.java:81)
at org.apache.catalina.core.StandardEngineValve.invoke(StandardEngineValve.java:87)
at org.apache.catalina.valves.RemoteIpValve.invoke(RemoteIpValve.java:677)
at org.apache.catalina.connector.CoyoteAdapter.service(CoyoteAdapter.java:342)
at org.apache.coyote.http11.Http11Processor.service(Http11Processor.java:803)
at org.apache.coyote.AbstractProcessorLight.process(AbstractProcessorLight.java:66)
at org.apache.coyote.AbstractProtocol$ConnectionHandler.process(AbstractProtocol.java:790)
at org.apache.tomcat.util.net.NioEndpoint$SocketProcessor.doRun(NioEndpoint.java:1468)
at org.apache.tomcat.util.net.SocketProcessorBase.run(SocketProcessorBase.java:49)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at org.apache.tomcat.util.threads.TaskThread$WrappingRunnable.run(TaskThread.java:61)
at java.lang.Thread.run(Thread.java:748)

@wangfengchao
Author

The model was trained with Python scikit-learn, and incompatibilities showed up later. To save time, the algorithm team wrapped the sklearn-trained XGB model in a thin layer over the Python XGBoost package. I wonder if that's what caused this.

@wangfengchao
Author

[screenshot attached]

@trivialfis
Member

Which version of XGBoost are you using? We previously fixed a bug where the JVM package didn't throw an exception correctly when prediction failed and continued with an empty prediction buffer.

@wangfengchao
Author

Which version of XGBoost are you using? We previously fixed a bug where the JVM package didn't throw an exception correctly when prediction failed and continued with an empty prediction buffer.

Version 1.0 of the company's algorithm platform is used, but version 0.9.0 is used in the algorithm project because of version compatibility issues. Algorithm colleagues used Python to convert the 1.0 model file to 0.9.0. I wonder if this transformation caused it.

@trivialfis
Member

trivialfis commented Jul 30, 2020

I would suggest waiting for 1.2 (#5734) and trying again; we have some important bug fixes in that release. I would also suggest using the same or a later XGBoost version for prediction. XGBoost's binary model is backward compatible; going forward, the JSON-based model is recommended.
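One practical benefit of the JSON-based model is that its metadata is inspectable, so a version or feature-count mismatch like the one in this issue can be checked before scoring. The sketch below is illustrative only: the dict is a handcrafted stub standing in for a real file written by `booster.save_model("model.json")`, reduced to the one field relevant here.

```python
import json

# Handcrafted stub of the relevant slice of an XGBoost JSON model file;
# a real file is produced by booster.save_model("model.json").
model_json = json.dumps(
    {"learner": {"learner_model_param": {"num_feature": "2"}}}
)

# After loading, the feature count the booster expects can be read back
# and compared against the width of the data you are about to score.
model = json.loads(model_json)
num_feature = int(model["learner"]["learner_model_param"]["num_feature"])
print(num_feature)  # -> 2
```

If this number does not match the vector size of your prediction dataframe, prediction is expected to fail.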

@kohsuke

kohsuke commented Oct 14, 2020

I hit the same problem with 1.2.0. So the problem is still here.

@ranInc

ranInc commented Oct 20, 2020

I also got the same problem.
I used xgboost4j to create the model.

is there a workaround?

@ranInc

ranInc commented Oct 20, 2020

This is a big problem for me, it failed jobs in production.

@hcho3
Collaborator

hcho3 commented Oct 20, 2020

@ranInc Are you using the latest version of XGBoost? So far we are not aware of the exact cause of this issue. We will address it on a best-effort basis, and since there's no guarantee as to when the issue could be addressed, I suggest that you investigate an alternative in the meanwhile.

@hcho3
Collaborator

hcho3 commented Oct 20, 2020

@ranInc You can help us by providing a small example program we (developers) can run on our own machine.

@ranInc

ranInc commented Oct 20, 2020

I am running 1.2.0, the latest jar in the Maven repository.
The alternative for me is to go back to Spark 2.4.5 and then use XGBoost 0.9 - and this is what I am doing now.

For the example: I will try to pinpoint the specific model/data causing the job to fail later.

@ranInc

ranInc commented Oct 21, 2020

Hi,
I found the specific model/data.
I am attaching it in this comment.
xgb1.2_bug.zip

This is how you reproduce the bug (keep in mind that if you do not do the repartition here, it works - so it has something to do with the amount or type of data in each partition):

from pyspark.ml.pipeline import PipelineModel
from pyspark.sql import SparkSession
from pyspark.sql.dataframe import DataFrame

df = spark.read.parquet("/tmp/6620294785024229130_features").repartition(200).persist()
df.count()

model = PipelineModel.read().load("/tmp/6620294785024229130_model_xg_only")

predictions = model.transform(df)

predictions.persist()
predictions.count()
predictions.show()

@ranInc

ranInc commented Oct 21, 2020

Do you have any idea when this can be addressed?
This prevents me from using spark 3.0...

@hcho3
Collaborator

hcho3 commented Oct 21, 2020

@ranInc Not yet. We'll let you know when we get around to fixing the bug. Also, can you post the code in Scala? I don't think we ever officially supported the use of PySpark with XGBoost.

@ranInc

ranInc commented Oct 21, 2020

      import org.apache.spark.ml.{Pipeline, PipelineModel}
      val df = spark.read.parquet("/tmp/6620294785024229130_features").repartition(200).persist()
      df.count()

      val model = PipelineModel.read.load("/tmp/6620294785024229130_model_xg_only")

      val predictions = model.transform(df)

      predictions.persist()
      predictions.count()
      predictions.show()

@ranInc

ranInc commented Oct 22, 2020

Another pointer:
it seems that the problem occurs when all the features being sent for prediction are zeros/missing.
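This observation is consistent with how sparse vector storage works: an all-zeros row stores no entries at all. The plain-Python sketch below (the helper name `to_sparse` is hypothetical; it only mimics Spark's `SparseVector` keep-only-nonzeros behavior) shows why such a row can look feature-less to code that infers column counts from stored indices.

```python
def to_sparse(dense):
    """Mimic SparseVector storage: keep only the non-zero entries."""
    indices = [i for i, v in enumerate(dense) if v != 0.0]
    values = [dense[i] for i in indices]
    return indices, values

# A normal row keeps its stored entries...
print(to_sparse([0.0, 3.5]))  # -> ([1], [3.5])

# ...but an all-zeros row stores nothing, so downstream code that infers
# the column count from stored indices sees zero columns for it.
print(to_sparse([0.0, 0.0]))  # -> ([], [])
```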

@ranInc

ranInc commented Oct 27, 2020

I guess no one is working on this?
This basically means xgboost does not work on spark 3 at all.

@hcho3
Collaborator

hcho3 commented Oct 27, 2020

Yeah sorry our hands are quite full right now. We'll get around to this issue at some point. I respectfully ask for your patience. Thanks.

@hcho3
Collaborator

hcho3 commented Nov 21, 2020

@ranInc I had some time today so I tried running the script you provided here. I have reproduced the java.lang.NullPointerException error.

Strangely, the latest development version (master branch) does not crash the same way. Instead, it produces error

Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 7.0 failed 1 times, most recent failure: Lost task 0.0 in stage 7.0 (TID 11, d04389c5babb, executor driver): ml.dmlc.xgboost4j.java.XGBoostError: [00:36:10] /workspace/src/learner.cc:1179: Check failed: learner_model_param_.num_feature >= p_fmat->Info().num_col_ (1 vs. 2) : Number of columns does not match number of features in booster.

I'll investigate further.

@trivialfis
Member

trivialfis commented Nov 21, 2020

I think the error message makes sense now: your input has more features than the model expects for prediction.

Previously, the JVM package would continue after an XGBoost failure, resulting in an empty prediction buffer. I added a check guard recently.

@trivialfis
Member

Just make sure the number of columns in your training dataset is greater than or equal to the number in your prediction dataset.
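The rule above corresponds to the C++ guard quoted later in this thread (`learner_model_param_.num_feature >= p_fmat->Info().num_col_`). Here is a plain-Python paraphrase of that check, for illustration only; the function name is hypothetical and not part of any XGBoost API.

```python
def validate_num_features(num_feature_in_booster, num_col_in_data):
    # Paraphrase of the guard in learner.cc: the prediction data may not
    # have more columns than the booster was trained with.
    if num_feature_in_booster < num_col_in_data:
        raise ValueError(
            "Number of columns does not match number of features in booster."
        )

validate_num_features(2, 2)  # OK: widths match

try:
    validate_num_features(1, 2)  # booster saw 1 feature, data has 2
except ValueError as e:
    print(e)
```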

@ranInc

ranInc commented Nov 21, 2020

Hi,
The model was created using the same number of features.
In Spark it uses one vector column, not multiple columns.
In any case, the vector's size is always the same for fitting and prediction - 100% sure of that.

This has something to do with rows whose features are all zero/missing.
You can see that if you filter the rows with all-zero features out of the dataframe - it works just fine.

@hcho3
Collaborator

hcho3 commented Nov 21, 2020

@ranInc Can you post the full Scala program that generated the model? The error message seems to suggest that your model was trained with a single feature.

@ranInc

ranInc commented Nov 22, 2020

I don't think it will help much, as the code is very generic and has some proprietary transformers;
the code itself is mostly PySpark, not Scala.

The best way to see that the number of features is not the problem is to filter out the rows with all-zero features and use the model - this works without a problem.
You can also keep all the rows and repartition the dataframe into one partition, and that also works.

@hcho3
Collaborator

hcho3 commented Nov 22, 2020

@ranInc I filtered out rows with zero and still facing the same error (java.lang.NullPointerException):

...
df.na.drop(minNonNulls = 1)
...

Is this not the right way to do it?

I don't think it will help much as the code is very generic and has some proprietary transformers

I want to see how many features are being used at training and at prediction time. The error message

ml.dmlc.xgboost4j.java.XGBoostError: [00:36:10] /workspace/src/learner.cc:1179: Check failed: learner_model_param_.num_feature >= p_fmat->Info().num_col_ (1 vs. 2) : Number of columns does not match number of features in booster.

suggests that the model was trained with a single feature and prediction is being made with two features.

Right now, I only have access to the data frame and the serialized model you uploaded. I lack insight into what went into the model training and what went wrong, hindering me from troubleshooting the issue any further. If your program has some proprietary information, is it possible to produce a clean example?

@ranInc

ranInc commented Nov 22, 2020

  1. No, you can do this:

import org.apache.spark.ml.linalg.{SparseVector, Vector}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{callUDF, col}
.......
      // true only for a sparse vector with no stored (non-zero) entries
      val isVectorAllZeros = (col: Vector) => {
        col match {
          case sparse: SparseVector => sparse.indices.isEmpty
          case _ => false
        }
      }

      spark.udf.register("isVectorAllZeros", isVectorAllZeros)
      df = df.withColumn("isEmpty", callUDF("isVectorAllZeros",
        col("features_6620294785024229130"))).where("isEmpty == false")

You can also just re-partition the dataframe like this:

....
df = df.repartition(1)
....

  2. I understand, but the code won't give you much, because it uses VectorAssembler, and you won't be able to know how many features were actually used.
     But I am 100% sure it used the same number of features.

@hcho3
Collaborator

hcho3 commented Nov 22, 2020

But I am 100% sure it used the same amount of features.

How did you ensure this, if VectorAssembler can produce a variable number of features?

@ranInc

ranInc commented Nov 22, 2020

VectorAssembler always creates the same number of features; it just needs the names of the columns to grab from.
The code itself is used to create thousands of models, so it is very generic and basically gets a list of column names to use.

I might be able to run the model creation again and send you the dataframe used for the model - or any other data you need.
That will take time for me, though; and if you use what I showed before, you will see the model works just fine with 2 features.

@hcho3
Collaborator

hcho3 commented Nov 22, 2020

@ranInc Let me ask one more question: is it correct to say that the example data has a sparse column (VectorAssembler) that has at most two features?

@ranInc

ranInc commented Nov 22, 2020

No.
VectorAssembler is a Transformer that grabs multiple columns and puts them into one vector column.
Vectors are always used for model fitting and prediction in Spark.

The example dataframe here has a vector column.
Some rows are sparse, others dense - all have two features.

@hcho3
Collaborator

hcho3 commented Nov 22, 2020

@ranInc So all rows have two features; some values are missing and others are not. Got it. I will try your suggestion about filtering empty rows.

As you may have guessed, I'm quite new to the Spark ecosystem, so the debugging effort may prove quite difficult. We are currently in need of more developers who know about Spark and Scala programming in general. If you personally know someone who would like to help us improve the JVM package of XGBoost, please let us know.

@hcho3
Collaborator

hcho3 commented Nov 22, 2020

@ranInc I tried filtering empty rows according to your suggestion:

Program A: Example script, without filtering for empty rows

import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.linalg.{SparseVector, Vector}
import org.apache.spark.sql.functions.{callUDF, col}

object Main extends App {
  val spark = SparkSession
      .builder()
      .appName("XGBoost4J-Spark Pipeline Example")
      .getOrCreate()

  val df = spark.read.parquet("/home/ubuntu/data/6620294785024229130_features").repartition(200).persist()
  df.show()

  val model = PipelineModel.read.load("/home/ubuntu/data/6620294785024229130_model_xg_only")

  val predictions = model.transform(df)

  predictions.persist()
  predictions.count()
  predictions.show()
}

Program B: Example with empty row filtering

import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.linalg.{SparseVector, Vector}
import org.apache.spark.sql.functions.{callUDF, col}

object Main extends App {
  val spark = SparkSession
      .builder()
      .appName("XGBoost4J-Spark Pipeline Example")
      .getOrCreate()

  val isVectorAllZeros = (col: Vector) => {
    col match {
      case sparse: SparseVector => (sparse.indices.isEmpty)
      case _ => false
    }
  }
  spark.udf.register("isVectorAllZeros", isVectorAllZeros)

  val df = spark.read.parquet("/home/ubuntu/data/6620294785024229130_features").repartition(200).persist()
                .withColumn("isEmpty", callUDF("isVectorAllZeros", col("features_6620294785024229130")))
                .where("isEmpty == false")
  df.show()

  val model = PipelineModel.read.load("/home/ubuntu/data/6620294785024229130_model_xg_only")

  val predictions = model.transform(df)

  predictions.persist()
  predictions.count()
  predictions.show()
}

Some observations

  • With the stable 1.2.0 release, Program A errors out with java.lang.NullPointerException. Just prior to the NPE, the following warning is displayed in the Spark execution log:
WARNING: /xgboost/src/learner.cc:979: Number of columns does not match number of features in booster. Columns: 0 Features: 1
  • With the stable 1.2.0 release, Program B successfully completes without error.
  • With the development version (latest master branch, commit 42d31d9), both Program A and Program B fail with the following error:
[12:44:57] /home/ubuntu/xgblatest/src/learner.cc:1179: Check failed: learner_model_param_.num_feature >= p_fmat->Info().num_col_ (1 vs. 2) : Number of columns does not match number of features in booster.                                                                                                        
Stack trace:                                                                                                                                   
  [bt] (0) /tmp/libxgboost4j14081654332866852928.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x79) [0x7f7ef62c4e19]
  [bt] (1) /tmp/libxgboost4j14081654332866852928.so(xgboost::LearnerImpl::ValidateDMatrix(xgboost::DMatrix*, bool) const+0x20b) [0x7f7ef63f5f0b]
  [bt] (2) /tmp/libxgboost4j14081654332866852928.so(xgboost::LearnerImpl::Predict(std::shared_ptr<xgboost::DMatrix>, bool, xgboost::HostDeviceVector<float>*, unsigned int, bool, bool, bool, bool, bool)+0x3c3) [0x7f7ef6400233]
  [bt] (3) /tmp/libxgboost4j14081654332866852928.so(XGBoosterPredict+0xec) [0x7f7ef62caa3c]
  [bt] (4) /tmp/libxgboost4j14081654332866852928.so(Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterPredict+0x47) [0x7f7ef62befd7]
  [bt] (5) [0x7f80908a8270]

which is odd because, according to @ranInc, the model was trained with data with two features.

  • I built version 1.2.0-SNAPSHOT from the source (commit 71197d1). This time, both Program A and Program B fail with the feature-mismatch error (learner_model_param_.num_feature >= p_fmat->Info().num_col_ (1 vs. 2) : Number of columns does not match number of features in booster).
  • The difference in behavior between the stable 1.2.0 release and 1.2.0-SNAPSHOT was unexpected and made me quite nervous. In particular, the warning message from 1.2.0
WARNING: /xgboost/src/learner.cc:979: Number of columns does not match number of features in booster. Columns: 0 Features: 1

is not found in the 1.2.0 version of the C++ codebase. Instead, the warning is found in the release_1.0.0 branch:

xgboost/src/learner.cc

Lines 972 to 982 in ea6b117

std::string const msg {
    "Number of columns does not match number of features in booster."
};
if (generic_parameters_.validate_features) {
  CHECK_EQ(learner_model_param_.num_feature, p_fmat->Info().num_col_) << msg;
} else if (!valid_features) {
  // Remove this and make the equality check fatal once spark can fix all failing tests.
  LOG(WARNING) << msg << " "
               << "Columns: " << p_fmat->Info().num_col_ << " "
               << "Features: " << learner_model_param_.num_feature;
}

So does it mean that the 1.2.0 JAR file on Maven Central has libxgboost4j.so from 1.0.0 ?? 🤯 😱

  • Indeed, the 1.2.0 JAR file from Maven Central contains libxgboost4j.so that's actually 1.0.0 (!!!). To find out, download xgboost4j_2.12-1.2.0.jar from Maven Central and extract out libxgboost4j.so file. Then run the following Python script to verify the version of the library file:
import ctypes

lib = ctypes.cdll.LoadLibrary('./libxgboost4j.so')

major = ctypes.c_int()
minor = ctypes.c_int()
patch = ctypes.c_int()

lib.XGBoostVersion(ctypes.byref(major), ctypes.byref(minor), ctypes.byref(patch))
print((major.value, minor.value, patch.value))  # prints (1, 0, 2), indicating version 1.0.2
  • The 1.0.0 issue aside, we clearly see that the trained XGBoost model recognized only a single feature (learner_model_param_.num_feature == 1). Maybe the training data had a feature that was 100% empty?? @ranInc

@hcho3 hcho3 changed the title java.lang.NullPointerException: null at ml.dmlc.xgboost4j.java.Booster.predict(Booster.java:309) at ml.dmlc.xgboost4j.java.Booster.predict(Booster.java:375) [jvm-packages] java.lang.NullPointerException: null at ml.dmlc.xgboost4j.java.Booster.predict Nov 22, 2020
@ranInc

ranInc commented Nov 22, 2020

Do you want me to grab the dataframe used to create the model?
If I'm able to grab it, I think I can create simple Scala code that creates the model.

@hcho3
Collaborator

hcho3 commented Nov 22, 2020

@ranInc My suspicion is that one of the two features in the training data consisted entirely of missing values, setting learner_model_param_.num_feature to 1. So yes, seeing the training data will be very helpful.
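This suspicion can be illustrated with a plain-Python sketch (not XGBoost code; the data layout is hypothetical): when a column count is inferred from sparsely stored rows, a trailing feature that is always missing simply never appears, and the inferred width shrinks by one.

```python
# Each row is a dict {feature_index: value}, the usual sparse encoding.
# Feature 1 is "missing" in every row (e.g. always 0.0 with missing=0.0),
# so it never shows up among the stored indices.
rows = [{0: 1.5}, {0: 2.0}, {0: 0.5}]

# Inferring the width as max stored index + 1 yields 1, not 2: the trailing
# all-missing column has vanished, matching num_feature == 1 in the error.
num_feature = max(i for row in rows for i in row) + 1
print(num_feature)  # -> 1
```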

@ranInc

ranInc commented Nov 22, 2020

Alright, I think I'll have it ready by tomorrow.

@hcho3
Collaborator

hcho3 commented Nov 22, 2020

Created #6426 to keep track of the issue of mismatched libxgboost4j.so. Here (#5957) let's keep the discussion on why learner_model_param_.num_feature is being set to 1.

@ranInc

ranInc commented Nov 22, 2020

It seems you are wrong; the training data has no missing values.
In the example code here, instead of relying on the repartition to reproduce the failure, I used only one row (which has all-zero features) for prediction.

features_creation.zip

import ml.dmlc.xgboost4j.scala.spark.XGBoostRegressor
import org.apache.spark.ml.Pipeline
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.PipelineModel
import org.apache.spark.sql.DataFrame

val df = spark.read.parquet("/tmp/6620294785024229130_only_features_creation").persist()
df.count()

val regressor = new XGBoostRegressor()
    .setFeaturesCol("features_6620294785024229130")
    .setLabelCol("label_6620294785024229130")
    .setPredictionCol("prediction")
    .setMissing(0.0F)
    .setMaxDepth(3)
    .setNumRound(100)
    .setNumWorkers(1)
    
val pipeline = new Pipeline().setStages(Array(regressor))
val model = pipeline.fit(df)

val pred = spark.read.parquet("/tmp/6620294785024229130_features").persist()
pred.count()
pred.where("account_code == 4011593987").show()
model.transform(pred.where("account_code == 4011593987")).show()

@hcho3
Collaborator

hcho3 commented Dec 1, 2020

Thank you for posting the end-to-end example. The end-to-end example produced the NullPointerException on my machine, using the 1.2.0 version of XGBoost4J-Spark. On the other hand, the example runs successfully (no error) when I switched to the 1.2.1 patch version of XGBoost4J-Spark. I also tried 1.3.0-RC1 (available here) and the example also ran successfully.

@ranInc Can you try the 1.2.1 patch release from Maven Central? Also, try 1.3.0-RC1 if you are feeling more adventurous.

@hcho3
Collaborator

hcho3 commented Dec 2, 2020

Possibly resolved by #6426

@trivialfis trivialfis reopened this Dec 2, 2020
@hcho3 hcho3 closed this as completed Dec 10, 2020
@hcho3
Collaborator

hcho3 commented Dec 10, 2020

Closing this for now. Feel free to open a new issue if you run into a problem with 1.3.0 or 1.2.1 (patch) release.
