
[CUDA] New CUDA version Part 1 #4630

Merged: 176 commits, Mar 23, 2022

Commits (176)
94aed50
new cuda framework
shiyu1994 Apr 20, 2021
18df6b2
add histogram construction kernel
Apr 22, 2021
9b21d2b
before removing multi-gpu
Apr 29, 2021
634a4f1
new cuda framework
Apr 29, 2021
23bcaa2
tree learner cuda kernels
May 6, 2021
6c14cd9
single tree framework ready
May 7, 2021
aa0b3de
single tree training framework
May 9, 2021
bc85ced
remove comments
May 9, 2021
18d957a
boosting with cuda
May 10, 2021
28186c0
optimize for best split find
May 13, 2021
60c7e4e
data split
May 14, 2021
57547fb
move boosting into cuda
May 17, 2021
608fd70
parallel synchronize best split point
May 27, 2021
277be8b
merge split data kernels
Jun 1, 2021
ffcf765
before code refactor
Jun 1, 2021
a58c1e1
use tasks instead of features as units for split finding
Jun 2, 2021
72d41c9
refactor cuda best split finder
Jun 2, 2021
f7a7658
fix configuration error with small leaves in data split
shiyu1994 Jun 2, 2021
b6efd10
skip histogram construction of too small leaf
shiyu1994 Jun 2, 2021
6f4e39d
skip split finding of invalid leaves
shiyu1994 Jun 3, 2021
4072bb8
support row wise with CUDA
shiyu1994 Jun 4, 2021
88ecde9
copy data for split by column
shiyu1994 Jun 4, 2021
dec7501
copy data from host to CPU by column for data partition
shiyu1994 Jun 8, 2021
2dccb7f
add synchronize best splits for one leaf from multiple blocks
shiyu1994 Jun 8, 2021
0168d2c
partition dense row data
shiyu1994 Jun 9, 2021
0570fe0
fix sync best split from task blocks
shiyu1994 Jun 9, 2021
374018c
add support for sparse row wise for CUDA
shiyu1994 Jun 9, 2021
40c49cc
remove useless code
shiyu1994 Jun 9, 2021
dc41a00
add l2 regression objective
shiyu1994 Jun 10, 2021
bd065b7
sparse multi value bin enabled for CUDA
shiyu1994 Jun 11, 2021
a5fadfb
fix cuda ranking objective
shiyu1994 Jun 16, 2021
3202b79
support for number of items <= 2048 per query
Jun 16, 2021
cd687c9
speedup histogram construction by interleaving global memory access
Jun 23, 2021
320c449
split optimization
Jun 28, 2021
eb1d7fa
add cuda tree predictor
Jul 2, 2021
dd177f5
remove comma
Jul 18, 2021
ee836d6
refactor objective and score updater
Jul 19, 2021
0467fce
before use struct
Jul 21, 2021
f05da3c
use structure for split information
Jul 21, 2021
400622a
use structure for leaf splits
Jul 21, 2021
d9d3aa9
return CUDASplitInfo directly after finding best split
Jul 21, 2021
45cf7a7
split with CUDATree directly
Jul 22, 2021
9dea18d
use cuda row data in cuda histogram constructor
Jul 26, 2021
572e2b0
clean src/treelearner/cuda
Jul 26, 2021
fe58d4c
gather shared cuda device functions
Jul 27, 2021
dc461dc
put shared CUDA functions into header file
Jul 27, 2021
ba565c1
change smaller leaf from <= back to < for consistent result with CPU
Jul 27, 2021
a781ef5
add tree predictor
Aug 3, 2021
c8a6fab
remove useless cuda_tree_predictor
Aug 3, 2021
a7504dc
predict on CUDA with pipeline
Aug 4, 2021
896d47b
add global sort algorithms
Aug 9, 2021
fe6ed74
add global argsort for queries with many items in ranking tasks
Aug 9, 2021
7808455
remove limitation of maximum number of items per query in ranking
Aug 11, 2021
7a0d218
add cuda metrics
Aug 16, 2021
ca42f3b
fix CUDA AUC
Aug 18, 2021
c681102
remove debug code
Aug 18, 2021
ea60566
add regression metrics
Aug 19, 2021
5c84788
remove useless file
Aug 19, 2021
c2c2407
don't use mask in shuffle reduce
Aug 19, 2021
b43d367
add more regression objectives
Sep 2, 2021
951aa37
fix cuda mape loss
Sep 3, 2021
b50ce5b
use template for different versions of BitonicArgSortDevice
Sep 3, 2021
f51fd70
add multiclass metrics
Sep 6, 2021
35c742d
add ndcg metric
Sep 7, 2021
510d878
fix cross entropy objectives and metrics
Sep 10, 2021
95f4612
fix cross entropy and ndcg metrics
Sep 10, 2021
bb997d0
add support for customized objective in CUDA
Sep 10, 2021
17b78d1
complete multiclass ova for CUDA
Sep 10, 2021
72aa863
merge master
Sep 13, 2021
8537b8c
separate cuda tree learner
Sep 13, 2021
8fb8562
use shuffle based prefix sum
Sep 13, 2021
883ed15
clean up cuda_algorithms.hpp
Sep 13, 2021
e7ffc3f
add copy subset on CUDA
Sep 15, 2021
d7c4bb4
add bagging for CUDA
Sep 15, 2021
d9bf3e5
clean up code
Sep 15, 2021
95fd61a
copy gradients from host to device
Sep 15, 2021
285c2d6
support bagging without using subset
Sep 15, 2021
1a09c19
add support of bagging with subset for CUDAColumnData
Sep 17, 2021
740f853
add support of bagging with subset for dense CUDARowData
Sep 18, 2021
f42e87e
refactor copy sparse subrow
Sep 24, 2021
0b9ca24
use copy subset for column subset
Sep 26, 2021
9a94240
add reset train data and reset config for CUDA tree learner
Sep 26, 2021
1f6dd90
add USE_CUDA ifdef to cuda tree learner files
Sep 26, 2021
4ca7586
check that dataset doesn't contain CUDA tree learner
Sep 26, 2021
25f57e3
remove printf debug information
Sep 26, 2021
12794b0
use full new cuda tree learner only when using single GPU
Sep 26, 2021
44e47ec
Merge branch 'master' of https://github.com/microsoft/LightGBM into c…
Sep 26, 2021
7e18687
disable all CUDA code when using CPU version
Sep 27, 2021
469e992
recover main.cpp
Sep 27, 2021
f2812c8
add cpp files for multi value bins
Sep 30, 2021
8e884b2
update LightGBM.vcxproj
Sep 30, 2021
9b9a63c
update LightGBM.vcxproj
Sep 30, 2021
e0c9f6f
fix lint errors
Sep 30, 2021
3bba6d7
fix lint errors
Sep 30, 2021
8f9f03e
update Makevars
Sep 30, 2021
01d772d
fix the case with 0 feature and 0 bin
Oct 8, 2021
e57dd15
fix lint errors
Oct 8, 2021
a5b9f7a
recover default device type to cpu
Oct 8, 2021
5f03d45
fix na_as_missing case
Oct 9, 2021
b2aaa9f
fix UpdateDataIndexToLeafIndexKernel
shiyu1994 Oct 15, 2021
0726d87
create CUDA trees when needed in CUDADataPartition::UpdateTrainScore
shiyu1994 Oct 15, 2021
1dea6bc
add refit by tree for cuda tree learner
shiyu1994 Oct 21, 2021
14b9ce9
fix test_refit in test_engine.py
Oct 21, 2021
4b936de
create set of large bin partitions in CUDARowData
Oct 21, 2021
4193768
add histogram construction for columns with a large number of bins
shiyu1994 Oct 23, 2021
0b6e79e
add find best split for categorical features on CUDA
shiyu1994 Oct 26, 2021
25f20a7
add bitvectors for categorical split
shiyu1994 Oct 27, 2021
82c33e4
cuda data partition split for categorical features
shiyu1994 Oct 28, 2021
ca16070
fix split tree with categorical feature
shiyu1994 Oct 29, 2021
c8716f1
fix categorical feature splits
shiyu1994 Nov 4, 2021
4bcaa03
refactor cuda_data_partition.cu with multi-level templates
shiyu1994 Nov 5, 2021
536f603
refactor CUDABestSplitFinder by grouping task information into struct
shiyu1994 Nov 5, 2021
015e099
pre-allocate space for vector split_find_tasks_ in CUDABestSplitFinder
shiyu1994 Nov 8, 2021
4c260d2
fix misuse of reference
shiyu1994 Nov 8, 2021
89d8214
remove useless changes
shiyu1994 Nov 8, 2021
54bc66a
add support for path smoothing
shiyu1994 Nov 9, 2021
86e208a
virtual destructor for LightGBM::Tree
shiyu1994 Nov 9, 2021
d888f1d
fix overlapped cat threshold in best split infos
shiyu1994 Nov 12, 2021
5efe0fb
reset histogram pointers in data partition and spllit finder in Reset…
shiyu1994 Nov 17, 2021
559a569
merge with LightGBM/master
shiyu1994 Nov 17, 2021
0bb88fb
comment useless parameter
shiyu1994 Nov 17, 2021
0678d9a
fix reverse case when na is missing and default bin is zero
shiyu1994 Nov 18, 2021
26130d9
fix mfb_is_na and mfb_is_zero and is_single_feature_column
shiyu1994 Nov 18, 2021
d49e92a
remove debug log
shiyu1994 Nov 19, 2021
3214d68
fix cat_l2 when one-hot
shiyu1994 Nov 23, 2021
361d2b0
merge master
shiyu1994 Nov 23, 2021
85ea408
switch shared histogram size according to CUDA version
shiyu1994 Dec 1, 2021
2af0f5d
gpu_use_dp=true when cuda test
shiyu1994 Dec 1, 2021
d0a628f
revert modification in config.h
shiyu1994 Dec 1, 2021
e0018ea
fix setting of gpu_use_dp=true in .ci/test.sh
shiyu1994 Dec 1, 2021
e54b51a
fix linter errors
shiyu1994 Dec 1, 2021
541235f
fix linter error
shiyu1994 Dec 1, 2021
a2ead3c
recover main.cpp
shiyu1994 Dec 1, 2021
2a81af6
separate cuda_exp and cuda
shiyu1994 Dec 8, 2021
9881075
fix ci bash scripts
shiyu1994 Dec 8, 2021
52b1e88
add USE_CUDA_EXP flag
shiyu1994 Dec 14, 2021
09054a1
Merge branch 'master' into cuda-tree-learner-subset
shiyu1994 Dec 14, 2021
c2a0be8
switch off USE_CUDA_EXP
shiyu1994 Dec 24, 2021
0651cca
Merge remote-tracking branch 'LightGBM/master' into cuda-tree-learner…
shiyu1994 Dec 24, 2021
fbc3760
revert changes in python-packages
shiyu1994 Dec 24, 2021
c58635b
more careful separation for USE_CUDA_EXP
shiyu1994 Dec 24, 2021
93d5950
fix CUDARowData::DivideCUDAFeatureGroups
shiyu1994 Jan 3, 2022
9f6aa8a
revert config.h
shiyu1994 Jan 3, 2022
12d8161
fix test settings for cuda experimental version
shiyu1994 Jan 3, 2022
354845e
skip some tests due to unsupported features or differences in impleme…
shiyu1994 Jan 4, 2022
cb49dd1
fix lint issue by adding a blank line
shiyu1994 Jan 4, 2022
2e2c696
fix lint errors by resorting imports
shiyu1994 Jan 4, 2022
3433674
fix lint errors by resorting imports
shiyu1994 Jan 4, 2022
0c94bdd
Merge branch 'master' into cuda-tree-learner-subset
shiyu1994 Jan 4, 2022
c72d555
fix lint errors by resorting imports
shiyu1994 Jan 4, 2022
63a9dc1
merge cuda.yml and cuda_exp.yml
shiyu1994 Jan 5, 2022
31ac33b
update python version in cuda.yml
shiyu1994 Jan 5, 2022
5f1f38d
remove cuda_exp.yml
shiyu1994 Jan 5, 2022
ba22deb
remove unrelated changes
shiyu1994 Jan 6, 2022
b008424
fix compilation warnings
shiyu1994 Feb 16, 2022
fad4b91
resolve conflicts with master
shiyu1994 Feb 21, 2022
55a94b5
Merge branch 'cuda-tree-learner-subset' of https://github.com/shiyu19…
shiyu1994 Feb 21, 2022
d77dd23
recover task
shiyu1994 Feb 22, 2022
6a9d530
use multi-level template in histogram construction
shiyu1994 Feb 22, 2022
4adca58
ignore NVCC related lines in parameter_generator.py
shiyu1994 Feb 22, 2022
8d99b2b
Merge remote-tracking branch 'LightGBM/master' into cuda-tree-learner…
shiyu1994 Feb 23, 2022
1e23342
update job name for CUDA tests
shiyu1994 Feb 23, 2022
f44b881
apply review suggestions
shiyu1994 Mar 8, 2022
d7b65c4
Update .github/workflows/cuda.yml
shiyu1994 Mar 9, 2022
a6a51fd
Update .github/workflows/cuda.yml
shiyu1994 Mar 9, 2022
9135582
update header
shiyu1994 Mar 9, 2022
cd101ae
remove useless TODOs
shiyu1994 Mar 9, 2022
9af98ac
remove [TODO(shiyu1994): constrain the split with min_data_in_group] …
shiyu1994 Mar 9, 2022
e34fcce
#include <LightGBM/utils/log.h> for USE_CUDA_EXP only
shiyu1994 Mar 9, 2022
499639d
fix include order
shiyu1994 Mar 9, 2022
6fe4874
fix include order
shiyu1994 Mar 9, 2022
3cf4c74
remove extra space
shiyu1994 Mar 9, 2022
34fdfe4
address review comments
shiyu1994 Mar 15, 2022
3bb91ae
add warning when cuda_exp is used together with deterministic
shiyu1994 Mar 15, 2022
e47d009
add comment about gpu_use_dp in .ci/test.sh
shiyu1994 Mar 21, 2022
53430dd
revert changing order of included headers
shiyu1994 Mar 21, 2022
2 changes: 1 addition & 1 deletion .ci/setup.sh
@@ -80,7 +80,7 @@ else # Linux
mv $AMDAPPSDK_PATH/lib/x86_64/sdk/* $AMDAPPSDK_PATH/lib/x86_64/
echo libamdocl64.so > $OPENCL_VENDOR_PATH/amdocl64.icd
fi
if [[ $TASK == "cuda" ]]; then
if [[ $TASK == "cuda" || $TASK == "cuda_exp" ]]; then
echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections
apt-get update
apt-get install --no-install-recommends -y \
32 changes: 26 additions & 6 deletions .ci/test.sh
@@ -190,21 +190,41 @@ if [[ $TASK == "gpu" ]]; then
elif [[ $METHOD == "source" ]]; then
cmake -DUSE_GPU=ON -DOpenCL_INCLUDE_DIR=$AMDAPPSDK_PATH/include/ ..
fi
elif [[ $TASK == "cuda" ]]; then
sed -i'.bak' 's/std::string device_type = "cpu";/std::string device_type = "cuda";/' $BUILD_DIRECTORY/include/LightGBM/config.h
grep -q 'std::string device_type = "cuda"' $BUILD_DIRECTORY/include/LightGBM/config.h || exit -1 # make sure that changes were really done
elif [[ $TASK == "cuda" || $TASK == "cuda_exp" ]]; then
if [[ $TASK == "cuda" ]]; then
sed -i'.bak' 's/std::string device_type = "cpu";/std::string device_type = "cuda";/' $BUILD_DIRECTORY/include/LightGBM/config.h
grep -q 'std::string device_type = "cuda"' $BUILD_DIRECTORY/include/LightGBM/config.h || exit -1 # make sure that changes were really done
else
sed -i'.bak' 's/std::string device_type = "cpu";/std::string device_type = "cuda_exp";/' $BUILD_DIRECTORY/include/LightGBM/config.h
grep -q 'std::string device_type = "cuda_exp"' $BUILD_DIRECTORY/include/LightGBM/config.h || exit -1 # make sure that changes were really done
# by default ``gpu_use_dp=false`` for efficiency. change to ``true`` here for exact results in ci tests
sed -i'.bak' 's/gpu_use_dp = false;/gpu_use_dp = true;/' $BUILD_DIRECTORY/include/LightGBM/config.h
grep -q 'gpu_use_dp = true' $BUILD_DIRECTORY/include/LightGBM/config.h || exit -1 # make sure that changes were really done
Collaborator:

I think it's better to handle this here:

LightGBM/src/io/config.cpp, lines 342 to 346 (at d130bb1):

// force gpu_use_dp for CUDA
if (device_type == std::string("cuda") && !gpu_use_dp) {
Log::Warning("CUDA currently requires double precision calculations.");
gpu_use_dp = true;
}

Also, we can avoid if/else complication by using the $TASK env variable as the value for the device_type config value and the Python-package installation flag.

Collaborator Author:

Sorry for the delay. I'll handle these unresolved comments today.

Collaborator Author (@shiyu1994, Mar 9, 2022):

> I think it's better to handle this here

I'm not sure whether I understand your idea correctly. The old CUDA version only supports double precision training, but the new CUDA version will support both double precision and single precision training; users can specify the mode through gpu_use_dp. We use single precision training in the new CUDA version by default because it is faster without hurting accuracy. However, to ensure that results are identical to those on the CPU (which uses double precision histograms), we need to switch to double precision training in the CI tests. That's why we need to replace the default gpu_use_dp setting here in test.sh.

Collaborator Author:

> Also, we can avoid if/else complication by using the $TASK env variable as the value for the device_type config value and the Python-package installation flag.

After trying, I found that it would further complicate the bash code, since single quotes treat everything inside them literally. Also, the Python option cuda-exp is not identical to the device type cuda_exp (we use - instead of _ for consistency with other Python build options).

Collaborator:

> We use single precision training in the new CUDA version by default because it is faster without hurting accuracy. However, to ensure that results are identical to those on the CPU (which uses double precision histograms), we need to switch to double precision training in the CI tests.

OK, I got it. Thanks for the explanation! Please add a short comment in the CI files explaining why we need to change the gpu_use_dp param here. It wasn't obvious to me, so I thought that single precision isn't supported, just like in our current CUDA implementation. However, that's not true according to your comments.

> After trying, I found that it would further complicate the bash code, since single quotes treat everything inside them literally.

OK, agree with you. Thanks for trying that!
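
To make the outcome of this thread concrete, here is a minimal, editor-added C++ sketch (not code from this PR) of how the config check quoted above could treat the two device types differently: the old cuda device keeps forcing double precision, while cuda_exp leaves gpu_use_dp to the user, which is why the CI script flips the default via sed instead.

```cpp
// Editorial sketch only. The "cuda" branch mirrors the snippet quoted from
// src/io/config.cpp above; the "cuda_exp" comment reflects the behaviour
// described in this thread, not actual code from this PR.
#include <iostream>
#include <string>

struct ConfigSketch {
  std::string device_type = "cpu";
  bool gpu_use_dp = false;  // single precision by default for speed

  void CheckPrecision() {
    // force gpu_use_dp for the old CUDA version
    if (device_type == std::string("cuda") && !gpu_use_dp) {
      std::cerr << "Warning: CUDA currently requires double precision calculations.\n";
      gpu_use_dp = true;
    }
    // "cuda_exp" supports both precisions, so gpu_use_dp is left as the user set it;
    // the CI tests switch it to true in .ci/test.sh to match CPU results exactly.
  }
};

int main() {
  ConfigSketch cfg;
  cfg.device_type = "cuda_exp";
  cfg.CheckPrecision();
  std::cout << "gpu_use_dp = " << std::boolalpha << cfg.gpu_use_dp << "\n";  // false
  return 0;
}
```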

fi
if [[ $METHOD == "pip" ]]; then
cd $BUILD_DIRECTORY/python-package && python setup.py sdist || exit -1
pip install --user $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER.tar.gz -v --install-option=--cuda || exit -1
if [[ $TASK == "cuda" ]]; then
pip install --user $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER.tar.gz -v --install-option=--cuda || exit -1
else
pip install --user $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER.tar.gz -v --install-option=--cuda-exp || exit -1
fi
pytest $BUILD_DIRECTORY/tests/python_package_test || exit -1
exit 0
elif [[ $METHOD == "wheel" ]]; then
cd $BUILD_DIRECTORY/python-package && python setup.py bdist_wheel --cuda || exit -1
if [[ $TASK == "cuda" ]]; then
cd $BUILD_DIRECTORY/python-package && python setup.py bdist_wheel --cuda || exit -1
else
cd $BUILD_DIRECTORY/python-package && python setup.py bdist_wheel --cuda-exp || exit -1
fi
pip install --user $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER*.whl -v || exit -1
pytest $BUILD_DIRECTORY/tests || exit -1
exit 0
elif [[ $METHOD == "source" ]]; then
cmake -DUSE_CUDA=ON ..
if [[ $TASK == "cuda" ]]; then
cmake -DUSE_CUDA=ON ..
else
cmake -DUSE_CUDA_EXP=ON ..
fi
fi
elif [[ $TASK == "mpi" ]]; then
if [[ $METHOD == "pip" ]]; then
15 changes: 14 additions & 1 deletion .github/workflows/cuda.yml
@@ -16,7 +16,7 @@ env:

jobs:
test:
name: cuda ${{ matrix.cuda_version }} ${{ matrix.method }} (linux, ${{ matrix.compiler }}, Python ${{ matrix.python_version }})
name: ${{ matrix.tree_learner }} ${{ matrix.cuda_version }} ${{ matrix.method }} (linux, ${{ matrix.compiler }}, Python ${{ matrix.python_version }})
runs-on: [self-hosted, linux]
timeout-minutes: 60
strategy:
@@ -27,14 +27,27 @@ jobs:
compiler: gcc
python_version: "3.8"
cuda_version: "11.5.1"
tree_learner: cuda
- method: pip
compiler: clang
python_version: "3.9"
cuda_version: "10.0"
tree_learner: cuda
- method: wheel
compiler: gcc
python_version: "3.10"
cuda_version: "9.0"
tree_learner: cuda
- method: source
compiler: gcc
python_version: "3.8"
cuda_version: "11.5.1"
tree_learner: cuda_exp
- method: pip
compiler: clang
python_version: "3.9"
cuda_version: "10.0"
tree_learner: cuda_exp
steps:
- name: Setup or update software on host machine
run: |
34 changes: 27 additions & 7 deletions CMakeLists.txt
@@ -5,6 +5,7 @@ option(USE_SWIG "Enable SWIG to generate Java API" OFF)
option(USE_HDFS "Enable HDFS support (EXPERIMENTAL)" OFF)
option(USE_TIMETAG "Set to ON to output time costs" OFF)
option(USE_CUDA "Enable CUDA-accelerated training (EXPERIMENTAL)" OFF)
option(USE_CUDA_EXP "Enable CUDA-accelerated training with more acceleration (EXPERIMENTAL)" OFF)
option(USE_DEBUG "Set to ON for Debug mode" OFF)
option(USE_SANITIZER "Use santizer flags" OFF)
set(
@@ -28,7 +29,7 @@ if(__INTEGRATE_OPENCL)
cmake_minimum_required(VERSION 3.11)
elseif(USE_GPU OR APPLE)
cmake_minimum_required(VERSION 3.2)
elseif(USE_CUDA)
elseif(USE_CUDA OR USE_CUDA_EXP)
cmake_minimum_required(VERSION 3.16)
else()
cmake_minimum_required(VERSION 3.0)
@@ -133,7 +134,7 @@ else()
add_definitions(-DUSE_SOCKET)
endif()

if(USE_CUDA)
if(USE_CUDA OR USE_CUDA_EXP)
set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}")
enable_language(CUDA)
set(USE_OPENMP ON CACHE BOOL "CUDA requires OpenMP" FORCE)
@@ -171,8 +172,12 @@ if(__INTEGRATE_OPENCL)
endif()
endif()

if(USE_CUDA)
find_package(CUDA 9.0 REQUIRED)
if(USE_CUDA OR USE_CUDA_EXP)
if(USE_CUDA)
find_package(CUDA 9.0 REQUIRED)
else()
find_package(CUDA 10.0 REQUIRED)
endif()
include_directories(${CUDA_INCLUDE_DIRS})
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=${OpenMP_CXX_FLAGS} -Xcompiler=-fPIC -Xcompiler=-Wall")

@@ -199,7 +204,12 @@ if(USE_CUDA)
endif()
message(STATUS "CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}")

add_definitions(-DUSE_CUDA)
if(USE_CUDA)
add_definitions(-DUSE_CUDA)
elseif(USE_CUDA_EXP)
add_definitions(-DUSE_CUDA_EXP)
endif()

if(NOT DEFINED CMAKE_CUDA_STANDARD)
set(CMAKE_CUDA_STANDARD 11)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
@@ -369,9 +379,17 @@ file(
src/objective/*.cpp
src/network/*.cpp
src/treelearner/*.cpp
if(USE_CUDA)
if(USE_CUDA OR USE_CUDA_EXP)
src/treelearner/*.cu
endif()
if(USE_CUDA_EXP)
src/treelearner/cuda/*.cpp
src/treelearner/cuda/*.cu
src/io/cuda/*.cu
src/io/cuda/*.cpp
src/cuda/*.cpp
src/cuda/*.cu
endif()
)

add_library(lightgbm_objs OBJECT ${SOURCES})
@@ -493,14 +511,16 @@ if(__INTEGRATE_OPENCL)
target_link_libraries(lightgbm_objs PUBLIC ${INTEGRATED_OPENCL_LIBRARIES})
endif()

if(USE_CUDA)
if(USE_CUDA OR USE_CUDA_EXP)
# Disable cmake warning about policy CMP0104. Refer to issue #3754 and PR #4268.
# Custom target properties does not propagate, thus we need to specify for
# each target that contains or depends on cuda source.
set_target_properties(lightgbm_objs PROPERTIES CUDA_ARCHITECTURES OFF)
set_target_properties(_lightgbm PROPERTIES CUDA_ARCHITECTURES OFF)
set_target_properties(lightgbm PROPERTIES CUDA_ARCHITECTURES OFF)

set_target_properties(lightgbm_objs PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

# Device linking is not supported for object libraries.
# Thus we have to specify them on final targets.
set_target_properties(lightgbm PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
4 changes: 4 additions & 0 deletions R-package/src/Makevars.in
@@ -37,6 +37,10 @@ OBJECTS = \
io/parser.o \
io/train_share_states.o \
io/tree.o \
io/dense_bin.o \
io/sparse_bin.o \
io/multi_val_dense_bin.o \
io/multi_val_sparse_bin.o \
metric/dcg_calculator.o \
metric/metric.o \
objective/objective_function.o \
4 changes: 4 additions & 0 deletions R-package/src/Makevars.win.in
@@ -38,6 +38,10 @@ OBJECTS = \
io/parser.o \
io/train_share_states.o \
io/tree.o \
io/dense_bin.o \
io/sparse_bin.o \
io/multi_val_dense_bin.o \
io/multi_val_sparse_bin.o \
metric/dcg_calculator.o \
metric/metric.o \
objective/objective_function.o \
2 changes: 2 additions & 0 deletions docs/Installation-Guide.rst
@@ -636,6 +636,8 @@ To build LightGBM CUDA version, run the following commands:
cmake -DUSE_CUDA=1 ..
make -j4

Recently, a new CUDA version with better efficiency has been implemented as an experimental feature. To build the new CUDA version, replace ``-DUSE_CUDA`` with ``-DUSE_CUDA_EXP`` in the above commands. Please note that the new version requires **CUDA** 10.0 or later libraries.

**Note**: glibc >= 2.14 is required.

**Note**: In some rare cases you may need to install OpenMP runtime library separately (use your package manager and search for ``lib[g|i]omp`` for doing this).
6 changes: 5 additions & 1 deletion docs/Parameters.rst
@@ -199,7 +199,7 @@ Core Parameters

- **Note**: please **don't** change this during training, especially when running multiple jobs simultaneously by external packages, otherwise it may cause undesirable errors

- ``device_type`` :raw-html:`<a id="device_type" title="Permalink to this parameter" href="#device_type">&#x1F517;&#xFE0E;</a>`, default = ``cpu``, type = enum, options: ``cpu``, ``gpu``, ``cuda``, aliases: ``device``
- ``device_type`` :raw-html:`<a id="device_type" title="Permalink to this parameter" href="#device_type">&#x1F517;&#xFE0E;</a>`, default = ``cpu``, type = enum, options: ``cpu``, ``gpu``, ``cuda``, ``cuda_exp``, aliases: ``device``

- device for the tree learning, you can use GPU to achieve the faster learning

@@ -209,6 +209,10 @@

- **Note**: refer to `Installation Guide <./Installation-Guide.rst#build-gpu-version>`__ to build LightGBM with GPU support

- **Note**: ``cuda_exp`` is an experimental CUDA version; the installation guide for ``cuda_exp`` is identical to that for ``cuda``

- **Note**: ``cuda_exp`` is faster than ``cuda`` and will replace ``cuda`` in the future

- ``seed`` :raw-html:`<a id="seed" title="Permalink to this parameter" href="#seed">&#x1F517;&#xFE0E;</a>`, default = ``None``, type = int, aliases: ``random_seed``, ``random_state``

- this seed is used to generate other seeds, e.g. ``data_random_seed``, ``feature_fraction_seed``, etc.
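
As an editor-added illustration of the new ``cuda_exp`` option documented above (not part of this diff), the following sketch selects the experimental device type through LightGBM's C API; the parameter strings and the ``train.csv`` file are placeholders, and the build is assumed to have been configured with ``-DUSE_CUDA_EXP=ON``.

```cpp
// Illustrative usage sketch, not code from this PR.
#include <LightGBM/c_api.h>
#include <cstdio>

int main() {
  DatasetHandle train_data = nullptr;
  BoosterHandle booster = nullptr;
  // device_type=cuda_exp picks the new CUDA tree learner; gpu_use_dp=false keeps
  // the faster single-precision histograms described in the review discussion.
  const char* booster_params =
      "objective=regression device_type=cuda_exp gpu_use_dp=false num_leaves=63";

  if (LGBM_DatasetCreateFromFile("train.csv", "header=true", nullptr, &train_data) != 0) {
    std::fprintf(stderr, "failed to load train.csv\n");
    return 1;
  }
  if (LGBM_BoosterCreate(train_data, booster_params, &booster) != 0) {
    std::fprintf(stderr, "failed to create booster\n");
    return 1;
  }
  int finished = 0;
  for (int iter = 0; iter < 10 && !finished; ++iter) {
    LGBM_BoosterUpdateOneIter(booster, &finished);  // one boosting round on the GPU
  }
  LGBM_BoosterFree(booster);
  LGBM_DatasetFree(train_data);
  return 0;
}
```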
2 changes: 2 additions & 0 deletions helpers/parameter_generator.py
@@ -34,6 +34,8 @@ def get_parameter_infos(
member_infos: List[List[Dict[str, List]]] = []
with open(config_hpp) as config_hpp_file:
for line in config_hpp_file:
if line.strip() in {"#ifndef __NVCC__", "#endif // __NVCC__"}:
continue
if "#pragma region Parameters" in line:
is_inparameter = True
elif "#pragma region" in line and "Parameters" in line:
29 changes: 29 additions & 0 deletions include/LightGBM/bin.h
@@ -119,6 +119,23 @@ class BinMapper {
}
}

/*!
* \brief Maximum categorical value
* \return Maximum categorical value for categorical features, 0 for numerical features
*/
inline int MaxCatValue() const {
if (bin_2_categorical_.size() == 0) {
return 0;
}
int max_cat_value = bin_2_categorical_[0];
for (size_t i = 1; i < bin_2_categorical_.size(); ++i) {
if (bin_2_categorical_[i] > max_cat_value) {
max_cat_value = bin_2_categorical_[i];
}
}
return max_cat_value;
}

/*!
* \brief Get sizes in byte of this object
*/
@@ -379,6 +396,10 @@ class Bin {
* \brief Deep copy the bin
*/
virtual Bin* Clone() = 0;

virtual const void* GetColWiseData(uint8_t* bit_type, bool* is_sparse, std::vector<BinIterator*>* bin_iterator, const int num_threads) const = 0;

virtual const void* GetColWiseData(uint8_t* bit_type, bool* is_sparse, BinIterator** bin_iterator) const = 0;
};


@@ -452,6 +473,14 @@ class MultiValBin {
static constexpr double multi_val_bin_sparse_threshold = 0.25f;

virtual MultiValBin* Clone() = 0;

#ifdef USE_CUDA_EXP
virtual const void* GetRowWiseData(uint8_t* bit_type,
size_t* total_size,
bool* is_sparse,
const void** out_data_ptr,
uint8_t* data_ptr_bit_type) const = 0;
#endif // USE_CUDA_EXP
};

inline uint32_t BinMapper::ValueToBin(double value) const {
Expand Down