From 328a9f0746a5a91aa0747d55bc5758a4efa29d12 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Mon, 30 Mar 2020 16:11:27 +0000 Subject: [PATCH 001/119] Initial CUDA work --- CMakeLists.txt | 130 ++++++++++- include/LightGBM/application.h | 4 + include/LightGBM/bin.h | 4 + include/LightGBM/config.h | 4 + include/LightGBM/dataset.h | 15 +- include/LightGBM/feature_group.h | 10 + include/LightGBM/tree_learner.h | 3 +- python-package/setup.py | 2 +- src/application/application.cpp | 16 ++ src/boosting/gbdt.cpp | 206 +++++++++++++++++- src/boosting/gbdt.h | 27 ++- src/c_api.cpp | 199 +++++++++++++++++ src/io/config.cpp | 4 +- src/io/config_auto.cpp | 12 + src/io/dataset.cpp | 49 ++++- src/io/dense_bin.hpp | 13 ++ src/io/sparse_bin.hpp | 3 + src/main.cpp | 7 + .../data_parallel_tree_learner.cpp | 5 +- src/treelearner/data_partition.hpp | 2 + .../feature_parallel_tree_learner.cpp | 7 +- src/treelearner/parallel_tree_learner.h | 6 +- src/treelearner/serial_tree_learner.cpp | 14 +- src/treelearner/serial_tree_learner.h | 13 +- src/treelearner/tree_learner.cpp | 11 + .../voting_parallel_tree_learner.cpp | 5 +- 26 files changed, 731 insertions(+), 40 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a07c3fe79b6..4d81828b640 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,9 +1,20 @@ +#LGBM_CUDA Added USE_CUDA flag if(USE_GPU OR APPLE) cmake_minimum_required(VERSION 3.2) +elseif(USE_CUDA) + cmake_minimum_required(VERSION 3.11) + enable_language(CUDA) else() cmake_minimum_required(VERSION 2.8) endif() +#LGBM_CUDA +if(USE_CUDA) + PROJECT(lightgbm LANGUAGES C CXX CUDA) +else() + PROJECT(lightgbm LANGUAGES C CXX) +endif() + PROJECT(lightgbm) OPTION(USE_MPI "Enable MPI-based parallel learning" OFF) @@ -12,7 +23,7 @@ OPTION(USE_GPU "Enable GPU-accelerated training" OFF) OPTION(USE_SWIG "Enable SWIG to generate Java API" OFF) OPTION(USE_HDFS "Enable HDFS support (EXPERIMENTAL)" OFF) OPTION(USE_R35 "Set to ON if your R version is not earlier than 3.5" OFF) -OPTION(USE_TIMETAG "Set to ON to output time costs" OFF) +OPTION(USE_CUDA "Enable CUDA-accelerated training" OFF) # LGBM_CUDA OPTION(USE_DEBUG "Set to ON for Debug mode" OFF) OPTION(BUILD_FOR_R "Set to ON if building lib_lightgbm for use with the R package" OFF) @@ -127,6 +138,101 @@ if(USE_GPU) ADD_DEFINITIONS(-DUSE_GPU) endif(USE_GPU) +#LGBM_CUDA CUDA-specific code +if(USE_CUDA) + find_package(CUDA REQUIRED) + include_directories(${CUDA_INCLUDE_DIRS}) + LIST(APPEND CMAKE_CUDA_FLAGS -g -Xcompiler=-fopenmp -Xcompiler=-fPIC -Xcompiler=-Wall -lineinfo) + CUDA_SELECT_NVCC_ARCH_FLAGS(CUDA_ARCH_FLAGS 7.0) + + LIST(APPEND CMAKE_CUDA_FLAGS ${CUDA_ARCH_FLAGS}) + if(CMAKE_BUILD_TYPE MATCHES Release) + LIST(APPEND CMAKE_CUDA_FLAGS -03) + endif() + + message(STATUS "CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}") + string(REPLACE ";" " " CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}") + message(STATUS "CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}") + set(CMAKE_CUDA_FLAGS_DEBUG -G) + set(CMAKE_CUDA_FLAGS_RELEASE -lineinfo) + + ADD_DEFINITIONS(-DUSE_CUDA) + if (NOT DEFINED CMAKE_CUDA_STANDARD) + set(CMAKE_CUDA_STANDARD 11) + set(CMAKE_CUDA_STANDARD_REQUIRED ON) + endif() + + set(BASE_DEFINES + -DPOWER_FEATURE_WORKGROUPS=12 + -DUSE_CONSTANT_BUF=0 + ) + + set(ALLFEATS_DEFINES + ${BASE_DEFINES} + -DENABLE_ALL_FEATURES + ) + + set(FULLDATA_DEFINES + ${ALLFEATS_DEFINES} + -DIGNORE_INDICES + ) + + #string(REPLACE ";" " " BASE_DEFINES "${BASE_DEFINES}") + #string(REPLACE ";" " " ALLFEATS_DEFINES "${ALLFEATS_DEFINES}") + #string(REPLACE ";" " " FULLDATA_DEFINES 
"${FULLDATA_DEFINES}") + + message(STATUS ALLFEATS_DEFINES: ${ALLFEATS_DEFINES}) + message(STATUS FULLDATA_DEFINES: ${FULLDATA_DEFINES}) + + add_library(histo256_sp_const OBJECT src/treelearner/kernels/histogram256.cu) + set_target_properties(histo256_sp_const PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + target_compile_definitions( + histo256_sp_const PRIVATE + -DCONST_HESSIAN=1 + ${BASE_DEFINES} + ) + + add_library(histo256_sp OBJECT src/treelearner/kernels/histogram256.cu) + set_target_properties(histo256_sp PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + target_compile_definitions( + histo256_sp PRIVATE + -DCONST_HESSIAN=0 + ${BASE_DEFINES} + ) + + add_library(histo256-allfeats_sp_const OBJECT src/treelearner/kernels/histogram256.cu) + set_target_properties(histo256-allfeats_sp_const PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + target_compile_definitions( + histo256-allfeats_sp_const PRIVATE + -DCONST_HESSIAN=1 + ${ALLFEATS_DEFINES} + ) + + add_library(histo256-allfeats_sp OBJECT src/treelearner/kernels/histogram256.cu) + set_target_properties(histo256-allfeats_sp PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + target_compile_definitions( + histo256-allfeats_sp PRIVATE + -DCONST_HESSIAN=0 + ${ALLFEATS_DEFINES} + ) + + add_library(histo256-fulldata_sp_const OBJECT src/treelearner/kernels/histogram256.cu) + set_target_properties(histo256-fulldata_sp_const PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + target_compile_definitions( + histo256-fulldata_sp_const PRIVATE + -DCONST_HESSIAN=1 + ${FULLDATA_DEFINES} + ) + + add_library(histo256-fulldata_sp OBJECT src/treelearner/kernels/histogram256.cu) + set_target_properties(histo256-fulldata_sp PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + target_compile_definitions( + histo256-fulldata_sp PRIVATE + -DCONST_HESSIAN=0 + ${FULLDATA_DEFINES} + ) +endif(USE_CUDA) + if(USE_HDFS) find_package(JNI REQUIRED) find_path(HDFS_INCLUDE_DIR hdfs.h REQUIRED) @@ -224,6 +330,7 @@ if(USE_MPI) include_directories(${MPI_CXX_INCLUDE_PATH}) endif(USE_MPI) +#LGBM_CUDA file(GLOB SOURCES src/application/*.cpp src/boosting/*.cpp @@ -232,6 +339,9 @@ file(GLOB SOURCES src/objective/*.cpp src/network/*.cpp src/treelearner/*.cpp +#ifdef USE_CUDA + src/treelearner/*cu +#endif ) add_executable(lightgbm src/main.cpp ${SOURCES}) @@ -303,6 +413,24 @@ if(USE_GPU) TARGET_LINK_LIBRARIES(_lightgbm ${OpenCL_LIBRARY} ${Boost_LIBRARIES}) endif(USE_GPU) +#LGBM_CUDA +if(USE_CUDA) + TARGET_LINK_LIBRARIES( + lightgbm + histo256_sp_const + histo256_sp + histo256-fulldata_sp_const + histo256-fulldata_sp + ) + TARGET_LINK_LIBRARIES( + _lightgbm + histo256_sp_const + histo256_sp + histo256-fulldata_sp_const + histo256-fulldata_sp + ) +endif(USE_CUDA) + if(USE_HDFS) TARGET_LINK_LIBRARIES(lightgbm ${HDFS_CXX_LIBRARIES}) TARGET_LINK_LIBRARIES(_lightgbm ${HDFS_CXX_LIBRARIES}) diff --git a/include/LightGBM/application.h b/include/LightGBM/application.h index 66541ec006c..911dedd7d94 100644 --- a/include/LightGBM/application.h +++ b/include/LightGBM/application.h @@ -36,6 +36,10 @@ class Application { /*! \brief To call this function to run application*/ inline void Run(); + // LGBM_CUDA + /*! \brief call to get configuration */ + Config GetConfig() {return config_ ;} ; + private: /*! \brief Load parameters from command line and config file*/ void LoadParameters(int argc, char** argv); diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index f817dfabaa8..e541e7039e9 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -288,6 +288,10 @@ class Bin { /*! 
\brief Number of all data */ virtual data_size_t num_data() const = 0; + // LGBM_CUDA + /*! \brief Get data pointer */ + virtual void* get_data() = 0; + virtual void ReSize(data_size_t num_data) = 0; /*! diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 2a3335c1c0a..9622814832b 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -954,6 +954,10 @@ struct Config { // desc = set this to ``true`` to use double precision math on GPU (by default single precision is used) bool gpu_use_dp = false; + // desc = number of gpus (CUDA implementation only) LGBM_CUDA + // desc = default value is 1 + int num_gpu = 1; + #pragma endregion #pragma endregion diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index 802b44b9fc2..e4c5dc56511 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -440,7 +440,10 @@ class Dataset { } return ret; } - void ReSize(data_size_t num_data); + + /* LGBM_CUDA void ReSize(data_size_t num_data); */ + // LGBM_CUDA ReSize() returns true if resized + bool ReSize(data_size_t num_data); void CopySubrow(const Dataset* fullset, const data_size_t* used_indices, data_size_t num_used_indices, bool need_meta_data); @@ -589,6 +592,16 @@ class Dataset { return feature_groups_[i]->is_multi_val_; } + // LGBM_CUDA + inline size_t FeatureGroupSizesInByte(int group) const { + return feature_groups_[group]->FeatureGroupSizesInByte(); + } + + // LGBM_CUDA + inline void* FeatureGroupData(int group) const { + return feature_groups_[group]->FeatureGroupData(); + } + inline double RealThreshold(int i, uint32_t threshold) const { const int group = feature2group_[i]; const int sub_feature = feature2subfeature_[i]; diff --git a/include/LightGBM/feature_group.h b/include/LightGBM/feature_group.h index 2b17e98bb9c..d949beec20e 100644 --- a/include/LightGBM/feature_group.h +++ b/include/LightGBM/feature_group.h @@ -228,6 +228,16 @@ class FeatureGroup { return bin_data_->GetIterator(min_bin, max_bin, most_freq_bin); } + // LGBM_CUDA + inline size_t FeatureGroupSizesInByte() { + return bin_data_->SizesInByte(); + } + + // LGBM_CUDA + inline void* FeatureGroupData() { + return bin_data_->get_data(); + } + inline data_size_t Split(int sub_feature, const uint32_t* threshold, int num_threshold, bool default_left, const data_size_t* data_indices, data_size_t cnt, diff --git a/include/LightGBM/tree_learner.h b/include/LightGBM/tree_learner.h index e0fb3489057..3bc246e8426 100644 --- a/include/LightGBM/tree_learner.h +++ b/include/LightGBM/tree_learner.h @@ -34,7 +34,8 @@ class TreeLearner { * \param train_data The used training data * \param is_constant_hessian True if all hessians share the same value */ - virtual void Init(const Dataset* train_data, bool is_constant_hessian) = 0; + // LGBM_CUDA is_use_subset_ for CUDA + virtual void Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) = 0; virtual void ResetIsConstantHessian(bool is_constant_hessian) = 0; diff --git a/python-package/setup.py b/python-package/setup.py index 73f123baf42..9d8853ddf94 100644 --- a/python-package/setup.py +++ b/python-package/setup.py @@ -212,7 +212,7 @@ def initialize_options(self): self.opencl_library = None self.mpi = 0 self.hdfs = 0 - self.precompile = 0 + self.precompile = 1 self.nomp = 0 self.bit32 = 0 diff --git a/src/application/application.cpp b/src/application/application.cpp index 72e7e489f9b..1b9eabf8a12 100644 --- a/src/application/application.cpp +++ b/src/application/application.cpp @@ -25,6 +25,10 @@ #include 
"predictor.hpp" +#ifdef USE_CUDA +#include +#endif + namespace LightGBM { Common::Timer global_timer; @@ -38,6 +42,18 @@ Application::Application(int argc, char** argv) { if (config_.data.size() == 0 && config_.task != TaskType::kConvertModel) { Log::Fatal("No training/prediction data, application quit"); } + +//LGBM_CUDA +#ifdef USE_CUDA + if (config_.device_type == std::string("cuda")){ + LightGBM::LGBM_config_::current_device=lgbm_device_cuda; + + config_.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ + if (config_.bagging_fraction == 1.0){config_.bagging_fraction = 0.8;} + if (config_.bagging_freq == 0) {config_.bagging_freq = 1;} + } +#endif + } Application::~Application() { diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 7871bbfb086..5f7aac08640 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -17,6 +17,11 @@ namespace LightGBM { +#ifdef USE_CUDA +int LGBM_config_::current_device=lgbm_device_cpu; +int LGBM_config_::current_learner=use_cpu_learner; +#endif + GBDT::GBDT() : iter_(0), train_data_(nullptr), @@ -58,6 +63,19 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective es_first_metric_only_ = config_->first_metric_only; shrinkage_rate_ = config_->learning_rate; +// LGBM_CUDA +#ifdef USE_CUDA + if (config_->device_type == std::string("cuda")) { + // LGBM_config_::current_device=lgbm_device_cuda; moved to application.cpp + LGBM_config_::current_learner=use_cuda_learner; + + /* Following are needed to ensure bagging required by the CUDA implementation */ +// if (config_->bagging_fraction == 1.0){config_->bagging_fraction = 0.8;} moved to application.cpp +// if (config_->bagging_freq == 0) {config_->bagging_freq = 1;} moved to application.cpp + + } +#endif + // load forced_splits file if (!config->forcedsplits_filename.empty()) { std::ifstream forced_splits_file(config->forcedsplits_filename.c_str()); @@ -107,8 +125,23 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective monotone_constraints_ = config->monotone_constraints; // if need bagging, create buffer + // LGBM_CUDA: for CUDA implementation, this function sets up the is_use_subset_ flag as TRUE and tmp_subset_ is allocated. 
ResetBaggingConfig(config_.get(), true); + // LGBM_CUDA + // Two key changes: position of the initializer is moved from the original code, and init() uses is_use_subset_ flag + tree_learner_ = std::unique_ptr(TreeLearner::CreateTreeLearner(config_->tree_learner, config_->device_type, config_.get())); + + // init tree learner + // LGBM_CUDA do not copy feature is is_use_subset for initialization + // LGBM_CUDA initialize training data info with bagging data size (tmp_subset_) + + if (config_->device_type == std::string("cuda")) { + tree_learner_->Init(tmp_subset_.get(), is_constant_hessian_, is_use_subset_); + } else { + tree_learner_->Init(train_data_, is_constant_hessian_, is_use_subset_); + } + class_need_train_ = std::vector(num_tree_per_iteration_, true); if (objective_function_ != nullptr && objective_function_->SkipEmptyClass()) { CHECK_EQ(num_tree_per_iteration_, num_class_); @@ -231,13 +264,19 @@ void GBDT::Bagging(int iter) { // set bagging data to tree learner if (!is_use_subset_) { tree_learner_->SetBaggingData(nullptr, bag_data_indices_.data(), bag_data_cnt_); - } else { - // get subset - tmp_subset_->ReSize(bag_data_cnt_); - tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(), - bag_data_cnt_, false); - tree_learner_->SetBaggingData(tmp_subset_.get(), bag_data_indices_.data(), - bag_data_cnt_); + } else { // LGBM_CUDA + // NEW get subset + bool resized= tmp_subset_->ReSize(bag_data_cnt_); + + if (resized && (config_->device_type == std::string("cuda"))) { + size_t bag_gh_size = static_cast(bag_data_cnt_) * num_tree_per_iteration_; + tmp_gradients_.resize(bag_gh_size); + tmp_hessians_.resize(bag_gh_size); + } + + tmp_subset_->CopySubset(train_data_, bag_data_indices_.data(), bag_data_cnt_, false); + + tree_learner_->ResetTrainingData(tmp_subset_.get()); } } } @@ -245,13 +284,18 @@ void GBDT::Bagging(int iter) { void GBDT::Train(int snapshot_freq, const std::string& model_output_path) { Common::FunctionTimer fun_timer("GBDT::Train", global_timer); bool is_finished = false; + + //LGBM_CUDA auto start_time = std::chrono::steady_clock::now(); + for (int iter = 0; iter < config_->num_iterations && !is_finished; ++iter) { is_finished = TrainOneIter(nullptr, nullptr); if (!is_finished) { is_finished = EvalAndCheckEarlyStopping(); } + auto end_time = std::chrono::steady_clock::now(); + // output used time per iteration Log::Info("%f seconds elapsed, finished iteration %d", std::chrono::duration(end_time - start_time) * 1e-3, iter + 1); @@ -334,7 +378,136 @@ double GBDT::BoostFromAverage(int class_id, bool update_scorer) { return 0.0f; } +// LGBM_CUDA +bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { + + // LGBM_CUDA invoke baggging during the first iteration + if ((config_->device_type == std::string("cuda")) && (iter_ == 0)) { + + auto start_time = std::chrono::steady_clock::now(); + + Bagging(0); + } + + std::vector init_scores(num_tree_per_iteration_, 0.0); + + // boosting first + if (gradients == nullptr || hessians == nullptr) { + for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { + init_scores[cur_tree_id] = BoostFromAverage(cur_tree_id, true); + } + + // LGBM_CUDA + auto start_time = std::chrono::steady_clock::now(); + + Boosting(); + + gradients = gradients_.data(); + hessians = hessians_.data(); + } + + // LGBM_CUDA bagging logic + // Bagging(iter_); + + bool should_continue = false; + for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { + + // LGBM_CUDA + auto start_time = 
std::chrono::steady_clock::now(); + + const size_t offset = static_cast(cur_tree_id) * num_data_; + std::unique_ptr new_tree(new Tree(2)); + + if (class_need_train_[cur_tree_id] && train_data_->num_features() > 0) { + + auto grad = gradients + offset; + auto hess = hessians + offset; + + // LGBM_CUDA + auto tmp_grad = tmp_gradients_.data(); + auto tmp_hess = tmp_hessians_.data(); + + // need to copy gradients for bagging subset. + if (is_use_subset_ && bag_data_cnt_ < num_data_) { + + #pragma omp parallel for schedule(static) // LGBM_CUDA + for (int i = 0; i < bag_data_cnt_; ++i) { + tmp_grad[i] = grad[bag_data_indices_[i]]; // LGBM_CUDA + tmp_hess[i] = hess[bag_data_indices_[i]]; // LGBM_CUDA + } + } + + // LGBM_CUDA + new_tree.reset(tree_learner_->Train(tmp_grad, tmp_hess, is_constant_hessian_, forced_splits_json_)); + } + + if (new_tree->num_leaves() > 1) { + should_continue = true; + auto score_ptr = train_score_updater_->score() + offset; + auto residual_getter = [score_ptr](const label_t* label, int i) {return static_cast(label[i]) - score_ptr[i]; }; + tree_learner_->RenewTreeOutput(new_tree.get(), objective_function_, residual_getter, + num_data_, bag_data_indices_.data(), bag_data_cnt_); + // shrinkage by learning rate + new_tree->Shrinkage(shrinkage_rate_); + // update score + UpdateScore(new_tree.get(), cur_tree_id); + if (std::fabs(init_scores[cur_tree_id]) > kEpsilon) { + new_tree->AddBias(init_scores[cur_tree_id]); + } + } else { + // only add default score one-time + if (models_.size() < static_cast(num_tree_per_iteration_)) { + double output = 0.0; + if (!class_need_train_[cur_tree_id]) { + if (objective_function_ != nullptr) { + output = objective_function_->BoostFromScore(cur_tree_id); + } + } else { + output = init_scores[cur_tree_id]; + } + new_tree->AsConstantTree(output); + // updates scores + train_score_updater_->AddScore(output, cur_tree_id); + for (auto& score_updater : valid_score_updater_) { + score_updater->AddScore(output, cur_tree_id); + } + } + + // LGBM_CUDA: moved for overlapping data copy w/ other operations + int iter_next = iter_ + 1; + if (iter_next < config_->num_iterations) { + + auto start_time = std::chrono::steady_clock::now(); + + // bagging logic + Bagging(iter_next); + + } + } + // add model + models_.push_back(std::move(new_tree)); + } + + if (!should_continue) { + Log::Warning("Stopped training because there are no more leaves that meet the split requirements"); + if (models_.size() > static_cast(num_tree_per_iteration_)) { + for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { + models_.pop_back(); + } + } + return true; + } + + ++iter_; + return false; +} + bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { + + if (config_->device_type == std::string("cuda")){ //LGBM_CUDA + return TrainOneIterCUDA(gradients, hessians); + } + Common::FunctionTimer fun_timer("GBDT::TrainOneIter", global_timer); std::vector init_scores(num_tree_per_iteration_, 0.0); // boosting first @@ -786,9 +959,22 @@ void GBDT::ResetBaggingConfig(const Config* config, bool is_change_dataset) { } } else { bag_data_cnt_ = num_data_; - bag_data_indices_.clear(); - bagging_runner_.ReSize(0); - is_use_subset_ = false; + if (config_->device_type == std::string("cuda")){ // LGBM_CUDA + if (tmp_subset_ == nullptr){ + tmp_subset_.reset(new Dataset(bag_data_cnt_)); + tmp_subset_->CopyFeatureMapperFrom(train_data_); + size_t bag_gh_size = static_cast(bag_data_cnt_) * num_tree_per_iteration_; + tmp_gradients_.resize(bag_gh_size); + 
tmp_hessians_.resize(bag_gh_size); + is_use_subset_ = false; + bag_data_indices_.clear(); + } + } + else { + bag_data_indices_.clear(); + bagging_runner_.ReSize(0); + is_use_subset_ = false; + } } } diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index f3ece67fec0..d460894d44b 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -22,6 +22,10 @@ #include #include +#ifdef USE_CUDA +#include //LGBM_CUDA +#endif + #include "score_updater.hpp" namespace LightGBM { @@ -144,6 +148,14 @@ class GBDT : public GBDTBase { */ bool TrainOneIter(const score_t* gradients, const score_t* hessians) override; + /*! + * \brief Training logic + * \param gradients nullptr for using default objective, otherwise use self-defined boosting + * \param hessians nullptr for using default objective, otherwise use self-defined boosting + * \return True if cannot train any more + */ + bool TrainOneIterCUDA(const score_t* gradients, const score_t* hessians); // LGBM_CUDA + /*! * \brief Rollback one iteration */ @@ -463,10 +475,23 @@ class GBDT : public GBDTBase { std::vector> models_; /*! \brief Max feature index of training data*/ int max_feature_idx_; + +#ifdef USE_CUDA + /*! \brief First order derivative of training data */ + std::vector> gradients_; // LGBM_CUDA + std::vector> tmp_gradients_; // LGBM_CUDA + /*! \brief Second order derivative of training data */ + std::vector> hessians_; // LGBM_CUDA + std::vector> tmp_hessians_; // LGBM_CUDA +#else /*! \brief First order derivative of training data */ std::vector> gradients_; - /*! \brief Secend order derivative of training data */ + std::vector> tmp_gradients_; + /*! \brief Second order derivative of training data */ std::vector> hessians_; + std::vector> tmp_hessians_; +#endif + /*! \brief Store the indices of in-bag data */ std::vector> bag_data_indices_; /*! 
\brief Number of in-bag data */ diff --git a/src/c_api.cpp b/src/c_api.cpp index 290f219fa63..f785bc74f19 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -18,6 +18,10 @@ #include #include +#ifdef USE_CUDA +#include +#endif + #include #include #include @@ -110,6 +114,22 @@ class Booster { if (config_.num_threads > 0) { omp_set_num_threads(config_.num_threads); } + +#ifdef USE_CUDA + // Only use CUDA when the data is large (2048 == 256 bins each with at least 8 elements) + if (train_data->num_data() < 2048){ + config_.device_type = std::string("cpu"); + } + + if (config_.device_type == std::string("cuda")){ + LightGBM::LGBM_config_::current_device=lgbm_device_cuda; + + config_.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ + if (config_.bagging_fraction == 1.0){config_.bagging_fraction = 0.8;} + if (config_.bagging_freq == 0) {config_.bagging_freq = 1;} + } +#endif + // create boosting if (config_.input_model.size() > 0) { Log::Warning("Continued train from model is not supported for c_api,\n" @@ -303,6 +323,17 @@ class Booster { omp_set_num_threads(config_.num_threads); } +//LGBM_CUDA +#ifdef USE_CUDA + if (config_.device_type == std::string("cuda")){ + LightGBM::LGBM_config_::current_device=lgbm_device_cuda; + + config_.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ + if (config_.bagging_fraction == 1.0){config_.bagging_fraction = 0.8;} + if (config_.bagging_freq == 0) {config_.bagging_freq = 1;} + } +#endif + if (param.count("objective")) { // create objective function objective_fun_.reset(ObjectiveFunction::CreateObjectiveFunction(config_.objective, @@ -627,6 +658,18 @@ int LGBM_DatasetCreateFromFile(const char* filename, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } + +//LGBM_CUDA +#ifdef USE_CUDA + if (config.device_type == std::string("cuda")){ + LightGBM::LGBM_config_::current_device=lgbm_device_cuda; + + config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ + if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} + if (config.bagging_freq == 0) {config.bagging_freq = 1;} + } +#endif + DatasetLoader loader(config, nullptr, 1, filename); if (reference == nullptr) { if (Network::num_machines() == 1) { @@ -657,6 +700,18 @@ int LGBM_DatasetCreateFromSampledColumn(double** sample_data, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } + +//LGBM_CUDA +#ifdef USE_CUDA + if (config.device_type == std::string("cuda")){ + LightGBM::LGBM_config_::current_device=lgbm_device_cuda; + + config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ + if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} + if (config.bagging_freq == 0) {config.bagging_freq = 1;} + } +#endif + DatasetLoader loader(config, nullptr, 1, nullptr); *out = loader.CostructFromSampleData(sample_data, sample_indices, ncol, num_per_col, num_sample_row, @@ -768,6 +823,18 @@ int LGBM_DatasetCreateFromMats(int32_t nmat, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } + +//LGBM_CUDA +#ifdef USE_CUDA + if (config.device_type == std::string("cuda")){ + LightGBM::LGBM_config_::current_device=lgbm_device_cuda; + + config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ + if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} + if (config.bagging_freq == 0) {config.bagging_freq = 1;} + } +#endif + 
std::unique_ptr ret; int32_t total_nrow = 0; for (int j = 0; j < nmat; ++j) { @@ -859,6 +926,18 @@ int LGBM_DatasetCreateFromCSR(const void* indptr, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } + +//LGBM_CUDA +#ifdef USE_CUDA + if (config.device_type == std::string("cuda")){ + LightGBM::LGBM_config_::current_device=lgbm_device_cuda; + + config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ + if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} + if (config.bagging_freq == 0) {config.bagging_freq = 1;} + } +#endif + std::unique_ptr ret; auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, nelem); int32_t nrow = static_cast(nindptr - 1); @@ -926,6 +1005,18 @@ int LGBM_DatasetCreateFromCSRFunc(void* get_row_funptr, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } + +//LGBM_CUDA +#ifdef USE_CUDA + if (config.device_type == std::string("cuda")){ + LightGBM::LGBM_config_::current_device=lgbm_device_cuda; + + config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ + if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} + if (config.bagging_freq == 0) {config.bagging_freq = 1;} + } +#endif + std::unique_ptr ret; int32_t nrow = num_rows; if (reference == nullptr) { @@ -997,6 +1088,18 @@ int LGBM_DatasetCreateFromCSC(const void* col_ptr, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } + +//LGBM_CUDA +#ifdef USE_CUDA + if (config.device_type == std::string("cuda")){ + LightGBM::LGBM_config_::current_device=lgbm_device_cuda; + + config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ + if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} + if (config.bagging_freq == 0) {config.bagging_freq = 1;} + } +#endif + std::unique_ptr ret; int32_t nrow = static_cast(num_row); if (reference == nullptr) { @@ -1080,6 +1183,18 @@ int LGBM_DatasetGetSubset( if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } + +//LGBM_CUDA +#ifdef USE_CUDA + if (config.device_type == std::string("cuda")){ + LightGBM::LGBM_config_::current_device=lgbm_device_cuda; + + config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ + if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} + if (config.bagging_freq == 0) {config.bagging_freq = 1;} + } +#endif + auto full_dataset = reinterpret_cast(handle); CHECK_GT(num_used_row_indices, 0); const int32_t lower = 0; @@ -1475,6 +1590,18 @@ int LGBM_BoosterPredictForFile(BoosterHandle handle, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } + +//LGBM_CUDA +#ifdef USE_CUDA + if (config.device_type == std::string("cuda")){ + LightGBM::LGBM_config_::current_device=lgbm_device_cuda; + + config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ + if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} + if (config.bagging_freq == 0) {config.bagging_freq = 1;} + } +#endif + Booster* ref_booster = reinterpret_cast(handle); ref_booster->Predict(num_iteration, predict_type, data_filename, data_has_header, config, result_filename); @@ -1519,6 +1646,18 @@ int LGBM_BoosterPredictForCSR(BoosterHandle handle, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } + +//LGBM_CUDA +#ifdef USE_CUDA + if (config.device_type == std::string("cuda")){ + 
LightGBM::LGBM_config_::current_device=lgbm_device_cuda; + + config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ + if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} + if (config.bagging_freq == 0) {config.bagging_freq = 1;} + } +#endif + Booster* ref_booster = reinterpret_cast(handle); auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, nelem); int nrow = static_cast(nindptr - 1); @@ -1553,6 +1692,18 @@ int LGBM_BoosterPredictForCSRSingleRow(BoosterHandle handle, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } + +//LGBM_CUDA +#ifdef USE_CUDA + if (config.device_type == std::string("cuda")){ + LightGBM::LGBM_config_::current_device=lgbm_device_cuda; + + config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ + if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} + if (config.bagging_freq == 0) {config.bagging_freq = 1;} + } +#endif + Booster* ref_booster = reinterpret_cast(handle); auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, nelem); ref_booster->PredictSingleRow(num_iteration, predict_type, static_cast(num_col), get_row_fun, config, out_result, out_len); @@ -1582,6 +1733,18 @@ int LGBM_BoosterPredictForCSC(BoosterHandle handle, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } + +//LGBM_CUDA +#ifdef USE_CUDA + if (config.device_type == std::string("cuda")){ + LightGBM::LGBM_config_::current_device=lgbm_device_cuda; + + config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ + if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} + if (config.bagging_freq == 0) {config.bagging_freq = 1;} + } +#endif + int num_threads = OMP_NUM_THREADS(); int ncol = static_cast(ncol_ptr - 1); std::vector> iterators(num_threads, std::vector()); @@ -1626,6 +1789,18 @@ int LGBM_BoosterPredictForMat(BoosterHandle handle, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } + +//LGBM_CUDA +#ifdef USE_CUDA + if (config.device_type == std::string("cuda")){ + LightGBM::LGBM_config_::current_device=lgbm_device_cuda; + + config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ + if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} + if (config.bagging_freq == 0) {config.bagging_freq = 1;} + } +#endif + Booster* ref_booster = reinterpret_cast(handle); auto get_row_fun = RowPairFunctionFromDenseMatric(data, nrow, ncol, data_type, is_row_major); ref_booster->Predict(num_iteration, predict_type, nrow, ncol, get_row_fun, @@ -1650,6 +1825,18 @@ int LGBM_BoosterPredictForMatSingleRow(BoosterHandle handle, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } + +//LGBM_CUDA +#ifdef USE_CUDA + if (config.device_type == std::string("cuda")){ + LightGBM::LGBM_config_::current_device=lgbm_device_cuda; + + config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ + if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} + if (config.bagging_freq == 0) {config.bagging_freq = 1;} + } +#endif + Booster* ref_booster = reinterpret_cast(handle); auto get_row_fun = RowPairFunctionFromDenseMatric(data, 1, ncol, data_type, is_row_major); ref_booster->PredictSingleRow(num_iteration, predict_type, ncol, get_row_fun, config, out_result, out_len); @@ -1674,6 +1861,18 @@ int 
LGBM_BoosterPredictForMats(BoosterHandle handle, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } + +//LGBM_CUDA +#ifdef USE_CUDA + if (config.device_type == std::string("cuda")){ + LightGBM::LGBM_config_::current_device=lgbm_device_cuda; + + config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ + if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} + if (config.bagging_freq == 0) {config.bagging_freq = 1;} + } +#endif + Booster* ref_booster = reinterpret_cast(handle); auto get_row_fun = RowPairFunctionFromDenseRows(data, ncol, data_type); ref_booster->Predict(num_iteration, predict_type, nrow, ncol, get_row_fun, config, out_result, out_len); diff --git a/src/io/config.cpp b/src/io/config.cpp index d569a7401e1..5d2faba6133 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -126,6 +126,8 @@ void GetDeviceType(const std::unordered_map& params, s *device_type = "cpu"; } else if (value == std::string("gpu")) { *device_type = "gpu"; + } else if (value == std::string("cuda")) { // LGBM_CUDA + *device_type = "cuda"; } else { Log::Fatal("Unknown device type %s", value.c_str()); } @@ -320,7 +322,7 @@ void Config::CheckParamConflict() { } } // force col-wise for gpu - if (device_type == std::string("gpu")) { + if (device_type == std::string("gpu")) { // GCF maybe need to add some cuda here? force_col_wise = true; force_row_wise = false; } diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index 807cad78502..46d95b0df8f 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -294,6 +294,7 @@ const std::unordered_set& Config::parameter_set() { "gpu_platform_id", "gpu_device_id", "gpu_use_dp", + "num_gpu", /* LGBM_CUDA */ }); return params; } @@ -482,6 +483,11 @@ void Config::GetMembersFromString(const std::unordered_map 0); + } std::string Config::SaveMembersToString() const { diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 6e17eeb8917..a020f425f3a 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -10,6 +10,10 @@ #include #include +#ifdef USE_CUDA +#include +#endif + #include #include #include @@ -233,12 +237,17 @@ std::vector> FindGroups( return features_in_group; } -std::vector> FastFeatureBundling( - const std::vector>& bin_mappers, - int** sample_indices, double** sample_values, const int* num_per_col, - int num_sample_col, data_size_t total_sample_cnt, - const std::vector& used_features, data_size_t num_data, - bool is_use_gpu, bool is_sparse, std::vector* multi_val_group) { +std::vector> FastFeatureBundling(const std::vector>& bin_mappers, + int** sample_indices, + double** sample_values, + const int* num_per_col, + int num_sample_col, + data_size_t total_sample_cnt, + const std::vector& used_features, + data_size_t num_data, + bool is_sparse, + std::vector* multi_val_group, + bool is_use_gpu) { Common::FunctionTimer fun_timer("Dataset::FastFeatureBundling", global_timer); std::vector feature_non_zero_cnt; feature_non_zero_cnt.reserve(used_features.size()); @@ -334,13 +343,28 @@ void Dataset::Construct(std::vector>* bin_mappers, "constant."); } auto features_in_group = NoGroup(used_features); + +//LGBM_CUDA +#ifdef USE_CUDA + if (io_config.device_type == std::string("cuda")){ + LightGBM::LGBM_config_::current_device=lgbm_device_cuda; + } +#endif + std::vector group_is_multi_val(used_features.size(), 0); if (io_config.enable_bundle && !used_features.empty()) { - features_in_group = FastFeatureBundling( - *bin_mappers, sample_non_zero_indices, sample_values, 
num_per_col, - num_sample_col, static_cast(total_sample_cnt), - used_features, num_data_, io_config.device_type == std::string("gpu"), - io_config.is_enable_sparse, &group_is_multi_val); + bool lgbm_is_gpu_used = io_config.device_type == std::string("gpu") || io_config.device_type == std::string("cuda"); // LGBM_CUDA + features_in_group = FastFeatureBundling(*bin_mappers, + sample_non_zero_indices, + sample_values, + num_per_col, + num_sample_col, + static_cast(total_sample_cnt), + used_features, + num_data_, + io_config.is_enable_sparse, + &group_is_multi_val, + lgbm_is_gpu_used); } num_features_ = 0; @@ -758,7 +782,8 @@ void Dataset::CreateValid(const Dataset* dataset) { forced_bin_bounds_ = dataset->forced_bin_bounds_; } -void Dataset::ReSize(data_size_t num_data) { +// LGBM_CUDA Resize() returns boolean +bool Dataset::ReSize(data_size_t num_data) { if (num_data_ != num_data) { num_data_ = num_data; OMP_INIT_EX(); diff --git a/src/io/dense_bin.hpp b/src/io/dense_bin.hpp index d61f7e6489e..99feadf9f7f 100644 --- a/src/io/dense_bin.hpp +++ b/src/io/dense_bin.hpp @@ -12,6 +12,12 @@ #include #include +#ifdef USE_CUDA +#include // LGBM_CUDA +#endif + +#include // LGBM_CUDA + namespace LightGBM { template @@ -362,6 +368,9 @@ class DenseBin : public Bin { data_size_t num_data() const override { return num_data_; } + // LGBM_CUDA + void* get_data() override { return data_.data(); } + void FinishLoad() override { if (IS_4BIT) { if (buf_.empty()) { @@ -456,7 +465,11 @@ class DenseBin : public Bin { private: data_size_t num_data_; +#ifdef USE_CUDA + std::vector> data_; // LGBM_CUDA +#else std::vector> data_; +#endif std::vector buf_; DenseBin(const DenseBin& other) diff --git a/src/io/sparse_bin.hpp b/src/io/sparse_bin.hpp index aa3ed929713..c56cd6da99d 100644 --- a/src/io/sparse_bin.hpp +++ b/src/io/sparse_bin.hpp @@ -408,6 +408,9 @@ class SparseBin : public Bin { data_size_t num_data() const override { return num_data_; } + // LGBM_CUDA + void* get_data() override { return nullptr; } + void FinishLoad() override { // get total non zero size size_t pair_cnt = 0; diff --git a/src/main.cpp b/src/main.cpp index 8034da82681..ef277ac0c1f 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -11,6 +11,10 @@ int main(int argc, char** argv) { bool success = false; try { + // LGBM_CUDA + std::chrono::duration main_time; + auto start_main_time = std::chrono::steady_clock::now(); + LightGBM::Application app(argc, argv); app.Run(); @@ -18,6 +22,9 @@ int main(int argc, char** argv) { LightGBM::Linkers::MpiFinalizeIfIsParallel(); #endif + // LGBM_CUDA + main_time = std::chrono::steady_clock::now() - start_main_time; + LightGBM::Log::Info("main::main time: %f sec", main_time * 1e-3); success = true; } catch (const std::exception& ex) { diff --git a/src/treelearner/data_parallel_tree_learner.cpp b/src/treelearner/data_parallel_tree_learner.cpp index 0d6f9df251b..0624bb96249 100644 --- a/src/treelearner/data_parallel_tree_learner.cpp +++ b/src/treelearner/data_parallel_tree_learner.cpp @@ -20,9 +20,9 @@ DataParallelTreeLearner::~DataParallelTreeLearner() { } template -void DataParallelTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) { +void DataParallelTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) { //LGBM_CUDA // initialize SerialTreeLearner - TREELEARNER_T::Init(train_data, is_constant_hessian); + TREELEARNER_T::Init(train_data, is_constant_hessian, is_use_subset); //LGBM_CUDA // Get local rank and global machine size rank_ = Network::rank(); 
num_machines_ = Network::num_machines(); @@ -256,6 +256,7 @@ void DataParallelTreeLearner::Split(Tree* tree, int best_Leaf, in } // instantiate template classes, otherwise linker cannot find the code +template class DataParallelTreeLearner; // LGBM_CUDA template class DataParallelTreeLearner; template class DataParallelTreeLearner; diff --git a/src/treelearner/data_partition.hpp b/src/treelearner/data_partition.hpp index 7a6ac031e62..01c5d2606e7 100644 --- a/src/treelearner/data_partition.hpp +++ b/src/treelearner/data_partition.hpp @@ -164,6 +164,8 @@ class DataPartition { /*! \brief used data count, used for bagging */ data_size_t used_data_count_; ParallelPartitionRunner runner_; + // LGBM_CUDA + // bool is_cuda_; }; } // namespace LightGBM diff --git a/src/treelearner/feature_parallel_tree_learner.cpp b/src/treelearner/feature_parallel_tree_learner.cpp index c5202f3d706..5cf660ab9c9 100644 --- a/src/treelearner/feature_parallel_tree_learner.cpp +++ b/src/treelearner/feature_parallel_tree_learner.cpp @@ -19,9 +19,9 @@ template FeatureParallelTreeLearner::~FeatureParallelTreeLearner() { } -template -void FeatureParallelTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) { - TREELEARNER_T::Init(train_data, is_constant_hessian); +template //LGBM_CUDA +void FeatureParallelTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) { + TREELEARNER_T::Init(train_data, is_constant_hessian, is_use_subset); //LGBM_CUDA rank_ = Network::rank(); num_machines_ = Network::num_machines(); @@ -77,6 +77,7 @@ void FeatureParallelTreeLearner::FindBestSplitsFromHistograms( } // instantiate template classes, otherwise linker cannot find the code +template class FeatureParallelTreeLearner; // LGBM_CUDA template class FeatureParallelTreeLearner; template class FeatureParallelTreeLearner; } // namespace LightGBM diff --git a/src/treelearner/parallel_tree_learner.h b/src/treelearner/parallel_tree_learner.h index 137697408e8..35ac432eba3 100644 --- a/src/treelearner/parallel_tree_learner.h +++ b/src/treelearner/parallel_tree_learner.h @@ -27,7 +27,7 @@ class FeatureParallelTreeLearner: public TREELEARNER_T { public: explicit FeatureParallelTreeLearner(const Config* config); ~FeatureParallelTreeLearner(); - void Init(const Dataset* train_data, bool is_constant_hessian) override; + void Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) override; // LGBM_CUDA protected: void BeforeTrain() override; @@ -54,7 +54,7 @@ class DataParallelTreeLearner: public TREELEARNER_T { public: explicit DataParallelTreeLearner(const Config* config); ~DataParallelTreeLearner(); - void Init(const Dataset* train_data, bool is_constant_hessian) override; + void Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) override; // LGBM_CUDA void ResetConfig(const Config* config) override; protected: @@ -108,7 +108,7 @@ class VotingParallelTreeLearner: public TREELEARNER_T { public: explicit VotingParallelTreeLearner(const Config* config); ~VotingParallelTreeLearner() { } - void Init(const Dataset* train_data, bool is_constant_hessian) override; + void Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) override; //LGBM_CUDA void ResetConfig(const Config* config) override; protected: diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index db5cd0b4395..6b02411127a 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -25,7 
+25,8 @@ SerialTreeLearner::SerialTreeLearner(const Config* config) SerialTreeLearner::~SerialTreeLearner() { } -void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) { +//LGBM_CUDA +void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) { train_data_ = train_data; num_data_ = train_data_->num_data(); num_features_ = train_data_->num_features(); @@ -324,7 +325,18 @@ void SerialTreeLearner::FindBestSplits(const Tree* tree) { is_feature_used[feature_index] = 1; } bool use_subtract = parent_leaf_histogram_array_ != nullptr; + +#ifdef USE_CUDA + if (LGBM_config_::current_learner == use_cpu_learner){ + Log::Info("LightGBM-CUDA using CPU ConstructHistograms()"); + SerialTreeLearner::ConstructHistograms(is_feature_used, use_subtract); + } + else{ + ConstructHistograms(is_feature_used, use_subtract); + } +#else ConstructHistograms(is_feature_used, use_subtract); +#endif FindBestSplitsFromHistograms(is_feature_used, use_subtract, tree); } diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index e6ac8e3ad09..fab28542e03 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -26,6 +26,11 @@ #include "monotone_constraints.hpp" #include "split_info.hpp" +// LGBM_CUDA +#ifdef USE_CUDA +#include +#endif + #ifdef USE_GPU // Use 4KBytes aligned allocator for ordered gradients and ordered hessians when GPU is enabled. // This is necessary to pin the two arrays in memory and make transferring faster. @@ -48,7 +53,8 @@ class SerialTreeLearner: public TreeLearner { ~SerialTreeLearner(); - void Init(const Dataset* train_data, bool is_constant_hessian) override; + // LGBM_CUDA is_use_subset is used by CUDA only + void Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) override; void ResetTrainingData(const Dataset* train_data, bool is_constant_hessian) override { @@ -201,6 +207,11 @@ class SerialTreeLearner: public TreeLearner { std::vector> ordered_gradients_; /*! \brief hessians of current iteration, ordered for cache optimized, aligned to 4K page */ std::vector> ordered_hessians_; +#elif USE_CUDA //LGBM_CUDA + /*! \brief gradients of current iteration, ordered for cache optimized */ + std::vector> ordered_gradients_; + /*! \brief hessians of current iteration, ordered for cache optimized */ + std::vector> ordered_hessians_; #else /*! 
\brief gradients of current iteration, ordered for cache optimized */ std::vector> ordered_gradients_; diff --git a/src/treelearner/tree_learner.cpp b/src/treelearner/tree_learner.cpp index 7172f6b655c..df7231e91df 100644 --- a/src/treelearner/tree_learner.cpp +++ b/src/treelearner/tree_learner.cpp @@ -5,6 +5,7 @@ #include #include "gpu_tree_learner.h" +#include "cuda_tree_learner.h" // LGBM_CUDA #include "parallel_tree_learner.h" #include "serial_tree_learner.h" @@ -31,6 +32,16 @@ TreeLearner* TreeLearner::CreateTreeLearner(const std::string& learner_type, con } else if (learner_type == std::string("voting")) { return new VotingParallelTreeLearner(config); } + } else if (device_type == std::string("cuda")) { // LGBM_CUDA + if (learner_type == std::string("serial")) { + return new CUDATreeLearner(config); + } else if (learner_type == std::string("feature")) { + return new FeatureParallelTreeLearner(config); + } else if (learner_type == std::string("data")) { + return new DataParallelTreeLearner(config); + } else if (learner_type == std::string("voting")) { + return new VotingParallelTreeLearner(config); + } } return nullptr; } diff --git a/src/treelearner/voting_parallel_tree_learner.cpp b/src/treelearner/voting_parallel_tree_learner.cpp index 1c9c36ba8bb..58f5b88d6b0 100644 --- a/src/treelearner/voting_parallel_tree_learner.cpp +++ b/src/treelearner/voting_parallel_tree_learner.cpp @@ -19,8 +19,8 @@ VotingParallelTreeLearner::VotingParallelTreeLearner(const Config } template -void VotingParallelTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) { - TREELEARNER_T::Init(train_data, is_constant_hessian); +void VotingParallelTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) { // LGBM_CUDA + TREELEARNER_T::Init(train_data, is_constant_hessian, is_use_subset); // LGBM_CUDA rank_ = Network::rank(); num_machines_ = Network::num_machines(); @@ -454,6 +454,7 @@ void VotingParallelTreeLearner::Split(Tree* tree, int best_Leaf, } // instantiate template classes, otherwise linker cannot find the code +template class VotingParallelTreeLearner; // LGBM_CUDA template class VotingParallelTreeLearner; template class VotingParallelTreeLearner; } // namespace LightGBM From 895d6e43c847a7a9e748b310fcefe2c32957ff9d Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Mon, 30 Mar 2020 16:16:02 +0000 Subject: [PATCH 002/119] Initial CUDA work --- build_LGBM.232.sh | 7 + include/LightGBM/cuda/cuda_utils.h | 38 + include/LightGBM/cuda/vector_cudahost.h | 93 ++ install_LGBM.232.sh | 7 + src/io/dense_nbits_bin.hpp | 405 +++++++++ src/treelearner/cuda_kernel_launcher.cu | 166 ++++ src/treelearner/cuda_kernel_launcher.h | 64 ++ src/treelearner/cuda_tree_learner.cpp | 1085 +++++++++++++++++++++++ src/treelearner/cuda_tree_learner.h | 315 +++++++ src/treelearner/kernels/histogram256.cu | 372 ++++++++ src/treelearner/kernels/histogram256.hu | 179 ++++ 11 files changed, 2731 insertions(+) create mode 100755 build_LGBM.232.sh create mode 100644 include/LightGBM/cuda/cuda_utils.h create mode 100644 include/LightGBM/cuda/vector_cudahost.h create mode 100755 install_LGBM.232.sh create mode 100644 src/io/dense_nbits_bin.hpp create mode 100644 src/treelearner/cuda_kernel_launcher.cu create mode 100644 src/treelearner/cuda_kernel_launcher.h create mode 100644 src/treelearner/cuda_tree_learner.cpp create mode 100644 src/treelearner/cuda_tree_learner.h create mode 100644 src/treelearner/kernels/histogram256.cu create mode 100644 src/treelearner/kernels/histogram256.hu diff 
--git a/build_LGBM.232.sh b/build_LGBM.232.sh new file mode 100755 index 00000000000..24b50c7dfda --- /dev/null +++ b/build_LGBM.232.sh @@ -0,0 +1,7 @@ +#!/usr/bin/bash +rm -rf build +mkdir build +cd build +#cmake -DUSE_CUDA=1 .. +cmake .. +make -j40 diff --git a/include/LightGBM/cuda/cuda_utils.h b/include/LightGBM/cuda/cuda_utils.h new file mode 100644 index 00000000000..6d9407613f6 --- /dev/null +++ b/include/LightGBM/cuda/cuda_utils.h @@ -0,0 +1,38 @@ +/* + * ibmGBT: IBM CUDA Accelerated LightGBM + * + * IBM Confidential + * (C) Copyright IBM Corp. 2019. All Rights Reserved. + * + * The source code for this program is not published or otherwise + * divested of its trade secrets, irrespective of what has been + * deposited with the U.S. Copyright Office. + * + * US Government Users Restricted Rights - Use, duplication or + * disclosure restricted by GSA ADP Schedule Contract with IBM Corp. + */ + +#ifndef LGBM_CUDA_UTILS_H +#define LGBM_CUDA_UTILS_H + +//LGBM_CUDA + +#ifdef USE_CUDA + +#include +#include +#include + +#define CUDASUCCESS_OR_FATAL(ans) { gpuAssert((ans), __FILE__, __LINE__); } +inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) +{ + if (code != cudaSuccess) + { + LightGBM::Log::Fatal("CUDA_RUNTIME: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } +} + +#endif /* USE_CUDA */ + +#endif diff --git a/include/LightGBM/cuda/vector_cudahost.h b/include/LightGBM/cuda/vector_cudahost.h new file mode 100644 index 00000000000..b1a235e8a22 --- /dev/null +++ b/include/LightGBM/cuda/vector_cudahost.h @@ -0,0 +1,93 @@ +/* + * ibmGBT: IBM CUDA Accelerated LightGBM + * + * IBM Confidential + * (C) Copyright IBM Corp. 2019. All Rights Reserved. + * + * The source code for this program is not published or otherwise + * divested of its trade secrets, irrespective of what has been + * deposited with the U.S. Copyright Office. + * + * US Government Users Restricted Rights - Use, duplication or + * disclosure restricted by GSA ADP Schedule Contract with IBM Corp. 
+ */ + +#ifndef LGBM_CUDA_VECTOR_CH_H +#define LGBM_CUDA_VECTOR_CH_H + +#include +#include +#include + +//LGBM_CUDA + +namespace LightGBM { + +#define lgbm_device_cpu 0 +#define lgbm_device_gpu 1 +#define lgbm_device_cuda 2 + +#define use_cpu_learner 0 +#define use_gpu_learner 1 +#define use_cuda_learner 2 + +class LGBM_config_ { + public: + static int current_device; // Default: lgbm_device_cpu + static int current_learner; // Default: use_cpu_learner +}; + +} // namespace LightGBM + + +template +struct CHAllocator { + typedef T value_type; + CHAllocator() {} + template CHAllocator(const CHAllocator& other); + T* allocate(std::size_t n) + { + T* ptr; + if (n == 0) return NULL; + #ifdef USE_CUDA + if (LightGBM::LGBM_config_::current_device == lgbm_device_cuda){ + cudaError_t ret= cudaHostAlloc(&ptr, n*sizeof(T), cudaHostAllocPortable); + if (ret != cudaSuccess){ + ptr = (T*) malloc(n*sizeof(T)); + } + } + else{ + ptr = (T*) malloc(n*sizeof(T)); + } + #else + ptr = (T*) malloc(n*sizeof(T)); + #endif + return ptr; + } + + void deallocate(T* p, std::size_t n) + { + if (p==NULL) return; + #ifdef USE_CUDA + if (LightGBM::LGBM_config_::current_device == lgbm_device_cuda){ + cudaPointerAttributes attributes; + cudaPointerGetAttributes (&attributes, p); + if ((attributes.type == cudaMemoryTypeHost) && (attributes.devicePointer != NULL)){ + cudaFreeHost(p); + } + } + else{ + free(p); + } + #else + free(p); + #endif + } + +}; +template +bool operator==(const CHAllocator&, const CHAllocator&); +template +bool operator!=(const CHAllocator&, const CHAllocator&); + +#endif diff --git a/install_LGBM.232.sh b/install_LGBM.232.sh new file mode 100755 index 00000000000..7af586f4722 --- /dev/null +++ b/install_LGBM.232.sh @@ -0,0 +1,7 @@ +#!/usr/bin/bash +cd python-package +python setup.py bdist_wheel +pip uninstall -y lightgbm +cd dist +pip install lightgbm-*.whl +cd ../.. diff --git a/src/io/dense_nbits_bin.hpp b/src/io/dense_nbits_bin.hpp new file mode 100644 index 00000000000..adf99115626 --- /dev/null +++ b/src/io/dense_nbits_bin.hpp @@ -0,0 +1,405 @@ +/*! + * Copyright (c) 2017 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. 
+ */ +#ifndef LIGHTGBM_IO_DENSE_NBITS_BIN_HPP_ +#define LIGHTGBM_IO_DENSE_NBITS_BIN_HPP_ + +#include + +#include +#include +#include + +namespace LightGBM { + +class Dense4bitsBin; + +class Dense4bitsBinIterator : public BinIterator { + public: + explicit Dense4bitsBinIterator(const Dense4bitsBin* bin_data, uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) + : bin_data_(bin_data), min_bin_(static_cast(min_bin)), + max_bin_(static_cast(max_bin)), + default_bin_(static_cast(default_bin)) { + if (default_bin_ == 0) { + bias_ = 1; + } else { + bias_ = 0; + } + } + inline uint32_t RawGet(data_size_t idx) override; + inline uint32_t Get(data_size_t idx) override; + inline void Reset(data_size_t) override {} + + private: + const Dense4bitsBin* bin_data_; + uint8_t min_bin_; + uint8_t max_bin_; + uint8_t default_bin_; + uint8_t bias_; +}; + +class Dense4bitsBin : public Bin { + public: + friend Dense4bitsBinIterator; + Dense4bitsBin(data_size_t num_data) + : num_data_(num_data) { + int len = (num_data_ + 1) / 2; + data_ = std::vector(len, static_cast(0)); + buf_ = std::vector(len, static_cast(0)); + } + + ~Dense4bitsBin() { + } + + void Push(int, data_size_t idx, uint32_t value) override { + const int i1 = idx >> 1; + const int i2 = (idx & 1) << 2; + const uint8_t val = static_cast(value) << i2; + if (i2 == 0) { + data_[i1] = val; + } else { + buf_[i1] = val; + } + } + + void ReSize(data_size_t num_data) override { + if (num_data_ != num_data) { + num_data_ = num_data; + const int len = (num_data_ + 1) / 2; + data_.resize(len); + } + } + + inline BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const override; + + void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data, + const score_t* ordered_gradients, const score_t* ordered_hessians, + HistogramBinEntry* out) const override { + const data_size_t rest = num_data & 0x3; + data_size_t i = 0; + for (; i < num_data - rest; i += 4) { + const data_size_t idx0 = data_indices[i]; + const auto bin0 = (data_[idx0 >> 1] >> ((idx0 & 1) << 2)) & 0xf; + + const data_size_t idx1 = data_indices[i + 1]; + const auto bin1 = (data_[idx1 >> 1] >> ((idx1 & 1) << 2)) & 0xf; + + const data_size_t idx2 = data_indices[i + 2]; + const auto bin2 = (data_[idx2 >> 1] >> ((idx2 & 1) << 2)) & 0xf; + + const data_size_t idx3 = data_indices[i + 3]; + const auto bin3 = (data_[idx3 >> 1] >> ((idx3 & 1) << 2)) & 0xf; + + out[bin0].sum_gradients += ordered_gradients[i]; + out[bin1].sum_gradients += ordered_gradients[i + 1]; + out[bin2].sum_gradients += ordered_gradients[i + 2]; + out[bin3].sum_gradients += ordered_gradients[i + 3]; + + out[bin0].sum_hessians += ordered_hessians[i]; + out[bin1].sum_hessians += ordered_hessians[i + 1]; + out[bin2].sum_hessians += ordered_hessians[i + 2]; + out[bin3].sum_hessians += ordered_hessians[i + 3]; + + ++out[bin0].cnt; + ++out[bin1].cnt; + ++out[bin2].cnt; + ++out[bin3].cnt; + } + + for (; i < num_data; ++i) { + const data_size_t idx = data_indices[i]; + const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; + out[bin].sum_gradients += ordered_gradients[i]; + out[bin].sum_hessians += ordered_hessians[i]; + ++out[bin].cnt; + } + } + + void ConstructHistogram(data_size_t num_data, + const score_t* ordered_gradients, const score_t* ordered_hessians, + HistogramBinEntry* out) const override { + const data_size_t rest = num_data & 0x3; + data_size_t i = 0; + + for (; i < num_data - rest; i += 4) { + const auto bin0 = (data_[i >> 1]) & 0xf; + const auto bin1 = (data_[i >> 1] >> 
4) & 0xf; + const auto bin2 = (data_[(i >> 1) + 1]) & 0xf; + const auto bin3 = (data_[(i >> 1) + 1] >> 4) & 0xf; + + out[bin0].sum_gradients += ordered_gradients[i]; + out[bin1].sum_gradients += ordered_gradients[i + 1]; + out[bin2].sum_gradients += ordered_gradients[i + 2]; + out[bin3].sum_gradients += ordered_gradients[i + 3]; + + out[bin0].sum_hessians += ordered_hessians[i]; + out[bin1].sum_hessians += ordered_hessians[i + 1]; + out[bin2].sum_hessians += ordered_hessians[i + 2]; + out[bin3].sum_hessians += ordered_hessians[i + 3]; + + ++out[bin0].cnt; + ++out[bin1].cnt; + ++out[bin2].cnt; + ++out[bin3].cnt; + } + for (; i < num_data; ++i) { + const auto bin = (data_[i >> 1] >> ((i & 1) << 2)) & 0xf; + out[bin].sum_gradients += ordered_gradients[i]; + out[bin].sum_hessians += ordered_hessians[i]; + ++out[bin].cnt; + } + } + + void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data, + const score_t* ordered_gradients, + HistogramBinEntry* out) const override { + const data_size_t rest = num_data & 0x3; + data_size_t i = 0; + for (; i < num_data - rest; i += 4) { + const data_size_t idx0 = data_indices[i]; + const auto bin0 = (data_[idx0 >> 1] >> ((idx0 & 1) << 2)) & 0xf; + + const data_size_t idx1 = data_indices[i + 1]; + const auto bin1 = (data_[idx1 >> 1] >> ((idx1 & 1) << 2)) & 0xf; + + const data_size_t idx2 = data_indices[i + 2]; + const auto bin2 = (data_[idx2 >> 1] >> ((idx2 & 1) << 2)) & 0xf; + + const data_size_t idx3 = data_indices[i + 3]; + const auto bin3 = (data_[idx3 >> 1] >> ((idx3 & 1) << 2)) & 0xf; + + out[bin0].sum_gradients += ordered_gradients[i]; + out[bin1].sum_gradients += ordered_gradients[i + 1]; + out[bin2].sum_gradients += ordered_gradients[i + 2]; + out[bin3].sum_gradients += ordered_gradients[i + 3]; + + ++out[bin0].cnt; + ++out[bin1].cnt; + ++out[bin2].cnt; + ++out[bin3].cnt; + } + + for (; i < num_data; ++i) { + const data_size_t idx = data_indices[i]; + const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; + out[bin].sum_gradients += ordered_gradients[i]; + ++out[bin].cnt; + } + } + + void ConstructHistogram(data_size_t num_data, + const score_t* ordered_gradients, + HistogramBinEntry* out) const override { + const data_size_t rest = num_data & 0x3; + data_size_t i = 0; + for (; i < num_data - rest; i += 4) { + const auto bin0 = (data_[i >> 1]) & 0xf; + const auto bin1 = (data_[i >> 1] >> 4) & 0xf; + const auto bin2 = (data_[(i >> 1) + 1]) & 0xf; + const auto bin3 = (data_[(i >> 1) + 1] >> 4) & 0xf; + + out[bin0].sum_gradients += ordered_gradients[i]; + out[bin1].sum_gradients += ordered_gradients[i + 1]; + out[bin2].sum_gradients += ordered_gradients[i + 2]; + out[bin3].sum_gradients += ordered_gradients[i + 3]; + + ++out[bin0].cnt; + ++out[bin1].cnt; + ++out[bin2].cnt; + ++out[bin3].cnt; + } + for (; i < num_data; ++i) { + const auto bin = (data_[i >> 1] >> ((i & 1) << 2)) & 0xf; + out[bin].sum_gradients += ordered_gradients[i]; + ++out[bin].cnt; + } + } + + virtual data_size_t Split( + uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, MissingType missing_type, bool default_left, + uint32_t threshold, data_size_t* data_indices, data_size_t num_data, + data_size_t* lte_indices, data_size_t* gt_indices) const override { + if (num_data <= 0) { return 0; } + uint8_t th = static_cast(threshold + min_bin); + const uint8_t minb = static_cast(min_bin); + const uint8_t maxb = static_cast(max_bin); + uint8_t t_default_bin = static_cast(min_bin + default_bin); + if (default_bin == 0) { + th -= 1; + t_default_bin -= 1; + } + 
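+    // Packing used below: two 4-bit bins share one byte, with even indices in the low
+    // nibble and odd indices in the high nibble, so bin(idx) = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf.
+    // When default_bin == 0 the stored bins carry a bias of 1 (see Dense4bitsBinIterator),
+    // which is why th and t_default_bin were shifted down by one above.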
data_size_t lte_count = 0;
+    data_size_t gt_count = 0;
+    data_size_t* default_indices = gt_indices;
+    data_size_t* default_count = &gt_count;
+    if (missing_type == MissingType::NaN) {
+      if (default_bin <= threshold) {
+        default_indices = lte_indices;
+        default_count = &lte_count;
+      }
+      data_size_t* missing_default_indices = gt_indices;
+      data_size_t* missing_default_count = &gt_count;
+      if (default_left) {
+        missing_default_indices = lte_indices;
+        missing_default_count = &lte_count;
+      }
+      for (data_size_t i = 0; i < num_data; ++i) {
+        const data_size_t idx = data_indices[i];
+        const uint8_t bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
+        if (bin < minb || bin > maxb || t_default_bin == bin) {
+          default_indices[(*default_count)++] = idx;
+        } else if (bin == maxb) {
+          missing_default_indices[(*missing_default_count)++] = idx;
+        } else if (bin > th) {
+          gt_indices[gt_count++] = idx;
+        } else {
+          lte_indices[lte_count++] = idx;
+        }
+      }
+    } else {
+      if ((default_left && missing_type == MissingType::Zero) || (default_bin <= threshold && missing_type != MissingType::Zero)) {
+        default_indices = lte_indices;
+        default_count = &lte_count;
+      }
+      for (data_size_t i = 0; i < num_data; ++i) {
+        const data_size_t idx = data_indices[i];
+        const uint8_t bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
+        if (bin < minb || bin > maxb || t_default_bin == bin) {
+          default_indices[(*default_count)++] = idx;
+        } else if (bin > th) {
+          gt_indices[gt_count++] = idx;
+        } else {
+          lte_indices[lte_count++] = idx;
+        }
+      }
+    }
+    return lte_count;
+  }
+
+  virtual data_size_t SplitCategorical(
+    uint32_t min_bin, uint32_t max_bin, uint32_t default_bin,
+    const uint32_t* threshold, int num_threshold, data_size_t* data_indices, data_size_t num_data,
+    data_size_t* lte_indices, data_size_t* gt_indices) const override {
+    if (num_data <= 0) { return 0; }
+    data_size_t lte_count = 0;
+    data_size_t gt_count = 0;
+    data_size_t* default_indices = gt_indices;
+    data_size_t* default_count = &gt_count;
+    if (Common::FindInBitset(threshold, num_threshold, default_bin)) {
+      default_indices = lte_indices;
+      default_count = &lte_count;
+    }
+    for (data_size_t i = 0; i < num_data; ++i) {
+      const data_size_t idx = data_indices[i];
+      const uint32_t bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
+      if (bin < min_bin || bin > max_bin) {
+        default_indices[(*default_count)++] = idx;
+      } else if (Common::FindInBitset(threshold, num_threshold, bin - min_bin)) {
+        lte_indices[lte_count++] = idx;
+      } else {
+        gt_indices[gt_count++] = idx;
+      }
+    }
+    return lte_count;
+  }
+
+  data_size_t num_data() const override { return num_data_; }
+
+  // LGBM_CUDA
+  void* get_data() override { return data_.data(); }
+
+  /*! 
\brief not ordered bin for dense feature */ + OrderedBin* CreateOrderedBin() const override { return nullptr; } + + void FinishLoad() override { + if (buf_.empty()) { return; } + int len = (num_data_ + 1) / 2; + for (int i = 0; i < len; ++i) { + data_[i] |= buf_[i]; + } + buf_.clear(); + } + + void LoadFromMemory(const void* memory, const std::vector& local_used_indices) override { + const uint8_t* mem_data = reinterpret_cast(memory); + if (!local_used_indices.empty()) { + const data_size_t rest = num_data_ & 1; + for (int i = 0; i < num_data_ - rest; i += 2) { + // get old bins + data_size_t idx = local_used_indices[i]; + const auto bin1 = static_cast((mem_data[idx >> 1] >> ((idx & 1) << 2)) & 0xf); + idx = local_used_indices[i + 1]; + const auto bin2 = static_cast((mem_data[idx >> 1] >> ((idx & 1) << 2)) & 0xf); + // add + const int i1 = i >> 1; + data_[i1] = (bin1 | (bin2 << 4)); + } + if (rest) { + data_size_t idx = local_used_indices[num_data_ - 1]; + data_[num_data_ >> 1] = (mem_data[idx >> 1] >> ((idx & 1) << 2)) & 0xf; + } + } else { + for (size_t i = 0; i < data_.size(); ++i) { + data_[i] = mem_data[i]; + } + } + } + + void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override { + auto other_bin = dynamic_cast(full_bin); + const data_size_t rest = num_used_indices & 1; + for (int i = 0; i < num_used_indices - rest; i += 2) { + data_size_t idx = used_indices[i]; + const auto bin1 = static_cast((other_bin->data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf); + idx = used_indices[i + 1]; + const auto bin2 = static_cast((other_bin->data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf); + const int i1 = i >> 1; + data_[i1] = (bin1 | (bin2 << 4)); + } + if (rest) { + data_size_t idx = used_indices[num_used_indices - 1]; + data_[num_used_indices >> 1] = (other_bin->data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; + } + } + + void SaveBinaryToFile(const VirtualFileWriter* writer) const override { + writer->Write(data_.data(), sizeof(uint8_t) * data_.size()); + } + + size_t SizesInByte() const override { + return sizeof(uint8_t) * data_.size(); + } + + Dense4bitsBin* Clone() override { + return new Dense4bitsBin(*this); + } + + protected: + Dense4bitsBin(const Dense4bitsBin& other) + : num_data_(other.num_data_), data_(other.data_), buf_(other.buf_) {} + + data_size_t num_data_; + std::vector data_; + std::vector buf_; +}; + +uint32_t Dense4bitsBinIterator::Get(data_size_t idx) { + const auto bin = (bin_data_->data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; + if (bin >= min_bin_ && bin <= max_bin_) { + return bin - min_bin_ + bias_; + } else { + return default_bin_; + } +} + +uint32_t Dense4bitsBinIterator::RawGet(data_size_t idx) { + return (bin_data_->data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; +} + +inline BinIterator* Dense4bitsBin::GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const { + return new Dense4bitsBinIterator(this, min_bin, max_bin, default_bin); +} + +} // namespace LightGBM +#endif // LIGHTGBM_IO_DENSE_NBITS_BIN_HPP_ diff --git a/src/treelearner/cuda_kernel_launcher.cu b/src/treelearner/cuda_kernel_launcher.cu new file mode 100644 index 00000000000..d084abe4f23 --- /dev/null +++ b/src/treelearner/cuda_kernel_launcher.cu @@ -0,0 +1,166 @@ +#ifdef USE_CUDA + +#include "cuda_kernel_launcher.h" +#include +#include +#include + +using namespace LightGBM; + +void cuda_histogram( + data_size_t leaf_num_data, + data_size_t num_data, + bool use_all_features, + bool is_constant_hessian, + int num_workgroups, + cudaStream_t stream, + 
uint8_t* arg0, + uint8_t* arg1, + data_size_t arg2, + data_size_t* arg3, + data_size_t arg4, + score_t* arg5, + score_t* arg6, + score_t arg6_const, + char* arg7, + volatile int* arg8, + void* arg9, + size_t exp_workgroups_per_feature) { + + + if (leaf_num_data == num_data) { + + if (use_all_features){ + if (!is_constant_hessian) { + histogram256<<>>( + arg0, + arg1, + arg2, + reinterpret_cast(arg3), + arg4, + arg5, + static_cast(arg6), + arg7, + arg8, + static_cast(arg9), + exp_workgroups_per_feature + ); + } + else { + histogram256<<>>( + arg0, + arg1, + arg2, + reinterpret_cast(arg3), + arg4, + arg5, + arg6_const, + arg7, + arg8, + static_cast(arg9), + exp_workgroups_per_feature + ); + } + } + else{ + if (!is_constant_hessian) { + histogram256_fulldata<<>>( + arg0, + arg1, + arg2, + reinterpret_cast(arg3), + arg4, + arg5, + static_cast(arg6), + arg7, + arg8, + static_cast(arg9), + exp_workgroups_per_feature); + } + else { + histogram256_fulldata<<>>( + arg0, + arg1, + arg2, + reinterpret_cast(arg3), + arg4, + arg5, + arg6_const, + arg7, + arg8, + static_cast(arg9), + exp_workgroups_per_feature); + } + } + } + else { + if (use_all_features) { + // seems all features is always enabled, so this should be the same as fulldata + if (!is_constant_hessian) { + + histogram256<<>>( + arg0, + arg1, + arg2, + reinterpret_cast(arg3), + arg4, + arg5, + static_cast(arg6), + arg7, + arg8, + static_cast(arg9), + exp_workgroups_per_feature + ); + } + else { + histogram256<<>>( + arg0, + arg1, + arg2, + reinterpret_cast(arg3), + arg4, + arg5, + arg6_const, + arg7, + arg8, + static_cast(arg9), + exp_workgroups_per_feature + ); + } + } + else { + if (!is_constant_hessian) { + histogram256<<>>( + arg0, + arg1, + arg2, + reinterpret_cast(arg3), + arg4, + arg5, + static_cast(arg6), + arg7, + arg8, + static_cast(arg9), + exp_workgroups_per_feature + ); + } + else { + histogram256<<>>( + arg0, + arg1, + arg2, + reinterpret_cast(arg3), + arg4, + arg5, + arg6_const, + arg7, + arg8, + static_cast(arg9), + exp_workgroups_per_feature + ); + } + } + } +} + +#endif // USE_CUDA diff --git a/src/treelearner/cuda_kernel_launcher.h b/src/treelearner/cuda_kernel_launcher.h new file mode 100644 index 00000000000..ae7d3498e83 --- /dev/null +++ b/src/treelearner/cuda_kernel_launcher.h @@ -0,0 +1,64 @@ +#ifndef LGBM_KERNEL_LAUNCHER +#define LGBM_KERNEL_LAUNCHER + +#ifdef USE_CUDA +// what should I include?? 
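+// ThreadData (below) bundles everything one host thread needs to drive a single GPU:
+// the device id, that device's stream and events, and the argument list of cuda_histogram().
+// cuda_tree_learner.cpp fills one ThreadData per device and runs launch_cuda_histogram()
+// for it on its own pthread.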
+#include "kernels/histogram256.hu" // kernel, acc_type, data_size_t, uchar, score_t +#include + +struct ThreadData { + // device id + int device_id; + // parameters for cuda_histogram + data_size_t leaf_num_data; + data_size_t num_data; + bool use_all_features; + bool is_constant_hessian; + int num_workgroups; + cudaStream_t stream; + uint8_t* device_features; + uint8_t* device_feature_masks; + //data_size_t num_data; + data_size_t* device_data_indices; + //data_size_t leaf_num_data; + score_t* device_gradients; + score_t* device_hessians; + score_t hessians_const; + char* device_subhistograms; + volatile int* sync_counters; + void* device_histogram_outputs; + size_t exp_workgroups_per_feature; + // cuda events + cudaEvent_t* kernel_start; + cudaEvent_t* kernel_wait_obj; + std::chrono::duration* kernel_input_wait_time; + // copy histogram + size_t output_size; + char* host_histogram_output; + cudaEvent_t* histograms_wait_obj; +}; + + +void cuda_histogram( + data_size_t leaf_num_data, + data_size_t num_data, + bool use_all_features, + bool is_constant_hessian, + int num_workgroups, + cudaStream_t stream, + uint8_t* arg0, + uint8_t* arg1, + data_size_t arg2, + data_size_t* arg3, + data_size_t arg4, + score_t* arg5, + score_t* arg6, + score_t arg6_const, + char* arg7, + volatile int* arg8, + void* arg9, + size_t exp_workgroups_per_feature); + + +#endif //USE_CUDA +#endif // LGBM_KERNEL_LAUNCHER diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp new file mode 100644 index 00000000000..c45df55cacf --- /dev/null +++ b/src/treelearner/cuda_tree_learner.cpp @@ -0,0 +1,1085 @@ +#ifdef USE_CUDA +#include "cuda_tree_learner.h" +#include "../io/dense_bin.hpp" +#include "../io/dense_nbits_bin.hpp" + +#include +#include +#include + +#include +#include + +#include + +#define cudaMemcpy_DEBUG 0 // 1: DEBUG cudaMemcpy +#define ResetTrainingData_DEBUG 0 // 1: Debug ResetTrainingData + +#include + +#define GPU_DEBUG 0 + +static void *launch_cuda_histogram(void *thread_data) { + ThreadData td = *(ThreadData*)thread_data; + int device_id = td.device_id; + CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); + + // launch cuda kernel + cuda_histogram(td.leaf_num_data, td.num_data, td.use_all_features, + td.is_constant_hessian, td.num_workgroups, td.stream, + td.device_features, + td.device_feature_masks, + td.num_data, + reinterpret_cast(td.device_data_indices), + td.leaf_num_data, + td.device_gradients, + td.device_hessians, td.hessians_const, + td.device_subhistograms, td.sync_counters, + td.device_histogram_outputs, + td.exp_workgroups_per_feature); + + CUDASUCCESS_OR_FATAL(cudaGetLastError()); + + return NULL; +} + +/* +static void *wait_event(void *wait_obj) { + CUDASUCCESS_OR_FATAL(cudaEventSynchronize(*(cudaEvent_t *)wait_obj)); +}*/ + +namespace LightGBM { + +CUDATreeLearner::CUDATreeLearner(const Config* config) + :SerialTreeLearner(config) { + use_bagging_ = false; + nthreads_ = 0; + if(config->gpu_use_dp && USE_DP_FLOAT) Log::Info("LightGBM-CUDA using CUDA trainer with DP float!!"); + else Log::Info("LightGBM-CUDA using CUDA trainer with SP float!!"); +} + +CUDATreeLearner::~CUDATreeLearner() { +} + + +void CUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) { + + + // initialize SerialTreeLearner + SerialTreeLearner::Init(train_data, is_constant_hessian, is_use_subset); + + // some additional variables needed for GPU trainer + num_feature_groups_ = train_data_->num_feature_groups(); + + + // LGBM_CUDA: use subset of 
training data for bagging
+  is_use_subset_ = is_use_subset;
+
+  // Initialize GPU buffers and kernels (LGBM_CUDA: also queries device info)
+  InitGPU(config_->num_gpu);  // LGBM_CUDA
+
+}
+
+// some functions used for debugging the GPU histogram construction
+
+void PrintHistograms(HistogramBinEntry* h, size_t size) {
+  size_t total = 0;
+  for (size_t i = 0; i < size; ++i) {
+    printf("%03lu=%9.3g,%9.3g,%7d\t", i, h[i].sum_gradients, h[i].sum_hessians, h[i].cnt);
+    total += h[i].cnt;
+    if ((i & 3) == 3)
+      printf("\n");
+  }
+  printf("\nTotal examples: %lu\n", total);
+}
+
+union Float_t {
+  int64_t i;
+  double f;
+  static int64_t ulp_diff(Float_t a, Float_t b) {
+    return (a.i > b.i) ? (a.i - b.i) : (b.i - a.i);
+  }
+};
+
+void CompareHistograms(HistogramBinEntry* h1, HistogramBinEntry* h2, size_t size, int feature_id) {
+  size_t i;
+  Float_t a, b;
+  for (i = 0; i < size; ++i) {
+    a.f = h1[i].sum_gradients;
+    b.f = h2[i].sum_gradients;
+    int64_t ulps = Float_t::ulp_diff(a, b);
+    if (h1[i].cnt != h2[i].cnt) {
+      printf("idx: %lu, %d != %d, (diff: %d, err_rate: %f)\n", i, h1[i].cnt, h2[i].cnt, h1[i].cnt - h2[i].cnt, (float)(h1[i].cnt - h2[i].cnt)/h2[i].cnt);
+      goto err;
+    } else {
+      printf("idx: %lu, %d == %d\n", i, h1[i].cnt, h2[i].cnt);
+      printf("idx: %lu, pass\n", i);
+    }
+    if (ulps > 0) {
+      printf("idx: %lu, grad %g != %g\n", i, h1[i].sum_gradients, h2[i].sum_gradients);
+      //printf("idx: %ld, grad %g != %g (%d ULPs)\n", i, h1[i].sum_gradients, h2[i].sum_gradients, ulps);
+      goto err;
+    }
+    a.f = h1[i].sum_hessians;
+    b.f = h2[i].sum_hessians;
+    ulps = Float_t::ulp_diff(a, b);
+    if (ulps > 0) {
+      printf("idx: %lu, hessian %g != %g\n", i, h1[i].sum_hessians, h2[i].sum_hessians);
+      //printf("idx: %ld, hessian %g != %g (%d ULPs)\n", i, h1[i].sum_hessians, h2[i].sum_hessians, ulps);
+      // goto err;
+    }
+  }
+  return;
+err:
+  Log::Warning("Mismatched histograms found for feature %d at location %lu.", feature_id, i);
+}
+
+int CUDATreeLearner::GetNumWorkgroupsPerFeature(data_size_t leaf_num_data) {
+
+  // we roughly want 256 workgroups per device, and we have num_dense_feature_groups_ feature groups. 
+ // also guarantee that there are at least 2K examples per workgroup + + double x = 256.0 / num_dense_feature_groups_; + + int exp_workgroups_per_feature = (int)ceil(log2(x)); + double t = leaf_num_data / 1024.0; + + Log::Debug("We can have at most %d workgroups per feature4 for efficiency reasons" + "Best workgroup size per feature for full utilization is %d\n", (int)ceil(t), (1 << exp_workgroups_per_feature)); + + exp_workgroups_per_feature = std::min(exp_workgroups_per_feature, (int)ceil(log((double)t)/log(2.0))); + if (exp_workgroups_per_feature < 0) + exp_workgroups_per_feature = 0; + if (exp_workgroups_per_feature > kMaxLogWorkgroupsPerFeature) + exp_workgroups_per_feature = kMaxLogWorkgroupsPerFeature; + + return exp_workgroups_per_feature; +} + +void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_features) { + + // we have already copied ordered gradients, ordered hessians and indices to GPU + // decide the best number of workgroups working on one feature4 tuple + // set work group size based on feature size + // each 2^exp_workgroups_per_feature workgroups work on a feature4 tuple + + + int exp_workgroups_per_feature = GetNumWorkgroupsPerFeature(leaf_num_data); + std::vector num_gpu_workgroups; + ThreadData *thread_data = (ThreadData*)malloc(sizeof(ThreadData) * num_gpu_); + + for(int device_id = 0; device_id < num_gpu_; ++device_id) { + int num_gpu_feature_groups = num_gpu_feature_groups_[device_id]; + int num_workgroups = (1 << exp_workgroups_per_feature) * num_gpu_feature_groups; + num_gpu_workgroups.push_back(num_workgroups); + if (num_workgroups > preallocd_max_num_wg_[device_id]) { + preallocd_max_num_wg_.at(device_id) = num_workgroups; + CUDASUCCESS_OR_FATAL(cudaFree(device_subhistograms_[device_id])); + cudaMalloc(&(device_subhistograms_[device_id]), (size_t) num_workgroups * dword_features_ * device_bin_size_ * hist_bin_entry_sz_); + } + //set thread_data + SetThreadData(thread_data, device_id, leaf_num_data, use_all_features, + num_workgroups, exp_workgroups_per_feature); + } + + + for(int device_id = 0; device_id < num_gpu_; ++device_id) { + if (pthread_create(cpu_threads_[device_id], NULL, launch_cuda_histogram, (void *)(&thread_data[device_id]))){ + fprintf(stderr, "Error in creating threads. Exiting\n"); + exit(0); + } + } + + /* Wait for the threads to finish */ + + for(int device_id = 0; device_id < num_gpu_; ++device_id) { + if (pthread_join(*(cpu_threads_[device_id]), NULL)){ + fprintf(stderr, "Error in joining threads. Exiting\n"); + exit(0); + } + } + + for(int device_id = 0; device_id < num_gpu_; ++device_id) { + + // copy the results asynchronously. 
Size depends on if double precision is used + + size_t output_size = num_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_; + size_t host_output_offset = offset_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_; + + + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync((char*)host_histogram_outputs_ + host_output_offset, device_histogram_outputs_[device_id], output_size, cudaMemcpyDeviceToHost, stream_[device_id])); + + + CUDASUCCESS_OR_FATAL(cudaEventRecord(histograms_wait_obj_[device_id], stream_[device_id])); + } + +} + + +template +void CUDATreeLearner::WaitAndGetHistograms(HistogramBinEntry* histograms) { + HistType* hist_outputs = (HistType*) host_histogram_outputs_; + + //#pragma omp parallel for schedule(static, num_gpu_) + for(int device_id = 0; device_id < num_gpu_; ++device_id) { + + auto start_time = std::chrono::steady_clock::now(); + + // when the output is ready, the computation is done + CUDASUCCESS_OR_FATAL(cudaEventSynchronize(histograms_wait_obj_[device_id])); + } + + #pragma omp parallel for schedule(static) + for(int i = 0; i < num_dense_feature_groups_; ++i) { + if (!feature_masks_[i]) { + continue; + } + int dense_group_index = dense_feature_group_map_[i]; + auto old_histogram_array = histograms + train_data_->GroupBinBoundary(dense_group_index); + int bin_size = train_data_->FeatureGroupNumBin(dense_group_index); + + for (int j = 0; j < bin_size; ++j) { + old_histogram_array[j].sum_gradients = hist_outputs[i * device_bin_size_+ j].sum_gradients; + old_histogram_array[j].sum_hessians = hist_outputs[i * device_bin_size_ + j].sum_hessians; + old_histogram_array[j].cnt = (data_size_t)hist_outputs[i * device_bin_size_ + j].cnt; + } + } + +} + +// LGBM_CUDA +void CUDATreeLearner::CountDenseFeatureGroups() { + + num_dense_feature_groups_ = 0; + + for (int i = 0; i < num_feature_groups_; ++i) { + if (ordered_bins_[i] == nullptr) { + num_dense_feature_groups_++; + } + } + if (!num_dense_feature_groups_) { + Log::Warning("GPU acceleration is disabled because no non-trival dense features can be found"); + } + +} + +// LGBM_CUDA +void CUDATreeLearner::prevAllocateGPUMemory() { + + + // how many feature-group tuples we have + // leave some safe margin for prefetching + // 256 work-items per workgroup. Each work-item prefetches one tuple for that feature + + allocated_num_data_ = num_data_ + 256 * (1 << kMaxLogWorkgroupsPerFeature); + + // clear sparse/dense maps + + dense_feature_group_map_.clear(); + sparse_feature_group_map_.clear(); + + // do nothing it there is no dense feature + if (!num_dense_feature_groups_) { + return; + } + + // LGBM_CUDA: calculate number of feature groups per gpu + num_gpu_feature_groups_.resize(num_gpu_); + offset_gpu_feature_groups_.resize(num_gpu_); + int num_features_per_gpu = num_dense_feature_groups_ / num_gpu_; + int remain_features = num_dense_feature_groups_ - num_features_per_gpu * num_gpu_; + + int offset = 0; + + for(int i = 0; i < num_gpu_; ++i) { + offset_gpu_feature_groups_.at(i) = offset; + num_gpu_feature_groups_.at(i) = (i < remain_features)? 
num_features_per_gpu + 1 : num_features_per_gpu; + offset += num_gpu_feature_groups_.at(i); + } + +#if 0 + // allocate feature mask, for disabling some feature-groups' histogram calculation + if (feature_masks_.data() != NULL) { + cudaPointerAttributes attributes; + cudaPointerGetAttributes (&attributes, feature_masks_.data()); + + if ((attributes.type == cudaMemoryTypeHost) && (attributes.devicePointer != NULL)){ + CUDASUCCESS_OR_FATAL(cudaHostUnregister(feature_masks_.data())); + } + } +#endif + + feature_masks_.resize(num_dense_feature_groups_); + Log::Debug("Resized feature masks"); + + ptr_pinned_feature_masks_ = feature_masks_.data(); + Log::Debug("Memset pinned_feature_masks_"); + memset(ptr_pinned_feature_masks_, 0, num_dense_feature_groups_); + + // histogram bin entry size depends on the precision (single/double) + hist_bin_entry_sz_ = config_->gpu_use_dp ? sizeof(HistogramBinEntry) : sizeof(GPUHistogramBinEntry); + + // host_size histogram outputs + // host_histogram_outputs_ = malloc(num_dense_feature_groups_ * device_bin_size_ * hist_bin_entry_sz_); + + CUDASUCCESS_OR_FATAL(cudaHostAlloc( (void **)&host_histogram_outputs_, (size_t)(num_dense_feature_groups_ * device_bin_size_ * hist_bin_entry_sz_),cudaHostAllocPortable)); + + // LGBM_CUDA + nthreads_ = std::min(omp_get_max_threads(), num_dense_feature_groups_ / dword_features_); + nthreads_ = std::max(nthreads_, 1); +} + +// LGBM_CUDA: allocate GPU memory for each GPU +void CUDATreeLearner::AllocateGPUMemory() { + + + #pragma omp parallel for schedule(static, num_gpu_) + + for(int device_id = 0; device_id < num_gpu_; ++device_id) { + // do nothing it there is no gpu feature + int num_gpu_feature_groups = num_gpu_feature_groups_[device_id]; + if (num_gpu_feature_groups) { + CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); + + // allocate memory for all features (FIXME: 4 GB barrier on some devices, need to split to multiple buffers) + if ( device_features_[device_id] != NULL ) { + CUDASUCCESS_OR_FATAL(cudaFree(device_features_[device_id])); + } + + CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_features_[device_id]), (size_t)num_gpu_feature_groups * num_data_ * sizeof(uint8_t))); + Log::Debug("Allocated device_features_ addr=%p sz=%lu", device_features_[device_id], num_gpu_feature_groups * num_data_); + + // allocate space for gradients and hessians on device + // we will copy gradients and hessians in after ordered_gradients_ and ordered_hessians_ are constructed + + if (device_gradients_[device_id] != NULL){ + CUDASUCCESS_OR_FATAL(cudaFree(device_gradients_[device_id])); + } + + if (device_hessians_[device_id] != NULL){ + CUDASUCCESS_OR_FATAL(cudaFree(device_hessians_[device_id])); + } + + if (device_feature_masks_[device_id] != NULL){ + CUDASUCCESS_OR_FATAL(cudaFree(device_feature_masks_[device_id])); + } + + CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_gradients_[device_id]), (size_t) allocated_num_data_ * sizeof(score_t))); + CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_hessians_[device_id]), (size_t) allocated_num_data_ * sizeof(score_t))); + + CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_feature_masks_[device_id]), (size_t) num_gpu_feature_groups)); + + // copy indices to the device + + if (device_feature_masks_[device_id] != NULL){ + CUDASUCCESS_OR_FATAL(cudaFree(device_data_indices_[device_id])); + } + + CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_data_indices_[device_id]), (size_t) allocated_num_data_ * sizeof(data_size_t))); + CUDASUCCESS_OR_FATAL(cudaMemsetAsync(device_data_indices_[device_id], 0, allocated_num_data_ * 
sizeof(data_size_t), stream_[device_id])); + + Log::Debug("Memset device_data_indices_"); + + // create output buffer, each feature has a histogram with device_bin_size_ bins, + // each work group generates a sub-histogram of dword_features_ features. + + if (!device_subhistograms_[device_id]) { + + // only initialize once here, as this will not need to change when ResetTrainingData() is called + CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_subhistograms_[device_id]), (size_t) preallocd_max_num_wg_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_)); + + Log::Debug("created device_subhistograms_: %p", device_subhistograms_[device_id]); + + } + + // create atomic counters for inter-group coordination + CUDASUCCESS_OR_FATAL(cudaFree(sync_counters_[device_id])); + CUDASUCCESS_OR_FATAL(cudaMalloc(&(sync_counters_[device_id]), (size_t) num_gpu_feature_groups * sizeof(int))); + CUDASUCCESS_OR_FATAL(cudaMemsetAsync(sync_counters_[device_id], 0, num_gpu_feature_groups * sizeof(int))); + + // The output buffer is allocated to host directly, to overlap compute and data transfer + CUDASUCCESS_OR_FATAL(cudaFree(device_histogram_outputs_[device_id])); + CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_histogram_outputs_[device_id]), (size_t) num_gpu_feature_groups * device_bin_size_ * hist_bin_entry_sz_)); + } + } + +} + +void CUDATreeLearner::ResetGPUMemory() { + + // clear sparse/dense maps + dense_feature_group_map_.clear(); + sparse_feature_group_map_.clear(); + +} + +// LGBM_CUDA +void CUDATreeLearner::copyDenseFeature() { + + if (num_feature_groups_ == 0){ + LGBM_config_::current_learner=use_cpu_learner; + return; + } + +// auto start_time = std::chrono::steady_clock::now(); + Log::Debug("Started copying dense features from CPU to GPU"); + // find the dense feature-groups and group then into Feature4 data structure (several feature-groups packed into 4 bytes) + size_t copied_feature = 0; + // set device info + int device_id = 0; + uint8_t* device_features = device_features_[device_id]; + Log::Debug("Started copying dense features from CPU to GPU - 1"); + + for (int i = 0; i < num_feature_groups_; ++i) { + // looking for dword_features_ non-sparse feature-groups + if (ordered_bins_[i] == nullptr) { + dense_feature_group_map_.push_back(i); + auto sizes_in_byte = train_data_->FeatureGroupSizesInByte(i); + void* tmp_data = train_data_->FeatureGroupData(i); + Log::Debug("Started copying dense features from CPU to GPU - 2"); + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(&device_features[copied_feature * num_data_], tmp_data, sizes_in_byte, cudaMemcpyHostToDevice, stream_[device_id])); + Log::Debug("Started copying dense features from CPU to GPU - 3"); + copied_feature++; + // reset device info + if(copied_feature == (size_t) num_gpu_feature_groups_[device_id]) { + CUDASUCCESS_OR_FATAL(cudaEventRecord(features_future_[device_id], stream_[device_id])); + device_id += 1; + copied_feature = 0; + if(device_id < num_gpu_) { + device_features = device_features_[device_id]; + //CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); + } + } + } + else { + sparse_feature_group_map_.push_back(i); + } + } + + // data transfer time // LGBM_CUDA: async copy, so it is not the real data transfer time + // std::chrono::duration end_time = std::chrono::steady_clock::now() - start_time; + +} + + + +// LGBM_CUDA: InitGPU w/ num_gpu +void CUDATreeLearner::InitGPU(int num_gpu) { + + // Get the max bin size, used for selecting best GPU kernel + + max_num_bin_ = 0; + + #if GPU_DEBUG >= 1 + printf("bin_size: "); + #endif + for (int i 
= 0; i < num_feature_groups_; ++i) { + #if GPU_DEBUG >= 1 + printf("%d, ", train_data_->FeatureGroupNumBin(i)); + #endif + max_num_bin_ = std::max(max_num_bin_, train_data_->FeatureGroupNumBin(i)); + } + + if (max_num_bin_ <= 16) { + device_bin_size_ = 256; //LGBM_CUDA + dword_features_ = 1; // LGBM_CUDA + } + else if (max_num_bin_ <= 64) { + device_bin_size_ = 256; //LGBM_CUDA + dword_features_ = 1; // LGBM_CUDA + } + else if ( max_num_bin_ <= 256) { + Log::Debug("device_bin_size_ = 256"); + device_bin_size_ = 256; + dword_features_ = 1; // LGBM_CUDA + } + else { + Log::Fatal("bin size %d cannot run on GPU", max_num_bin_); + } + if(max_num_bin_ == 65) { + Log::Warning("Setting max_bin to 63 is sugguested for best performance"); + } + if(max_num_bin_ == 17) { + Log::Warning("Setting max_bin to 15 is sugguested for best performance"); + } + + // LGBM_CUDA: get num_dense_feature_groups_ + CountDenseFeatureGroups(); + + + if (num_gpu > num_dense_feature_groups_) num_gpu = num_dense_feature_groups_; + + // LGBM_CUDA: initialize GPU + int gpu_count; + + CUDASUCCESS_OR_FATAL(cudaGetDeviceCount(&gpu_count)); + num_gpu_ = (gpu_count < num_gpu)? gpu_count : num_gpu; + + // LGBM_CUDA: set cpu threads + cpu_threads_ = (pthread_t **)malloc(sizeof(pthread_t *)*num_gpu_); + for(int device_id = 0; device_id < num_gpu_; ++device_id) { + cpu_threads_[device_id] = (pthread_t *)malloc(sizeof(pthread_t)); + } + + // LGBM_CUDA: resize device memory pointers + device_features_.resize(num_gpu_); + device_gradients_.resize(num_gpu_); + device_hessians_.resize(num_gpu_); + device_feature_masks_.resize(num_gpu_); + device_data_indices_.resize(num_gpu_); + sync_counters_.resize(num_gpu_); + device_subhistograms_.resize(num_gpu_); + device_histogram_outputs_.resize(num_gpu_); + + // LGBM_CUDA: create stream & events to handle multiple GPUs + preallocd_max_num_wg_.resize(num_gpu_, 1024); + stream_.resize(num_gpu_); + hessians_future_.resize(num_gpu_); + gradients_future_.resize(num_gpu_); + indices_future_.resize(num_gpu_); + features_future_.resize(num_gpu_); + kernel_start_.resize(num_gpu_); + kernel_wait_obj_.resize(num_gpu_); + histograms_wait_obj_.resize(num_gpu_); + + // for debuging + kernel_time_.resize(num_gpu_, 0); + kernel_input_wait_time_.resize(num_gpu_, std::chrono::milliseconds(0)); + + for(int i = 0; i < num_gpu_; ++i) { + CUDASUCCESS_OR_FATAL(cudaSetDevice(i)); + CUDASUCCESS_OR_FATAL(cudaStreamCreate(&(stream_[i]))); + CUDASUCCESS_OR_FATAL(cudaEventCreate(&(hessians_future_[i]))); + CUDASUCCESS_OR_FATAL(cudaEventCreate(&(gradients_future_[i]))); + CUDASUCCESS_OR_FATAL(cudaEventCreate(&(indices_future_[i]))); + CUDASUCCESS_OR_FATAL(cudaEventCreate(&(features_future_[i]))); + CUDASUCCESS_OR_FATAL(cudaEventCreate(&(kernel_start_[i]))); + CUDASUCCESS_OR_FATAL(cudaEventCreate(&(kernel_wait_obj_[i]))); + CUDASUCCESS_OR_FATAL(cudaEventCreate(&(histograms_wait_obj_[i]))); + } + + prevAllocateGPUMemory(); + + AllocateGPUMemory(); + + // LGBM_CUDA: copy dense feature data from cpu to gpu only when we use entire training data for training + + if (!is_use_subset_) { + Log::Debug("copyDenseFeature at the initialization\n"); + copyDenseFeature(); // LGBM_CUDA + } + +} + +Tree* CUDATreeLearner::Train(const score_t* gradients, const score_t *hessians, + bool is_constant_hessian, Json& forced_split_json) { + + // check if we need to recompile the GPU kernel (is_constant_hessian changed) + // this should rarely occur + + if (is_constant_hessian != is_constant_hessian_) { + Log::Debug("Recompiling GPU kernel because 
hessian is %sa constant now", is_constant_hessian ? "" : "not "); + is_constant_hessian_ = is_constant_hessian; + } + + Tree *ret = SerialTreeLearner::Train(gradients, hessians, is_constant_hessian, forced_split_json); + + return ret; +} + +void CUDATreeLearner::ResetTrainingData(const Dataset* train_data) { + + // LGBM_CUDA: check data size + data_size_t old_num_data = num_data_; + + SerialTreeLearner::ResetTrainingData(train_data); + + #if ResetTrainingData_DEBUG == 1 // LGBM_CUDA + serial_time = std::chrono::steady_clock::now() - start_serial_time; + #endif + + num_feature_groups_ = train_data_->num_feature_groups(); + + // GPU memory has to been reallocated because data may have been changed + + #if ResetTrainingData_DEBUG == 1 // LGBM_CUDA + auto start_alloc_gpu_time = std::chrono::steady_clock::now(); + #endif + + // LGBM_CUDA: AllocateGPUMemory only when the number of data increased + + int old_num_feature_groups = num_dense_feature_groups_; + CountDenseFeatureGroups(); + if ((old_num_data < num_data_) && (old_num_feature_groups < num_dense_feature_groups_)) { + prevAllocateGPUMemory(); + AllocateGPUMemory(); + } else { + ResetGPUMemory(); + } + + copyDenseFeature(); + + #if ResetTrainingData_DEBUG == 1 // LGBM_CUDA + alloc_gpu_time = std::chrono::steady_clock::now() - start_alloc_gpu_time; + #endif + + // setup GPU kernel arguments after we allocating all the buffers + + #if ResetTrainingData_DEBUG == 1 // LGBM_CUDA + auto start_set_arg_time = std::chrono::steady_clock::now(); + #endif + + #if ResetTrainingData_DEBUG == 1 // LGBM_CUDA + set_arg_time = std::chrono::steady_clock::now() - start_set_arg_time; + reset_training_data_time = std::chrono::steady_clock::now() - start_reset_training_data_time; + Log::Info("reset_training_data_time: %f secs.", reset_training_data_time.count() * 1e-3); + Log::Info("serial_time: %f secs.", serial_time.count() * 1e-3); + Log::Info("alloc_gpu_time: %f secs.", alloc_gpu_time.count() * 1e-3); + Log::Info("set_arg_time: %f secs.", set_arg_time.count() * 1e-3); + #endif +} + +void CUDATreeLearner::BeforeTrain() { + + #if cudaMemcpy_DEBUG == 1 // LGBM_CUDA + std::chrono::duration device_hessians_time = std::chrono::milliseconds(0); + std::chrono::duration device_gradients_time = std::chrono::milliseconds(0); + #endif + + SerialTreeLearner::BeforeTrain(); + + #if GPU_DEBUG >= 2 + printf("CUDATreeLearner::BeforeTrain() Copying initial full gradients and hessians to device\n"); + #endif + + // Copy initial full hessians and gradients to GPU. + // We start copying as early as possible, instead of at ConstructHistogram(). 
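+  // The copies below are issued with cudaMemcpyAsync on each device's own stream, so the
+  // histogram kernels later launched on that same stream are ordered after them; the
+  // recorded gradients_future_/hessians_future_ events mark when each transfer completes.
+  // When the hessian is constant, only gradients are copied and hessians_[0] is passed to
+  // the kernel by value (see SetThreadData).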
+ + if ((hessians_ != NULL) && (gradients_ != NULL)){ + if (!use_bagging_ && num_dense_feature_groups_) { + + Log::Debug("CudaTreeLearner::BeforeTrain() No baggings, dense_feature_groups_=%d", num_dense_feature_groups_); + + for(int device_id = 0; device_id < num_gpu_; ++device_id) { + if (!is_constant_hessian_) { + Log::Debug("CUDATreeLearner::BeforeTrain(): Starting hessians_ -> device_hessians_"); + + #if cudaMemcpy_DEBUG == 1 // LGBM_CUDA + auto start_device_hessians_time = std::chrono::steady_clock::now(); + #endif + + //const data_size_t* indices = data_partition_->indices(); + //data_size_t cnt = data_partition_->leaf_count(0); + + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_hessians_[device_id], hessians_, num_data_*sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); + + CUDASUCCESS_OR_FATAL(cudaEventRecord(hessians_future_[device_id], stream_[device_id])); + + #if cudaMemcpy_DEBUG == 1 // LGBM_CUDA + device_hessians_time = std::chrono::steady_clock::now() - start_device_hessians_time; + #endif + + Log::Debug("queued copy of device_hessians_"); + } + + #if cudaMemcpy_DEBUG == 1 // LGBM_CUDA + auto start_device_gradients_time = std::chrono::steady_clock::now(); + #endif + + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_gradients_[device_id], gradients_, num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); + CUDASUCCESS_OR_FATAL(cudaEventRecord(gradients_future_[device_id], stream_[device_id])); + + #if cudaMemcpy_DEBUG == 1 // LGBM_CUDA + device_gradients_time = std::chrono::steady_clock::now() - start_device_gradients_time; + #endif + + Log::Debug("CUDATreeLearner::BeforeTrain: issued gradients_ -> device_gradients_"); + } + } + } + +#if 0 + SerialTreeLearner::BeforeTrain(); +#endif + + // use bagging + if ((hessians_ != NULL) && (gradients_ != NULL)){ + if (data_partition_->leaf_count(0) != num_data_ && num_dense_feature_groups_) { + + // On GPU, we start copying indices, gradients and hessians now, instead at ConstructHistogram() + // copy used gradients and hessians to ordered buffer + + const data_size_t* indices = data_partition_->indices(); + data_size_t cnt = data_partition_->leaf_count(0); + + // transfer the indices to GPU + for(int device_id = 0; device_id < num_gpu_; ++device_id) { + + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_data_indices_[device_id], indices, cnt * sizeof(*indices), cudaMemcpyHostToDevice, stream_[device_id])); + CUDASUCCESS_OR_FATAL(cudaEventRecord(indices_future_[device_id], stream_[device_id])); + + if (!is_constant_hessian_) { + + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_hessians_[device_id], hessians_, num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); + CUDASUCCESS_OR_FATAL(cudaEventRecord(hessians_future_[device_id], stream_[device_id])); + + } + + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_gradients_[device_id], gradients_, num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); + CUDASUCCESS_OR_FATAL(cudaEventRecord(gradients_future_[device_id], stream_[device_id])); + } + + } + } + +} + +bool CUDATreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) { + + int smaller_leaf; + + data_size_t num_data_in_left_child = GetGlobalDataCountInLeaf(left_leaf); + data_size_t num_data_in_right_child = GetGlobalDataCountInLeaf(right_leaf); + + + // only have root + if (right_leaf < 0) { + smaller_leaf = -1; + } else if (num_data_in_left_child < num_data_in_right_child) { + smaller_leaf = left_leaf; + } else { + smaller_leaf = right_leaf; + } + + // 
Copy indices, gradients and hessians as early as possible + if (smaller_leaf >= 0 && num_dense_feature_groups_) { + // only need to initialize for smaller leaf + // Get leaf boundary + const data_size_t* indices = data_partition_->indices(); + data_size_t begin = data_partition_->leaf_begin(smaller_leaf); + data_size_t end = begin + data_partition_->leaf_count(smaller_leaf); + + // copy indices to the GPU: + #if GPU_DEBUG >= 2 + #endif + + for(int device_id = 0; device_id < num_gpu_; ++device_id) { + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_data_indices_[device_id], &indices[begin], (end-begin) * sizeof(data_size_t), cudaMemcpyHostToDevice, stream_[device_id])); + CUDASUCCESS_OR_FATAL(cudaEventRecord(indices_future_[device_id], stream_[device_id])); + } + } + + const bool ret = SerialTreeLearner::BeforeFindBestSplit(tree, left_leaf, right_leaf); + + return ret; +} + +bool CUDATreeLearner::ConstructGPUHistogramsAsync( + const std::vector& is_feature_used, + const data_size_t* data_indices, data_size_t num_data) { + + + if (num_data <= 0) { + return false; + } + + + // do nothing if no features can be processed on GPU + if (!num_dense_feature_groups_) { + Log::Debug("no dense feature groups, returning"); + return false; + } + + // copy data indices if it is not null + if (data_indices != nullptr && num_data != num_data_) { + + for(int device_id = 0; device_id < num_gpu_; ++device_id) { + + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_data_indices_[device_id], data_indices, num_data * sizeof(data_size_t), cudaMemcpyHostToDevice, stream_[device_id])); + CUDASUCCESS_OR_FATAL(cudaEventRecord(indices_future_[device_id], stream_[device_id])); + + } + } + + // converted indices in is_feature_used to feature-group indices + std::vector is_feature_group_used(num_feature_groups_, 0); + + #pragma omp parallel for schedule(static,1024) if (num_features_ >= 2048) + for (int i = 0; i < num_features_; ++i) { + if(is_feature_used[i]) { + int feature_group = train_data_->Feature2Group(i); // LGBM_CUDA + is_feature_group_used[feature_group] = (train_data_->FeatureGroupNumBin(feature_group)<=16)? 2 : 1; // LGBM_CUDA + } + } + + // construct the feature masks for dense feature-groups + int used_dense_feature_groups = 0; + #pragma omp parallel for schedule(static,1024) reduction(+:used_dense_feature_groups) if (num_dense_feature_groups_ >= 2048) + for (int i = 0; i < num_dense_feature_groups_; ++i) { + if (is_feature_group_used[dense_feature_group_map_[i]]) { + //feature_masks_[i] = 1; + feature_masks_[i] = is_feature_group_used[dense_feature_group_map_[i]]; + ++used_dense_feature_groups; + } + else { + feature_masks_[i] = 0; + } + } + bool use_all_features = used_dense_feature_groups == num_dense_feature_groups_; + // if no feature group is used, just return and do not use GPU + if (used_dense_feature_groups == 0) { + return false; + } + +#if GPU_DEBUG >= 1 + printf("CudaTreeLearner::ConstructGPUHistogramsAsync() Feature masks: "); + for (unsigned int i = 0; i < feature_masks_.size(); ++i) { + printf("%d ", feature_masks_[i]); + } + printf("\n"); + printf("CudaTreeLearner::ConstructGPUHistogramsAsync() %d feature groups, %d used, %d\n", num_dense_feature_groups_, used_dense_feature_groups, use_all_features); +#endif + + // if not all feature groups are used, we need to transfer the feature mask to GPU + // otherwise, we will use a specialized GPU kernel with all feature groups enabled + // LGBM_CUDA FIXME: No waiting mark for feature mask + + // LGBM_CUDA We now copy even if all features are used. 
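+  // feature_masks_ holds one byte per dense feature group: 0 = skip, 1 = use, and 2 marks a
+  // small group (<= 16 bins), mirroring is_feature_group_used above. Each device receives its
+  // slice [offset, offset + num_gpu_feature_groups_[device_id]) via a blocking cudaMemcpy;
+  // use_all_features then only decides whether the specialized no-mask kernel can be used.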
+ + //#pragma omp parallel for schedule(static, num_gpu_) + for(int device_id = 0; device_id < num_gpu_; ++device_id) { + int offset = offset_gpu_feature_groups_[device_id]; + CUDASUCCESS_OR_FATAL(cudaMemcpy(device_feature_masks_[device_id], ptr_pinned_feature_masks_ + offset, num_gpu_feature_groups_[device_id] , cudaMemcpyHostToDevice)); + //CUDASUCCESS_OR_FATAL(cudaMemcpy(device_feature_masks_[device_id], ptr_pinned_feature_masks_ + offset, num_gpu_feature_groups_[device_id] , cudaMemcpyHostToDevice, stream_[device_id])); + } + + // All data have been prepared, now run the GPU kernel + + GPUHistogram(num_data, use_all_features); + + return true; +} + +void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_used, bool use_subtract) { + + //LGBM_CUDA + auto start_time = std::chrono::steady_clock::now(); + + std::vector is_sparse_feature_used(num_features_, 0); + std::vector is_dense_feature_used(num_features_, 0); + int num_dense_features=0, num_sparse_features=0; + + #pragma omp parallel for schedule(static) + for (int feature_index = 0; feature_index < num_features_; ++feature_index) { + + if (!is_feature_used_[feature_index]) continue; + if (!is_feature_used[feature_index]) continue; + if (ordered_bins_[train_data_->Feature2Group(feature_index)]) { + is_sparse_feature_used[feature_index] = 1; + num_sparse_features++; + } + else { + is_dense_feature_used[feature_index] = 1; + num_dense_features++; + } + } + + // construct smaller leaf + HistogramBinEntry* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - 1; + + // Check workgroups per feature4 tuple.. + int exp_workgroups_per_feature = GetNumWorkgroupsPerFeature(smaller_leaf_splits_->num_data_in_leaf()); + + // if the workgroup per feature is 1 (2^0), return as the work is too small for a GPU + if (exp_workgroups_per_feature == 0){ + return SerialTreeLearner::ConstructHistograms(is_feature_used, use_subtract); + } + + // ConstructGPUHistogramsAsync will return true if there are availabe feature gourps dispatched to GPU + bool is_gpu_used = ConstructGPUHistogramsAsync(is_feature_used, + nullptr, smaller_leaf_splits_->num_data_in_leaf()); + + // then construct sparse features on CPU + // We set data_indices to null to avoid rebuilding ordered gradients/hessians + if (num_sparse_features > 0){ + train_data_->ConstructHistograms(is_sparse_feature_used, + nullptr, smaller_leaf_splits_->num_data_in_leaf(), + smaller_leaf_splits_->LeafIndex(), + ordered_bins_, gradients_, hessians_, + ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_, + ptr_smaller_leaf_hist_data); + } + + // wait for GPU to finish, only if GPU is actually used + if (is_gpu_used) { + if (config_->gpu_use_dp) { + // use double precision + WaitAndGetHistograms(ptr_smaller_leaf_hist_data); + } + else { + // use single precision + WaitAndGetHistograms(ptr_smaller_leaf_hist_data); + } + } + + // Compare GPU histogram with CPU histogram, useful for debuggin GPU code problem + // #define GPU_DEBUG_COMPARE +#ifdef GPU_DEBUG_COMPARE + printf("Start Comparing_Histogram between GPU and CPU num_dense_feature_groups_=%d\n",num_dense_feature_groups_); + bool compare = true; + for (int i = 0; i < num_dense_feature_groups_; ++i) { + if (!feature_masks_[i]) + continue; + int dense_feature_group_index = dense_feature_group_map_[i]; + size_t size = train_data_->FeatureGroupNumBin(dense_feature_group_index); + HistogramBinEntry* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - 1; + HistogramBinEntry* 
current_histogram = ptr_smaller_leaf_hist_data + train_data_->GroupBinBoundary(dense_feature_group_index); + HistogramBinEntry* gpu_histogram = new HistogramBinEntry[size]; + data_size_t num_data = smaller_leaf_splits_->num_data_in_leaf(); + + std::copy(current_histogram, current_histogram + size, gpu_histogram); + std::memset(current_histogram, 0, train_data_->FeatureGroupNumBin(dense_feature_group_index) * sizeof(HistogramBinEntry)); + if ( num_data == num_data_ ) { + if ( is_constant_hessian_ ) { + printf("ConstructHistogram(): num_data == num_data_ is_constant_hessian_"); + train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( + num_data, + gradients_, + current_histogram); + } else { + printf("ConstructHistogram(): num_data == num_data_ "); + train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( + num_data, + gradients_, hessians_, + current_histogram); + } + } else { + if ( is_constant_hessian_ ) { + printf("ConstructHistogram(): is_constant_hessian_"); + train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( + smaller_leaf_splits_->data_indices(), + num_data, + ordered_gradients_.data(), + current_histogram); + } else { + printf("ConstructHistogram(): 4"); + train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( + smaller_leaf_splits_->data_indices(), + num_data, + ordered_gradients_.data(), ordered_hessians_.data(), + current_histogram); + } + } + if ( (num_data != num_data_) && compare ) { + CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index); + compare = false; + } + CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index); + std::copy(gpu_histogram, gpu_histogram + size, current_histogram); + delete [] gpu_histogram; + //break; // LGBM_CUDA: see only first feature info + } + printf("End Comparing Histogram between GPU and CPU\n"); +// #endif +#endif + + if (larger_leaf_histogram_array_ != nullptr && !use_subtract) { + + // construct larger leaf + + HistogramBinEntry* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - 1; + + is_gpu_used = ConstructGPUHistogramsAsync(is_feature_used, + larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf()); + + // then construct sparse features on CPU + // We set data_indices to null to avoid rebuilding ordered gradients/hessians + + if (num_sparse_features > 0){ + train_data_->ConstructHistograms(is_sparse_feature_used, + nullptr, larger_leaf_splits_->num_data_in_leaf(), + larger_leaf_splits_->LeafIndex(), + ordered_bins_, gradients_, hessians_, + ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_, + ptr_larger_leaf_hist_data); + } + + // wait for GPU to finish, only if GPU is actually used + + if (is_gpu_used) { + if (config_->gpu_use_dp) { + // use double precision + WaitAndGetHistograms(ptr_larger_leaf_hist_data); + } + else { + // use single precision + WaitAndGetHistograms(ptr_larger_leaf_hist_data); + } + } + } +} + +void CUDATreeLearner::FindBestSplits() { + + SerialTreeLearner::FindBestSplits(); + +#if GPU_DEBUG >= 3 + for (int feature_index = 0; feature_index < num_features_; ++feature_index) { + if (!is_feature_used_[feature_index]) continue; + if (parent_leaf_histogram_array_ != nullptr + && !parent_leaf_histogram_array_[feature_index].is_splittable()) { + smaller_leaf_histogram_array_[feature_index].set_is_splittable(false); + continue; + } + size_t bin_size = train_data_->FeatureNumBin(feature_index) + 1; + 
printf("CUDATreeLearner::FindBestSplits() Feature %d bin_size=%d smaller leaf:\n", feature_index, bin_size); + PrintHistograms(smaller_leaf_histogram_array_[feature_index].RawData() - 1, bin_size); + if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->LeafIndex() < 0) { continue; } + printf("CUDATreeLearner::FindBestSplits() Feature %d bin_size=%d larger leaf:\n", feature_index, bin_size); + + PrintHistograms(larger_leaf_histogram_array_[feature_index].RawData() - 1, bin_size); + } +#endif +} + +void CUDATreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) { + const SplitInfo& best_split_info = best_split_per_leaf_[best_Leaf]; + +#if GPU_DEBUG >= 2 + printf("Splitting leaf %d with feature %d thresh %d gain %f stat %f %f %f %f\n", best_Leaf, best_split_info.feature, best_split_info.threshold, best_split_info.gain, best_split_info.left_sum_gradient, best_split_info.right_sum_gradient, best_split_info.left_sum_hessian, best_split_info.right_sum_hessian); +#endif + + SerialTreeLearner::Split(tree, best_Leaf, left_leaf, right_leaf); + + if (Network::num_machines() == 1) { + // do some sanity check for the GPU algorithm + if (best_split_info.left_count < best_split_info.right_count) { + if ((best_split_info.left_count != smaller_leaf_splits_->num_data_in_leaf()) || + (best_split_info.right_count!= larger_leaf_splits_->num_data_in_leaf())) { + Log::Fatal("1 Bug in GPU histogram! split %d: %d, smaller_leaf: %d, larger_leaf: %d\n", best_split_info.left_count, best_split_info.right_count, smaller_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf()); + } + } else { + double smaller_min = smaller_leaf_splits_->min_constraint(); + double smaller_max = smaller_leaf_splits_->max_constraint(); + double larger_min = larger_leaf_splits_->min_constraint(); + double larger_max = larger_leaf_splits_->max_constraint(); + smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), best_split_info.right_sum_gradient, best_split_info.right_sum_hessian); + larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, best_split_info.left_sum_hessian); + smaller_leaf_splits_->SetValueConstraint(smaller_min, smaller_max); + larger_leaf_splits_->SetValueConstraint(larger_min, larger_max); + if ((best_split_info.left_count != larger_leaf_splits_->num_data_in_leaf()) || + (best_split_info.right_count!= smaller_leaf_splits_->num_data_in_leaf())) { + Log::Fatal("2 Bug in GPU histogram! split %d: %d, smaller_leaf: %d, larger_leaf: %d\n", best_split_info.left_count, best_split_info.right_count, smaller_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf()); + } + } + } + +} + +} // namespace LightGBM +#undef cudaMemcpy_DEBUG +#endif // USE_CUDA diff --git a/src/treelearner/cuda_tree_learner.h b/src/treelearner/cuda_tree_learner.h new file mode 100644 index 00000000000..e5a24aeb8f5 --- /dev/null +++ b/src/treelearner/cuda_tree_learner.h @@ -0,0 +1,315 @@ +#pragma once +#ifndef LGBM_TREELEARNER_CUDA_TREE_LEARNER_H_ +#define LGBM_TREELEARNET_CUDA_TREE_LEARNER_H_ + +#include +#include +#include +#include +#include +#include "feature_histogram.hpp" +#include "serial_tree_learner.h" +#include "data_partition.hpp" +#include "split_info.hpp" +#include "leaf_splits.hpp" + +#include +#include +#include +#include +#include + +#ifdef USE_CUDA + +#include +#include "cuda_kernel_launcher.h" // LGBM_CUDA +#include + + +using namespace json11; + +namespace LightGBM { + +/*! +* \brief CUDA-based parallel learning algorithm. 
+*/ +class CUDATreeLearner: public SerialTreeLearner { +public: + explicit CUDATreeLearner(const Config* tree_config); + ~CUDATreeLearner(); + // LGBM_CUDA: is_use_subset is used by CUDA only + void Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) override; + void ResetTrainingData(const Dataset* train_data) override; + Tree* Train(const score_t* gradients, const score_t *hessians, + bool is_constant_hessian, Json& forced_split_json) override; + + void SetBaggingData(const data_size_t* used_indices, data_size_t num_data) override { + SerialTreeLearner::SetBaggingData(used_indices, num_data); + // determine if we are using bagging before we construct the data partition + // thus we can start data movement to GPU earlier + if (used_indices != nullptr) { + if (num_data != num_data_) { + use_bagging_ = true; + return; + } + } + use_bagging_ = false; + } + +protected: + void BeforeTrain() override; + bool BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) override; + void FindBestSplits() override; + void Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) override; + void ConstructHistograms(const std::vector& is_feature_used, bool use_subtract) override; +private: + /*! \brief 4-byte feature tuple used by GPU kernels */ + //struct Feature4 { + // uint8_t s[4]; + //}; + + /*! \brief Single precision histogram entry for GPU */ + struct GPUHistogramBinEntry { + score_t sum_gradients; + score_t sum_hessians; + uint32_t cnt; + }; + + + /*! + * \brief Find the best number of workgroups processing one feature for maximizing efficiency + * \param leaf_num_data The number of data examples on the current leaf being processed + * \return Log2 of the best number for workgroups per feature, in range 0...kMaxLogWorkgroupsPerFeature + */ + int GetNumWorkgroupsPerFeature(data_size_t leaf_num_data); + + /*! + * \brief Initialize GPU device + * \LGBM_CUDA: param num_gpu: number of maximum gpus + */ + void InitGPU(int num_gpu); + + /*! + * \brief Allocate memory for GPU computation // LGBM_CUDA: alloc only + */ + void CountDenseFeatureGroups(); // compute num_dense_feature_group + void prevAllocateGPUMemory(); // compute CPU-side param calculation & Pin HostMemory + void AllocateGPUMemory(); + + /*! + * \ LGBM_CUDA: ResetGPUMemory + */ + void ResetGPUMemory(); + + /*! + * \ LGBM_CUDA: copy dense feature from CPU to GPU + */ + void copyDenseFeature(); + + + /*! + * \brief Compute GPU feature histogram for the current leaf. + * Indices, gradients and hessians have been copied to the device. 
+ * \param leaf_num_data Number of data on current leaf + * \param use_all_features Set to true to not use feature masks, with a faster kernel + */ + void GPUHistogram(data_size_t leaf_num_data, bool use_all_features); + + void SetThreadData(ThreadData* thread_data, int device_id, + int leaf_num_data, bool use_all_features, + int num_workgroups, int exp_workgroups_per_feature) { + ThreadData* td = &thread_data[device_id]; + td->device_id = device_id; + td->leaf_num_data = leaf_num_data; + td->num_data = num_data_; + td->use_all_features = use_all_features; + td->is_constant_hessian = is_constant_hessian_; + td->num_workgroups = num_workgroups; + td->stream = stream_[device_id]; + td->device_features = device_features_[device_id]; + td->device_feature_masks = reinterpret_cast(device_feature_masks_[device_id]); + td->device_data_indices = reinterpret_cast(device_data_indices_[device_id]); + td->device_gradients = device_gradients_[device_id]; + td->device_hessians = device_hessians_[device_id]; + td->hessians_const = hessians_[0]; + td->device_subhistograms = device_subhistograms_[device_id]; + td->sync_counters = sync_counters_[device_id]; + td->device_histogram_outputs= device_histogram_outputs_[device_id]; + td->exp_workgroups_per_feature = exp_workgroups_per_feature; + + td->kernel_start = &(kernel_start_[device_id]); + td->kernel_wait_obj = &(kernel_wait_obj_[device_id]); + td->kernel_input_wait_time = &(kernel_input_wait_time_[device_id]); + + size_t output_size = num_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_; + size_t host_output_offset = offset_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_; + td->output_size = output_size; + td->host_histogram_output = (char*)host_histogram_outputs_ + host_output_offset; + td->histograms_wait_obj = &(histograms_wait_obj_[device_id]); + } + + + // LGBM_CUDA: thread work + //typedef void * (*THREADFUNCPTR)(void *); + //void* launch_gpu_kernel(void *td); + + /*! + * \brief Wait for GPU kernel execution and read histogram + * \param histograms Destination of histogram results from GPU. + */ + template + void WaitAndGetHistograms(HistogramBinEntry* histograms); + + /*! + * \brief Construct GPU histogram asynchronously. + * Interface is similar to Dataset::ConstructHistograms(). + * \param is_feature_used A predicate vector for enabling each feature + * \param data_indices Array of data example IDs to be included in histogram, will be copied to GPU. + * Set to nullptr to skip copy to GPU. + * \param num_data Number of data examples to be included in histogram + * \param gradients Array of gradients for all examples. + * \param hessians Array of hessians for all examples. + * \param ordered_gradients Ordered gradients will be generated and copied to GPU when gradients is not nullptr, + * Set gradients to nullptr to skip copy to GPU. + * \param ordered_hessians Ordered hessians will be generated and copied to GPU when hessians is not nullptr, + * Set hessians to nullptr to skip copy to GPU. + * \return true if GPU kernel is launched, false if GPU is not used + */ + // LGBM_CUDA v5.2 + bool ConstructGPUHistogramsAsync( + const std::vector& is_feature_used, + const data_size_t* data_indices, data_size_t num_data); + + + /*! brief Log2 of max number of workgroups per feature*/ + const int kMaxLogWorkgroupsPerFeature = 10; // 2^10 + /*! brief Max total number of workgroups with preallocated workspace. 
+ * If we use more than this number of workgroups, we have to reallocate subhistograms */ + //int preallocd_max_num_wg_ = 1024; + std::vector preallocd_max_num_wg_; + + /*! \brief True if bagging is used */ + bool use_bagging_; + + /*! \brief GPU device object */ + //int* dev_; + /*! \brief GPU command queue object */ + std::vector stream_; + + /*! \brief total number of feature-groups */ + int num_feature_groups_; + /*! \brief total number of dense feature-groups, which will be processed on GPU */ + int num_dense_feature_groups_; + std::vector num_gpu_feature_groups_; // LGBM_CUDA + std::vector offset_gpu_feature_groups_; // LGBM_CUDA + /*! \brief On GPU we read one DWORD (4-byte) of features of one example once. + * With bin size > 16, there are 4 features per DWORD. + * With bin size <=16, there are 8 features per DWORD. + * */ + int dword_features_; + /*! \brief total number of dense feature-group tuples on GPU. + * Each feature tuple is 4-byte (4 features if each feature takes a byte) */ + //int num_dense_feature4_; + /*! \brief Max number of bins of training data, used to determine + * which GPU kernel to use */ + int max_num_bin_; + /*! \brief Used GPU kernel bin size (64, 256) */ + int device_bin_size_; + /*! \brief Size of histogram bin entry, depending if single or double precision is used */ + size_t hist_bin_entry_sz_; + /*! \brief Indices of all dense feature-groups */ + std::vector dense_feature_group_map_; + /*! \brief Indices of all sparse feature-groups */ + std::vector sparse_feature_group_map_; + /*! \brief Multipliers of all dense feature-groups, used for redistributing bins */ + //std::vector device_bin_mults_; + /*! \brief GPU memory object holding the training data */ + //uint8_t *device_features_; + std::vector device_features_; + /*! \brief GPU memory object holding the ordered gradient */ + //score_t *device_gradients_; + std::vector device_gradients_; + /*! \brief Pointer to pinned memory of ordered gradient */ + void * ptr_pinned_gradients_ = nullptr; + /*! \brief GPU memory object holding the ordered hessian */ + //score_t *device_hessians_; + std::vector device_hessians_; + /*! \brief Pointer to pinned memory of ordered hessian */ + void * ptr_pinned_hessians_ = nullptr; + /*! \brief A vector of feature mask. 1 = feature used, 0 = feature not used */ +// std::vector> feature_masks_; + std::vector feature_masks_; + /*! \brief GPU memory object holding the feature masks */ + //void *device_feature_masks_; + std::vector device_feature_masks_; + /*! \brief Pointer to pinned memory of feature masks */ + char* ptr_pinned_feature_masks_ = nullptr; + /*! \brief GPU memory object holding indices of the leaf being processed */ + //data_size_t *device_data_indices_; + std::vector device_data_indices_; + /*! \brief GPU memory object holding counters for workgroup coordination */ + //int *sync_counters_; + std::vector sync_counters_; + /*! \brief GPU memory object holding temporary sub-histograms per workgroup */ + //char *device_subhistograms_; + std::vector device_subhistograms_; + /*! \brief Host memory object for histogram output (GPU will write to Host memory directly) */ + // FIXME: is this cuda mapped + //void *device_histogram_outputs_; + std::vector device_histogram_outputs_; + /*! \brief Host memory pointer for histogram outputs */ + void *host_histogram_outputs_; + /*! \LGBM_CUDA: CUDA waitlist object for waiting for data transfer before kernel execution */ + //cudaEvent_t kernel_wait_obj_; + std::vector kernel_wait_obj_; + /*! 
\LGBM_CUDA: CUDA waitlist object for reading output histograms after kernel execution */ + //cudaEvent_t histograms_wait_obj_; + std::vector histograms_wait_obj_; + /*! \LGBM_CUDA: CUDA Asynchronous waiting object for copying indices */ + //cudaEvent_t indices_future_; + std::vector indices_future_; + /*! \LGBM_CUDA: Asynchronous waiting object for copying gradients */ + //cudaEvent_t gradients_future_; + std::vector gradients_future_; + /*! \LGBM_CUDA: Asynchronous waiting object for copying hessians */ + //cudaEvent_t hessians_future_; + std::vector hessians_future_; + // LGBM_CUDA:\brief Asynchronous waiting object for copying dense features + //cudaEvent_t features_future_; + std::vector features_future_; + + // LGBM_CUDA: use subset of training data for bagging + bool is_use_subset_; + + // LGBM_CUDA: host-side buffer for converting feature data into featre4 data + //std::vector host_vecs_; + int nthreads_; // number of Feature4* vector on host4_vecs_ + //cudaEvent_t kernel_start_; // event for kernel start + std::vector kernel_start_; + std::vector kernel_time_; // measure histogram kernel time + std::vector> kernel_input_wait_time_; + int num_gpu_; + int allocated_num_data_; // allocated data instances + pthread_t **cpu_threads_; // pthread, 1 cpu thread / gpu +}; + +} // namespace LightGBM +#else // USE_CUDA + +// When GPU support is not compiled in, quit with an error message + +namespace LightGBM { + +class CUDATreeLearner: public SerialTreeLearner { +public: + #pragma warning(disable : 4702) + explicit CUDATreeLearner(const Config* tree_config) : SerialTreeLearner(tree_config) { + Log::Fatal("CUDA Tree Learner was not enabled in this build.\n" + "Please recompile with CMake option -DUSE_CUDA=1"); + } +}; + +} + +#endif //USE_CUDA +#endif // LGBM_TREELEARNER_CUDA_TREE_LEARNER_H_ diff --git a/src/treelearner/kernels/histogram256.cu b/src/treelearner/kernels/histogram256.cu new file mode 100644 index 00000000000..5d659f8e2cf --- /dev/null +++ b/src/treelearner/kernels/histogram256.cu @@ -0,0 +1,372 @@ +/* + * ibmGBT: IBM CUDA Accelerated LightGBM + * + * IBM Confidential + * (C) Copyright IBM Corp. 2019. All Rights Reserved. + * + * The source code for this program is not published or otherwise + * divested of its trade secrets, irrespective of what has been + * deposited with the U.S. Copyright Office. + * + * US Government Users Restricted Rights - Use, duplication or + * disclosure restricted by GSA ADP Schedule Contract with IBM Corp. + */ + +#include "histogram256.hu" +#include "stdio.h" + +#define PRINT(b,t,fmt,...) 
\ +if (b == gtid && t == ltid) { \ + printf(fmt, __VA_ARGS__); \ +} + + +#ifdef ENABLE_ALL_FEATURES +#ifdef IGNORE_INDICES +#define KERNEL_NAME histogram256_fulldata +#else // IGNORE_INDICES +#define KERNEL_NAME histogram256 // seems like ENABLE_ALL_FEATURES is set to 1 in the header if its disabled +//#define KERNEL_NAME histogram256_allfeats +#endif // IGNORE_INDICES +#else // ENABLE_ALL_FEATURES +#error "ENABLE_ALL_FEATURES should always be 1" +#define KERNEL_NAME histogram256 +#endif // ENABLE_ALL_FEATURES + + +// atomic add for float number in local memory +inline __device__ void atomic_local_add_f(acc_type *addr, const float val) +{ + atomicAdd(addr, static_cast(val)); +} + +// this function will be called by histogram256 +// we have one sub-histogram of one feature in local memory, and need to read others +inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ feature_sub_hist, + const uint skip_id, + const uint old_val_cont_bin0, + const ushort num_sub_hist, + acc_type* __restrict__ output_buf, + acc_type* __restrict__ local_hist, + const size_t power_feature_workgroups) { + const ushort ltid = threadIdx.x; + // TODO: try to avoid bank conflict here + acc_type grad_bin = local_hist[ltid * 2]; + acc_type hess_bin = local_hist[ltid * 2 + 1]; + uint* __restrict__ local_cnt = (uint *)(local_hist + 2 * NUM_BINS); + + uint cont_bin; + if (power_feature_workgroups != 0) { + cont_bin = ltid ? local_cnt[ltid] : old_val_cont_bin0; + } else { + cont_bin = local_cnt[ltid]; + } + ushort i; + + if (power_feature_workgroups != 0) { + // add all sub-histograms for feature + const acc_type* __restrict__ p = feature_sub_hist + ltid; + for (i = 0; i < skip_id; ++i) { + grad_bin += *p; p += NUM_BINS; + hess_bin += *p; p += NUM_BINS; + cont_bin += as_acc_int_type(*p); p += NUM_BINS; + } + + // skip the counters we already have + p += 3 * NUM_BINS; + + for (i = i + 1; i < num_sub_hist; ++i) { + grad_bin += *p; p += NUM_BINS; + hess_bin += *p; p += NUM_BINS; + cont_bin += as_acc_int_type(*p); p += NUM_BINS; + } + } + __syncthreads(); + + + output_buf[ltid * 3 + 0] = grad_bin; + output_buf[ltid * 3 + 1] = hess_bin; + output_buf[ltid * 3 + 2] = as_acc_type((acc_int_type)cont_bin); +} + +#if USE_CONSTANT_BUF == 1 +__kernel void KERNEL_NAME(__global const uchar* restrict feature_data_base, + __constant const uchar* restrict feature_masks __attribute__((max_constant_size(65536))), + const data_size_t feature_size, + __constant const data_size_t* restrict data_indices __attribute__((max_constant_size(65536))), + const data_size_t num_data, + __constant const score_t* restrict ordered_gradients __attribute__((max_constant_size(65536))), +#if CONST_HESSIAN == 0 + __constant const score_t* restrict ordered_hessians __attribute__((max_constant_size(65536))), +#else + const score_t const_hessian, +#endif + char* __restrict__ output_buf, + volatile int * sync_counters, + acc_type* __restrict__ hist_buf_base, + const size_t power_feature_workgroups) { +#else +__global__ void KERNEL_NAME(const uchar* feature_data_base, + // FIXME: how to handle this __constant + const uchar* __restrict__ feature_masks, + const data_size_t feature_size, + const data_size_t* data_indices, + const data_size_t num_data, + const score_t* ordered_gradients, +#if CONST_HESSIAN == 0 + const score_t* ordered_hessians, +#else + const score_t const_hessian, +#endif + char* __restrict__ output_buf, + volatile int * sync_counters, + acc_type* __restrict__ hist_buf_base, + const size_t power_feature_workgroups) { +#endif + 
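+    // Kernel outline (summary of the code below):
+    //  * each block of LOCAL_SIZE_0 (= 256) threads accumulates a private
+    //    gradient/hessian/count histogram for one feature in shared memory;
+    //  * 2^power_feature_workgroups blocks cooperate on each feature, and the
+    //    last block to finish (tracked through sync_counters) merges the
+    //    per-block sub-histograms in within_kernel_reduction256x4();
+    //  * the shared-memory budget is LOCAL_MEM_SIZE =
+    //    (sizeof(uint) + 2 * sizeof(acc_type)) * NUM_BINS, i.e. 3 KB with
+    //    single-precision accumulators and 5 KB when USE_DP_FLOAT selects double.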
// allocate the local memory array aligned with float2, to guarantee correct alignment on NVIDIA platforms + // otherwise a "Misaligned Address" exception may occur + __shared__ float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)]; + const uint gtid = blockIdx.x * blockDim.x + threadIdx.x; + const ushort ltid = threadIdx.x; + const ushort lsize = LOCAL_SIZE_0; // get_local_size(0); + const ushort group_id = blockIdx.x; + + // local memory per workgroup is 3 KB + // clear local memory + uint *ptr = (uint *) shared_array; + for (int i = ltid; i < LOCAL_MEM_SIZE/sizeof(uint); i += lsize) { + ptr[i] = 0; + } + __syncthreads(); + // gradient/hessian histograms + // assume this starts at 32 * 4 = 128-byte boundary // LGBM_CUDA: What does it mean? boundary?? + // total size: 2 * 256 * size_of(float) = 2 KB + // organization: each feature/grad/hessian is at a different bank, + // as indepedent of the feature value as possible + acc_type *gh_hist = (acc_type *)shared_array; + + // counter histogram + // total size: 256 * size_of(uint) = 1 KB + uint *cnt_hist = (uint *)(gh_hist + 2 * NUM_BINS); + + // odd threads (1, 3, ...) compute histograms for hessians first + // even thread (0, 2, ...) compute histograms for gradients first + // etc. + uchar is_hessian_first = ltid & 1; + + ushort feature_id = group_id >> power_feature_workgroups; + + // each 2^POWER_FEATURE_WORKGROUPS workgroups process on one feature (compile-time constant) + // feature_size is the number of examples per feature + const uchar *feature_data = feature_data_base + feature_id * feature_size; + + // size of threads that process this feature4 + const uint subglobal_size = lsize * (1 << power_feature_workgroups); + + // equavalent thread ID in this subgroup for this feature4 + const uint subglobal_tid = gtid - feature_id * subglobal_size; + + + data_size_t ind; + data_size_t ind_next; + #ifdef IGNORE_INDICES + ind = subglobal_tid; + #else + ind = data_indices[subglobal_tid]; + #endif + + // extract feature mask, when a byte is set to 0, that feature is disabled + uchar feature_mask = feature_masks[feature_id]; + // exit if the feature is masked + if (!feature_mask) { + return; + } else { + feature_mask = feature_mask - 1; // LGBM_CUDA: feature_mask is used for get feature (1: 4bit feature, 0: 8bit feature) + } + + // STAGE 1: read feature data, and gradient and hessian + // first half of the threads read feature data from global memory + // We will prefetch data into the "next" variable at the beginning of each iteration + uchar feature; + uchar feature_next; + //uint8_t bin; + ushort bin; + + feature = feature_data[ind >> feature_mask]; + if (feature_mask) { + feature = (feature >> ((ind & 1) << 2)) & 0xf; + } + bin = feature; + acc_type grad_bin = 0.0f, hess_bin = 0.0f; + acc_type *addr_bin; + + // store gradient and hessian + score_t grad, hess; + score_t grad_next, hess_next; + // LGBM_CUDA v5.1 + grad = ordered_gradients[ind]; + #if CONST_HESSIAN == 0 + hess = ordered_hessians[ind]; + #endif + + + // there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4 + for (uint i = subglobal_tid; i < num_data; i += subglobal_size) { + // prefetch the next iteration variables + // we don't need bondary check because we have made the buffer large + #ifdef IGNORE_INDICES + // we need to check to bounds here + ind_next = i + subglobal_size < num_data ? 
i + subglobal_size : i; + #else + ind_next = data_indices[i + subglobal_size]; + #endif + + // imbGBT v5.1 + grad_next = ordered_gradients[ind_next]; + #if CONST_HESSIAN == 0 + hess_next = ordered_hessians[ind_next]; + #endif + + // STAGE 2: accumulate gradient and hessian + if (bin != feature) { + addr_bin = gh_hist + bin * 2 + is_hessian_first; + #if CONST_HESSIAN == 0 + acc_type acc_bin = is_hessian_first? hess_bin : grad_bin; + atomic_local_add_f(addr_bin, acc_bin); + + addr_bin = addr_bin + 1 - 2 * is_hessian_first; + acc_bin = is_hessian_first? grad_bin : hess_bin; + atomic_local_add_f(addr_bin, acc_bin); + + #elif CONST_HESSIAN == 1 + atomic_local_add_f(addr_bin, grad_bin); + #endif + + bin = feature; + grad_bin = grad; + hess_bin = hess; + } + else { + grad_bin += grad; + hess_bin += hess; + } + + // prefetch the next iteration variables + feature_next = feature_data[ind_next >> feature_mask]; + + // STAGE 3: accumulate counter + atomicAdd(cnt_hist + feature, 1); + + // STAGE 4: update next stat + grad = grad_next; + hess = hess_next; + // LGBM_CUDA: v4.2 + if (!feature_mask) { + feature = feature_next; + } else { + feature = (feature_next >> ((ind_next & 1) << 2)) & 0xf; + } + } + + + addr_bin = gh_hist + bin * 2 + is_hessian_first; + #if CONST_HESSIAN == 0 + acc_type acc_bin = is_hessian_first? hess_bin : grad_bin; + atomic_local_add_f(addr_bin, acc_bin); + + addr_bin = addr_bin + 1 - 2 * is_hessian_first; + acc_bin = is_hessian_first? grad_bin : hess_bin; + atomic_local_add_f(addr_bin, acc_bin); + + #elif CONST_HESSIAN == 1 + atomic_local_add_f(addr_bin, grad_bin); + #endif + __syncthreads(); + + #if CONST_HESSIAN == 1 + // make a final reduction + gh_hist[ltid * 2] += gh_hist[ltid * 2 + 1]; + gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // LGBM_CUDA: counter move to this position + __syncthreads(); + #endif + +#if POWER_FEATURE_WORKGROUPS != 0 + acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 3 * NUM_BINS; + // write gradients and hessians + acc_type *__restrict__ ptr_f = output; + for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { + // even threads read gradients, odd threads read hessians + // FIXME: 2-way bank conflict + acc_type value = gh_hist[i]; + ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; + } + // write counts + acc_int_type *__restrict__ ptr_i = (acc_int_type *)(output + 2 * NUM_BINS); + for (ushort i = ltid; i < NUM_BINS; i += lsize) { + // FIXME: 2-way bank conflict + uint value = cnt_hist[i]; + ptr_i[i] = value; + } + // FIXME: is this right + __syncthreads(); + __threadfence(); + // To avoid the cost of an extra reducting kernel, we have to deal with some + // gray area in OpenCL. We want the last work group that process this feature to + // make the final reduction, and other threads will just quit. + // This requires that the results written by other workgroups available to the + // last workgroup (memory consistency) + #if NVIDIA == 1 + // this is equavalent to CUDA __threadfence(); + // ensure the writes above goes to main memory and other workgroups can see it + asm volatile("{\n\tmembar.gl;\n\t}\n\t" :::"memory"); + #else + // FIXME: how to do the above on AMD GPUs?? + // GCN ISA says that the all writes will bypass L1 cache (write through), + // however when the last thread is reading sub-histogram data we have to + // make sure that no part of data is modified in local L1 cache of other workgroups. + // Otherwise reading can be a problem (atomic operations to get consistency). 
+ // But in our case, the sub-histogram of this workgroup cannot be in the cache + // of another workgroup, so the following trick will work just fine. + #endif + // Now, we want one workgroup to do the final reduction. + // Other workgroups processing the same feature quit. + // The is done by using an global atomic counter. + // On AMD GPUs ideally this should be done in GDS, + // but currently there is no easy way to access it via OpenCL. + uint * counter_val = cnt_hist; + // backup the old value + uint old_val = *counter_val; + if (ltid == 0) { + // all workgroups processing the same feature add this counter + *counter_val = atomicAdd(const_cast(sync_counters + feature_id), 1); + } + // make sure everyone in this workgroup is here + __syncthreads(); + // everyone in this wrokgroup: if we are the last workgroup, then do reduction! + if (*counter_val == (1 << power_feature_workgroups) - 1) { + if (ltid == 0) { + sync_counters[feature_id] = 0; + } + //} + #else + } + // only 1 work group, no need to increase counter + // the reduction will become a simple copy + if (1) { + uint old_val; // dummy + #endif + // locate our feature's block in output memory + uint output_offset = (feature_id << power_feature_workgroups); + acc_type const * __restrict__ feature_subhists = + (acc_type *)output_buf + output_offset * 3 * NUM_BINS; + // skip reading the data already in local memory + //uint skip_id = feature_id ^ output_offset; + uint skip_id = group_id - output_offset; + // locate output histogram location for this feature4 + acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 3 * NUM_BINS; + + + within_kernel_reduction256x4(feature_subhists, skip_id, old_val, 1 << power_feature_workgroups, hist_buf, (acc_type *)shared_array, power_feature_workgroups); + } +} + diff --git a/src/treelearner/kernels/histogram256.hu b/src/treelearner/kernels/histogram256.hu new file mode 100644 index 00000000000..145a85367f0 --- /dev/null +++ b/src/treelearner/kernels/histogram256.hu @@ -0,0 +1,179 @@ +/* + * ibmGBT: IBM CUDA Accelerated LightGBM + * + * IBM Confidential + * (C) Copyright IBM Corp. 2019. All Rights Reserved. + * + * The source code for this program is not published or otherwise + * divested of its trade secrets, irrespective of what has been + * deposited with the U.S. Copyright Office. + * + * US Government Users Restricted Rights - Use, duplication or + * disclosure restricted by GSA ADP Schedule Contract with IBM Corp. 
+ */ + +#ifndef _HISTOGRAM_256_KERNEL_ +#define _HISTOGRAM_256_KERNEL_ + +//#pragma once + +// use double precision or not +#ifndef USE_DP_FLOAT +#define USE_DP_FLOAT 1 +#endif + +// ignore hessian, and use the local memory for hessian as an additional bank for gradient +#ifndef CONST_HESSIAN +#define CONST_HESSIAN 0 +#endif + +typedef unsigned char uchar; + +template +__device__ double as_double(const T t) { + static_assert(sizeof(T) == sizeof(double), "size mismatch"); + double d; + memcpy(&d, &t, sizeof(T)); + return d; +} +template +__device__ ulong as_ulong(const T t) { + static_assert(sizeof(T) == sizeof(ulong), "size mismatch"); + ulong u; + memcpy(&u, &t, sizeof(T)); + return u; +} +template +__device__ float as_float(const T t) { + static_assert(sizeof(T) == sizeof(float), "size mismatch"); + float f; + memcpy(&f, &t, sizeof(T)); + return f; +} +template +__device__ uint as_uint(const T t) { + static_assert(sizeof(T) == sizeof(uint), "size_mismatch"); + uint u; + memcpy(&u, &t, sizeof(T)); + return u; +} +template +__device__ uchar4 as_uchar4(const T t) { + static_assert(sizeof(T) == sizeof(uchar4), "size mismatch"); + uchar4 u; + memcpy(&u, &t, sizeof(T)); + return u; +} + + +#define LOCAL_SIZE_0 256 +#define NUM_BINS 256 +#if USE_DP_FLOAT == 1 +typedef double acc_type; +typedef ulong acc_int_type; +#define as_acc_type as_double +#define as_acc_int_type as_ulong +#else +typedef float acc_type; +typedef uint acc_int_type; +#define as_acc_type as_float +#define as_acc_int_type as_uint +#endif +//#define LOCAL_MEM_SIZE (4 * (sizeof(uint) + 2 * sizeof(acc_type)) * NUM_BINS) +#define LOCAL_MEM_SIZE ((sizeof(uint) + 2 * sizeof(acc_type)) * NUM_BINS) + +// unroll the atomic operation for a few times. Takes more code space, +// but compiler can generate better code for faster atomics. 
+#define UNROLL_ATOMIC 1 + +// Options passed by compiler at run time: +// IGNORE_INDICES will be set when the kernel does not +//#define IGNORE_INDICES +//#define POWER_FEATURE_WORKGROUPS 10 + +// detect Nvidia platforms +#ifdef cl_nv_pragma_unroll +#define NVIDIA 1 +#endif + +// use all features and do not use feature mask +#ifndef ENABLE_ALL_FEATURES +#define ENABLE_ALL_FEATURES 1 +#endif + +// use binary patching for AMD GCN 1.2 or newer +#ifndef AMD_USE_DS_ADD_F32 +#define AMD_USE_DS_ADD_F32 0 +#endif + +typedef uint data_size_t; +typedef float score_t; + + +// define all of the different kernels + +#define DECLARE_CONST_BUF(name) \ +__global__ void name(__global const uchar* restrict feature_data_base, \ + const uchar* restrict feature_masks,\ + const data_size_t feature_size,\ + const data_size_t* restrict data_indices, \ + const data_size_t num_data, \ + const score_t* restrict ordered_gradients, \ + const score_t* restrict ordered_hessians,\ + char* __restrict__ output_buf,\ + volatile int * sync_counters,\ + acc_type* __restrict__ hist_buf_base, \ + const size_t power_feature_workgroups); + + +#define DECLARE_CONST_HES_CONST_BUF(name) \ +__global__ void name(const uchar* __restrict__ feature_data_base, \ + const uchar* __restrict__ feature_masks,\ + const data_size_t feature_size,\ + const data_size_t* __restrict__ data_indices, \ + const data_size_t num_data, \ + const score_t* __restrict__ ordered_gradients, \ + const score_t const_hessian,\ + char* __restrict__ output_buf,\ + volatile int * sync_counters,\ + acc_type* __restrict__ hist_buf_base, \ + const size_t power_feature_workgroups); + + + +#define DECLARE_CONST_HES(name) \ +__global__ void name(const uchar* feature_data_base, \ + const uchar* __restrict__ feature_masks,\ + const data_size_t feature_size,\ + const data_size_t* data_indices, \ + const data_size_t num_data, \ + const score_t* ordered_gradients, \ + const score_t const_hessian,\ + char* __restrict__ output_buf, \ + volatile int * sync_counters,\ + acc_type* __restrict__ hist_buf_base, \ + const size_t power_feature_workgroups); + + +#define DECLARE(name) \ +__global__ void name(const uchar* feature_data_base, \ + const uchar* __restrict__ feature_masks,\ + const data_size_t feature_size,\ + const data_size_t* data_indices, \ + const data_size_t num_data, \ + const score_t* ordered_gradients, \ + const score_t* ordered_hessians,\ + char* __restrict__ output_buf, \ + volatile int * sync_counters,\ + acc_type* __restrict__ hist_buf_base, \ + const size_t power_feature_workgroups); + + +DECLARE_CONST_HES(histogram256_allfeats); +DECLARE_CONST_HES(histogram256_fulldata); +DECLARE_CONST_HES(histogram256); +DECLARE(histogram256_allfeats); +DECLARE(histogram256_fulldata); +DECLARE(histogram256); + +#endif // _HITOGRAM_256_KERNEL_ From fc981bf68e13dea264a95cfa4c081dd90b1faf5d Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Mon, 30 Mar 2020 17:08:38 +0000 Subject: [PATCH 003/119] Initial CUDA work --- src/boosting/gbdt.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 5f7aac08640..a10f77f574d 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -276,7 +276,7 @@ void GBDT::Bagging(int iter) { tmp_subset_->CopySubset(train_data_, bag_data_indices_.data(), bag_data_cnt_, false); - tree_learner_->ResetTrainingData(tmp_subset_.get()); + tree_learner_->ResetTrainingData(tmp_subset_.get(), is_constant_hessian_ ); } } } From 581ce4a5fa38b3aa4b8bd5c5076716efe974de38 Mon Sep 17 00:00:00 2001 
From: Gordon Fossum Date: Mon, 30 Mar 2020 17:35:27 +0000 Subject: [PATCH 004/119] Initial CUDA work --- src/treelearner/parallel_tree_learner.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/treelearner/parallel_tree_learner.h b/src/treelearner/parallel_tree_learner.h index 35ac432eba3..2fdf542d421 100644 --- a/src/treelearner/parallel_tree_learner.h +++ b/src/treelearner/parallel_tree_learner.h @@ -14,6 +14,7 @@ #include "gpu_tree_learner.h" #include "serial_tree_learner.h" +#include "cuda_tree_learner.h" namespace LightGBM { From 3f98c73e80b55721e08868d4718a1b9c7eb1a5e8 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Mon, 30 Mar 2020 21:18:25 +0000 Subject: [PATCH 005/119] Initial CUDA work --- src/boosting/gbdt.cpp | 12 ++++++++++-- src/io/dense_nbits_bin.hpp | 2 +- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index a10f77f574d..58a1976d96b 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -96,7 +96,15 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective tree_learner_ = std::unique_ptr(TreeLearner::CreateTreeLearner(config_->tree_learner, config_->device_type, config_.get())); // init tree learner - tree_learner_->Init(train_data_, is_constant_hessian_); + // LGBM_CUDA do not copy feature is is_use_subset for initialization + // LGBM_CUDA initialize training data info with bagging data size (tmp_subset_) + + if (config_->device_type == std::string("cuda")) { + tree_learner_->Init(tmp_subset_.get(), is_constant_hessian_, is_use_subset_); + } else { + tree_learner_->Init(train_data_, is_constant_hessian_, is_use_subset_); + } + tree_learner_->SetForcedSplit(&forced_splits_json_); // push training metrics @@ -274,7 +282,7 @@ void GBDT::Bagging(int iter) { tmp_hessians_.resize(bag_gh_size); } - tmp_subset_->CopySubset(train_data_, bag_data_indices_.data(), bag_data_cnt_, false); + tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(), bag_data_cnt_, false); tree_learner_->ResetTrainingData(tmp_subset_.get(), is_constant_hessian_ ); } diff --git a/src/io/dense_nbits_bin.hpp b/src/io/dense_nbits_bin.hpp index adf99115626..89b9159b390 100644 --- a/src/io/dense_nbits_bin.hpp +++ b/src/io/dense_nbits_bin.hpp @@ -346,7 +346,7 @@ class Dense4bitsBin : public Bin { } } - void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override { + void CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override { auto other_bin = dynamic_cast(full_bin); const data_size_t rest = num_used_indices & 1; for (int i = 0; i < num_used_indices - rest; i += 2) { From 2ec9a412dcfb13d72e87f93611b2c554b665eaad Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Mon, 30 Mar 2020 22:07:12 +0000 Subject: [PATCH 006/119] Initial CUDA work --- include/LightGBM/tree_learner.h | 3 ++- src/boosting/gbdt.cpp | 3 ++- src/boosting/rf.hpp | 2 +- src/treelearner/gpu_tree_learner.h | 3 ++- src/treelearner/serial_tree_learner.cpp | 20 ++++++++++++++------ src/treelearner/serial_tree_learner.h | 9 ++++++--- 6 files changed, 27 insertions(+), 13 deletions(-) diff --git a/include/LightGBM/tree_learner.h b/include/LightGBM/tree_learner.h index 3bc246e8426..6c549a5ed71 100644 --- a/include/LightGBM/tree_learner.h +++ b/include/LightGBM/tree_learner.h @@ -54,9 +54,10 @@ class TreeLearner { * \brief training tree model on dataset * \param gradients The first order gradients * \param hessians The second order gradients + * 
\param is_constant_hessian True if all hessians share the same value * \return A trained tree */ - virtual Tree* Train(const score_t* gradients, const score_t* hessians) = 0; + virtual Tree* Train(const score_t* gradients, const score_t* hessians, bool is_constant_hessian, Json& forced_split_json) = 0; /*! * \brief use an existing tree to fit the new gradients and hessians. diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 58a1976d96b..a4f653bfa54 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -546,7 +546,8 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { grad = gradients_.data() + offset; hess = hessians_.data() + offset; } - new_tree.reset(tree_learner_->Train(grad, hess)); + // LGBM_CUDA + new_tree.reset(tree_learner_->Train(grad, hess, is_constant_hessian_, forced_splits_json_)); } if (new_tree->num_leaves() > 1) { diff --git a/src/boosting/rf.hpp b/src/boosting/rf.hpp index 5c90202a515..e64bf6cb4d8 100644 --- a/src/boosting/rf.hpp +++ b/src/boosting/rf.hpp @@ -125,7 +125,7 @@ class RF : public GBDT { hess = tmp_hess_.data(); } - new_tree.reset(tree_learner_->Train(grad, hess)); + new_tree.reset(tree_learner_->Train(grad, hess, is_constant_hessian_, forced_splits_json_)); } if (new_tree->num_leaves() > 1) { diff --git a/src/treelearner/gpu_tree_learner.h b/src/treelearner/gpu_tree_learner.h index a909c57cbad..ba48f030441 100644 --- a/src/treelearner/gpu_tree_learner.h +++ b/src/treelearner/gpu_tree_learner.h @@ -48,7 +48,8 @@ class GPUTreeLearner: public SerialTreeLearner { void Init(const Dataset* train_data, bool is_constant_hessian) override; void ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) override; void ResetIsConstantHessian(bool is_constant_hessian) override; - Tree* Train(const score_t* gradients, const score_t *hessians) override; + Tree* Train(const score_t* gradients, const score_t *hessians, + bool is_constant_hessian, Json& forced_split_json) override; void SetBaggingData(const Dataset* subset, const data_size_t* used_indices, data_size_t num_data) override { SerialTreeLearner::SetBaggingData(subset, used_indices, num_data); diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 6b02411127a..5d2b9afff50 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -30,6 +30,7 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian train_data_ = train_data; num_data_ = train_data_->num_data(); num_features_ = train_data_->num_features(); + is_constant_hessian_ = is_constant_hessian; int max_cache_size = 0; // Get the max size of pool if (config_->histogram_pool_size <= 0) { @@ -148,10 +149,11 @@ void SerialTreeLearner::ResetConfig(const Config* config) { constraints_.reset(LeafConstraintsBase::Create(config_, config_->num_leaves)); } -Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians) { +Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians, bool is_constant_hessian, Json& forced_split_json) { Common::FunctionTimer fun_timer("SerialTreeLearner::Train", global_timer); gradients_ = gradients; hessians_ = hessians; + is_constant_hessian_ = is_constant_hessian; int num_threads = OMP_NUM_THREADS(); if (share_state_->num_threads != num_threads && share_state_->num_threads > 0) { Log::Warning( @@ -175,7 +177,12 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t 
*hessians // only root leaf can be splitted on first time int right_leaf = -1; - int init_splits = ForceSplits(tree_prt, &left_leaf, &right_leaf, &cur_depth); + int init_splits = 0; + bool aborted_last_force_split = false; + if (!forced_split_json.is_null()) { + init_splits = ForceSplits(tree.get(), forced_split_json, &left_leaf, + &right_leaf, &cur_depth, &aborted_last_force_split); + } for (int split = init_splits; split < config_->num_leaves - 1; ++split) { // some initial works before finding best split @@ -434,8 +441,9 @@ void SerialTreeLearner::FindBestSplitsFromHistograms( } } -int32_t SerialTreeLearner::ForceSplits(Tree* tree, int* left_leaf, - int* right_leaf, int *cur_depth) { +int32_t SerialTreeLearner::ForceSplits(Tree* tree, Json& forced_split_json, int* left_leaf, + int* right_leaf, int *cur_depth, + bool *aborted_last_force_split) { bool abort_last_forced_split = false; if (forced_split_json_ == nullptr) { return 0; @@ -444,11 +452,11 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, int* left_leaf, // start at root leaf *left_leaf = 0; std::queue> q; - Json left = *forced_split_json_; + Json left = forced_split_json; Json right; bool left_smaller = true; std::unordered_map forceSplitMap; - q.push(std::make_pair(left, *left_leaf)); + q.push(std::make_pair(forced_split_json, *left_leaf)); while (!q.empty()) { // before processing next node from queue, store info for current left/right leaf // store "best split" for left and right, even if they might be overwritten by forced split diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index fab28542e03..367c262192c 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -79,7 +79,8 @@ class SerialTreeLearner: public TreeLearner { } } - Tree* Train(const score_t* gradients, const score_t *hessians) override; + Tree* Train(const score_t* gradients, const score_t *hessians, bool is_constant_hessian, + Json& forced_split_json) override; Tree* FitByExistingTree(const Tree* old_tree, const score_t* gradients, const score_t* hessians) const override; @@ -162,8 +163,9 @@ class SerialTreeLearner: public TreeLearner { bool update_cnt); /* Force splits with forced_split_json dict and then return num splits forced.*/ - int32_t ForceSplits(Tree* tree, int* left_leaf, int* right_leaf, - int* cur_depth); + virtual int32_t ForceSplits(Tree* tree, Json& forced_split_json, int* left_leaf, + int* right_leaf, int* cur_depth, + bool *aborted_last_force_split); /*! * \brief Get the number of data in a leaf @@ -226,6 +228,7 @@ class SerialTreeLearner: public TreeLearner { const Json* forced_split_json_; std::unique_ptr share_state_; std::unique_ptr cegb_; + bool is_constant_hessian_; }; inline data_size_t SerialTreeLearner::GetGlobalDataCountInLeaf(int leaf_idx) const { From 1023182a5ca5a0699f90477dad7c96c4f234575a Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Tue, 31 Mar 2020 17:10:42 +0000 Subject: [PATCH 007/119] Initial CUDA work --- build_LGBM.232.sh | 3 +-- src/boosting/gbdt.cpp | 2 +- src/io/dense_nbits_bin.hpp | 8 +++---- src/treelearner/cuda_tree_learner.cpp | 32 +++++++++++++-------------- src/treelearner/cuda_tree_learner.h | 12 +++------- 5 files changed, 25 insertions(+), 32 deletions(-) diff --git a/build_LGBM.232.sh b/build_LGBM.232.sh index 24b50c7dfda..5e500327108 100755 --- a/build_LGBM.232.sh +++ b/build_LGBM.232.sh @@ -2,6 +2,5 @@ rm -rf build mkdir build cd build -#cmake -DUSE_CUDA=1 .. -cmake .. +cmake -DUSE_CUDA=1 .. 
make -j40 diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index a4f653bfa54..580c52fd889 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -284,7 +284,7 @@ void GBDT::Bagging(int iter) { tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(), bag_data_cnt_, false); - tree_learner_->ResetTrainingData(tmp_subset_.get(), is_constant_hessian_ ); + tree_learner_->ResetTrainingData(tmp_subset_.get(), is_constant_hessian_); } } } diff --git a/src/io/dense_nbits_bin.hpp b/src/io/dense_nbits_bin.hpp index 89b9159b390..5eb215fad8a 100644 --- a/src/io/dense_nbits_bin.hpp +++ b/src/io/dense_nbits_bin.hpp @@ -75,7 +75,7 @@ class Dense4bitsBin : public Bin { void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data, const score_t* ordered_gradients, const score_t* ordered_hessians, - HistogramBinEntry* out) const override { + hist_t* out) const override { const data_size_t rest = num_data & 0x3; data_size_t i = 0; for (; i < num_data - rest; i += 4) { @@ -118,7 +118,7 @@ class Dense4bitsBin : public Bin { void ConstructHistogram(data_size_t num_data, const score_t* ordered_gradients, const score_t* ordered_hessians, - HistogramBinEntry* out) const override { + hist_t* out) const override { const data_size_t rest = num_data & 0x3; data_size_t i = 0; @@ -153,7 +153,7 @@ class Dense4bitsBin : public Bin { void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data, const score_t* ordered_gradients, - HistogramBinEntry* out) const override { + hist_t* out) const override { const data_size_t rest = num_data & 0x3; data_size_t i = 0; for (; i < num_data - rest; i += 4) { @@ -190,7 +190,7 @@ class Dense4bitsBin : public Bin { void ConstructHistogram(data_size_t num_data, const score_t* ordered_gradients, - HistogramBinEntry* out) const override { + hist_t* out) const override { const data_size_t rest = num_data & 0x3; data_size_t i = 0; for (; i < num_data - rest; i += 4) { diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index c45df55cacf..d2064843ead 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -83,7 +83,7 @@ void CUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, // some functions used for debugging the GPU histogram construction -void PrintHistograms(HistogramBinEntry* h, size_t size) { +void PrintHistograms(hist_t* h, size_t size) { size_t total = 0; for (size_t i = 0; i < size; ++i) { printf("%03lu=%9.3g,%9.3g,%7d\t", i, h[i].sum_gradients, h[i].sum_hessians, h[i].cnt); @@ -104,7 +104,7 @@ union Float_t }; -void CompareHistograms(HistogramBinEntry* h1, HistogramBinEntry* h2, size_t size, int feature_id) { +void CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id) { size_t i; Float_t a, b; @@ -221,7 +221,7 @@ void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featu template -void CUDATreeLearner::WaitAndGetHistograms(HistogramBinEntry* histograms) { +void CUDATreeLearner::WaitAndGetHistograms(hist_t* histograms) { HistType* hist_outputs = (HistType*) host_histogram_outputs_; //#pragma omp parallel for schedule(static, num_gpu_) @@ -321,7 +321,7 @@ void CUDATreeLearner::prevAllocateGPUMemory() { memset(ptr_pinned_feature_masks_, 0, num_dense_feature_groups_); // histogram bin entry size depends on the precision (single/double) - hist_bin_entry_sz_ = config_->gpu_use_dp ? sizeof(HistogramBinEntry) : sizeof(GPUHistogramBinEntry); + hist_bin_entry_sz_ = config_->gpu_use_dp ? 
sizeof(hist_t) : sizeof(gpu_hist_t); // host_size histogram outputs // host_histogram_outputs_ = malloc(num_dense_feature_groups_ * device_bin_size_ * hist_bin_entry_sz_); @@ -591,12 +591,12 @@ Tree* CUDATreeLearner::Train(const score_t* gradients, const score_t *hessians, return ret; } -void CUDATreeLearner::ResetTrainingData(const Dataset* train_data) { +void CUDATreeLearner::ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) { // LGBM_CUDA: check data size data_size_t old_num_data = num_data_; - SerialTreeLearner::ResetTrainingData(train_data); + SerialTreeLearner::ResetTrainingDataInner(train_data, is_constant_hessian, reset_multi_val_bin); #if ResetTrainingData_DEBUG == 1 // LGBM_CUDA serial_time = std::chrono::steady_clock::now() - start_serial_time; @@ -889,7 +889,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ } // construct smaller leaf - HistogramBinEntry* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - 1; + hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - 1; // Check workgroups per feature4 tuple.. int exp_workgroups_per_feature = GetNumWorkgroupsPerFeature(smaller_leaf_splits_->num_data_in_leaf()); @@ -918,11 +918,11 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ if (is_gpu_used) { if (config_->gpu_use_dp) { // use double precision - WaitAndGetHistograms(ptr_smaller_leaf_hist_data); + WaitAndGetHistograms(ptr_smaller_leaf_hist_data); } else { // use single precision - WaitAndGetHistograms(ptr_smaller_leaf_hist_data); + WaitAndGetHistograms(ptr_smaller_leaf_hist_data); } } @@ -936,13 +936,13 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ continue; int dense_feature_group_index = dense_feature_group_map_[i]; size_t size = train_data_->FeatureGroupNumBin(dense_feature_group_index); - HistogramBinEntry* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - 1; - HistogramBinEntry* current_histogram = ptr_smaller_leaf_hist_data + train_data_->GroupBinBoundary(dense_feature_group_index); - HistogramBinEntry* gpu_histogram = new HistogramBinEntry[size]; + hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - 1; + hist_t* current_histogram = ptr_smaller_leaf_hist_data + train_data_->GroupBinBoundary(dense_feature_group_index); + hist_t* gpu_histogram = new hist_t[size]; data_size_t num_data = smaller_leaf_splits_->num_data_in_leaf(); std::copy(current_histogram, current_histogram + size, gpu_histogram); - std::memset(current_histogram, 0, train_data_->FeatureGroupNumBin(dense_feature_group_index) * sizeof(HistogramBinEntry)); + std::memset(current_histogram, 0, train_data_->FeatureGroupNumBin(dense_feature_group_index) * sizeof(hist_t)); if ( num_data == num_data_ ) { if ( is_constant_hessian_ ) { printf("ConstructHistogram(): num_data == num_data_ is_constant_hessian_"); @@ -991,7 +991,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ // construct larger leaf - HistogramBinEntry* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - 1; + hist_t* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - 1; is_gpu_used = ConstructGPUHistogramsAsync(is_feature_used, larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf()); @@ -1013,11 +1013,11 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ if (is_gpu_used) { if (config_->gpu_use_dp) { // 
use double precision - WaitAndGetHistograms(ptr_larger_leaf_hist_data); + WaitAndGetHistograms(ptr_larger_leaf_hist_data); } else { // use single precision - WaitAndGetHistograms(ptr_larger_leaf_hist_data); + WaitAndGetHistograms(ptr_larger_leaf_hist_data); } } } diff --git a/src/treelearner/cuda_tree_learner.h b/src/treelearner/cuda_tree_learner.h index e5a24aeb8f5..8a0e5d7cb20 100644 --- a/src/treelearner/cuda_tree_learner.h +++ b/src/treelearner/cuda_tree_learner.h @@ -39,7 +39,7 @@ class CUDATreeLearner: public SerialTreeLearner { ~CUDATreeLearner(); // LGBM_CUDA: is_use_subset is used by CUDA only void Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) override; - void ResetTrainingData(const Dataset* train_data) override; + void ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) override; Tree* Train(const score_t* gradients, const score_t *hessians, bool is_constant_hessian, Json& forced_split_json) override; @@ -68,13 +68,7 @@ class CUDATreeLearner: public SerialTreeLearner { // uint8_t s[4]; //}; - /*! \brief Single precision histogram entry for GPU */ - struct GPUHistogramBinEntry { - score_t sum_gradients; - score_t sum_hessians; - uint32_t cnt; - }; - + typedef float gpu_hist_t; /*! * \brief Find the best number of workgroups processing one feature for maximizing efficiency @@ -158,7 +152,7 @@ class CUDATreeLearner: public SerialTreeLearner { * \param histograms Destination of histogram results from GPU. */ template - void WaitAndGetHistograms(HistogramBinEntry* histograms); + void WaitAndGetHistograms(hist_t* histograms); /*! * \brief Construct GPU histogram asynchronously. From e50e4be629b064306b9771073e6f435e5e225da1 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Tue, 31 Mar 2020 18:18:43 +0000 Subject: [PATCH 008/119] Initial CUDA work --- src/treelearner/cuda_tree_learner.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/treelearner/cuda_tree_learner.h b/src/treelearner/cuda_tree_learner.h index 8a0e5d7cb20..93bbdc483b7 100644 --- a/src/treelearner/cuda_tree_learner.h +++ b/src/treelearner/cuda_tree_learner.h @@ -43,11 +43,11 @@ class CUDATreeLearner: public SerialTreeLearner { Tree* Train(const score_t* gradients, const score_t *hessians, bool is_constant_hessian, Json& forced_split_json) override; - void SetBaggingData(const data_size_t* used_indices, data_size_t num_data) override { - SerialTreeLearner::SetBaggingData(used_indices, num_data); + void SetBaggingData(const Dataset* subset, const data_size_t* used_indices, data_size_t num_data) override { + SerialTreeLearner::SetBaggingData(subset, used_indices, num_data); // determine if we are using bagging before we construct the data partition // thus we can start data movement to GPU earlier - if (used_indices != nullptr) { + if (subset == nullptr && used_indices != nullptr) { if (num_data != num_data_) { use_bagging_ = true; return; From 3d6201801911ba041d5ea9bdec55efe3d37cd8e3 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Tue, 31 Mar 2020 18:33:44 +0000 Subject: [PATCH 009/119] Initial CUDA work --- src/io/dense_nbits_bin.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/io/dense_nbits_bin.hpp b/src/io/dense_nbits_bin.hpp index 5eb215fad8a..85ea5b311d2 100644 --- a/src/io/dense_nbits_bin.hpp +++ b/src/io/dense_nbits_bin.hpp @@ -310,7 +310,7 @@ class Dense4bitsBin : public Bin { void* get_data() override { return data_.data(); } /*! 
\brief not ordered bin for dense feature */ - OrderedBin* CreateOrderedBin() const override { return nullptr; } + Bin* CreateDenseBin() const { return nullptr; } void FinishLoad() override { if (buf_.empty()) { return; } From 7a6bf3321365ce8b8b4c354f9cae065956ab0045 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Tue, 31 Mar 2020 22:26:11 +0000 Subject: [PATCH 010/119] Initial CUDA work --- CMakeLists.txt | 81 ++++++++++++++++---------------------------------- 1 file changed, 26 insertions(+), 55 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4d81828b640..104339756ce 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -184,53 +184,30 @@ if(USE_CUDA) message(STATUS ALLFEATS_DEFINES: ${ALLFEATS_DEFINES}) message(STATUS FULLDATA_DEFINES: ${FULLDATA_DEFINES}) - add_library(histo256_sp_const OBJECT src/treelearner/kernels/histogram256.cu) - set_target_properties(histo256_sp_const PROPERTIES CUDA_SEPARABLE_COMPILATION ON) - target_compile_definitions( - histo256_sp_const PRIVATE - -DCONST_HESSIAN=1 - ${BASE_DEFINES} - ) - - add_library(histo256_sp OBJECT src/treelearner/kernels/histogram256.cu) - set_target_properties(histo256_sp PROPERTIES CUDA_SEPARABLE_COMPILATION ON) - target_compile_definitions( - histo256_sp PRIVATE - -DCONST_HESSIAN=0 - ${BASE_DEFINES} - ) - - add_library(histo256-allfeats_sp_const OBJECT src/treelearner/kernels/histogram256.cu) - set_target_properties(histo256-allfeats_sp_const PROPERTIES CUDA_SEPARABLE_COMPILATION ON) - target_compile_definitions( - histo256-allfeats_sp_const PRIVATE - -DCONST_HESSIAN=1 - ${ALLFEATS_DEFINES} - ) + function(add_histogram hsize hname hadd hconst hdir) + add_library(histo${hsize}${hname} OBJECT src/treelearner/kernels/histogram${hsize}.cu) + set_target_properties(histo${hsize}${hname} PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + if(hadd) + list(APPEND histograms histo${hsize}${hname}) + set(histograms ${histograms} PARENT_SCOPE) + endif() + target_compile_definitions( + histo${hsize}${hname} PRIVATE + -DCONST_HESSIAN=${hconst} + ${hdir} + ) + endfunction() + + #foreach (hsize 16 64 256) + foreach (hsize 256) + add_histogram("${hsize}" "_sp_const" "True" "1" "${BASE_DEFINES}") + add_histogram("${hsize}" "_sp" "True" "0" "${BASE_DEFINES}") + add_histogram("${hsize}" "-allfeats_sp_const" "False" "1" "${ALLFEATS_DEFINES}") + add_histogram("${hsize}" "-allfeats_sp" "False" "0" "${ALLFEATS_DEFINES}") + add_histogram("${hsize}" "-fulldata_sp_const" "True" "1" "${FULLDATA_DEFINES}") + add_histogram("${hsize}" "-fulldata_sp" "True" "0" "${FULLDATA_DEFINES}") + endforeach() - add_library(histo256-allfeats_sp OBJECT src/treelearner/kernels/histogram256.cu) - set_target_properties(histo256-allfeats_sp PROPERTIES CUDA_SEPARABLE_COMPILATION ON) - target_compile_definitions( - histo256-allfeats_sp PRIVATE - -DCONST_HESSIAN=0 - ${ALLFEATS_DEFINES} - ) - - add_library(histo256-fulldata_sp_const OBJECT src/treelearner/kernels/histogram256.cu) - set_target_properties(histo256-fulldata_sp_const PROPERTIES CUDA_SEPARABLE_COMPILATION ON) - target_compile_definitions( - histo256-fulldata_sp_const PRIVATE - -DCONST_HESSIAN=1 - ${FULLDATA_DEFINES} - ) - - add_library(histo256-fulldata_sp OBJECT src/treelearner/kernels/histogram256.cu) - set_target_properties(histo256-fulldata_sp PROPERTIES CUDA_SEPARABLE_COMPILATION ON) - target_compile_definitions( - histo256-fulldata_sp PRIVATE - -DCONST_HESSIAN=0 - ${FULLDATA_DEFINES} - ) endif(USE_CUDA) if(USE_HDFS) @@ -340,7 +317,7 @@ file(GLOB SOURCES src/network/*.cpp src/treelearner/*.cpp #ifdef USE_CUDA 
- src/treelearner/*cu + src/treelearner/*.cu #endif ) @@ -417,17 +394,11 @@ endif(USE_GPU) if(USE_CUDA) TARGET_LINK_LIBRARIES( lightgbm - histo256_sp_const - histo256_sp - histo256-fulldata_sp_const - histo256-fulldata_sp + ${histograms} ) TARGET_LINK_LIBRARIES( _lightgbm - histo256_sp_const - histo256_sp - histo256-fulldata_sp_const - histo256-fulldata_sp + ${histograms} ) endif(USE_CUDA) From 01b32269366888980303eb43b0edb2c2245e284c Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Wed, 1 Apr 2020 16:50:09 +0000 Subject: [PATCH 011/119] Initial CUDA work --- CMakeLists.txt | 11 ++--------- python-package/setup.py | 21 ++++++++++++++++++--- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 104339756ce..dc87f65bd67 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,3 @@ -#LGBM_CUDA Added USE_CUDA flag if(USE_GPU OR APPLE) cmake_minimum_required(VERSION 3.2) elseif(USE_CUDA) @@ -8,7 +7,6 @@ else() cmake_minimum_required(VERSION 2.8) endif() -#LGBM_CUDA if(USE_CUDA) PROJECT(lightgbm LANGUAGES C CXX CUDA) else() @@ -23,7 +21,8 @@ OPTION(USE_GPU "Enable GPU-accelerated training" OFF) OPTION(USE_SWIG "Enable SWIG to generate Java API" OFF) OPTION(USE_HDFS "Enable HDFS support (EXPERIMENTAL)" OFF) OPTION(USE_R35 "Set to ON if your R version is not earlier than 3.5" OFF) -OPTION(USE_CUDA "Enable CUDA-accelerated training" OFF) # LGBM_CUDA +OPTION(USE_TIMETAG "Set to ON to output time costs" OFF) +OPTION(USE_CUDA "Enable CUDA-accelerated training" OFF) OPTION(USE_DEBUG "Set to ON for Debug mode" OFF) OPTION(BUILD_FOR_R "Set to ON if building lib_lightgbm for use with the R package" OFF) @@ -138,7 +137,6 @@ if(USE_GPU) ADD_DEFINITIONS(-DUSE_GPU) endif(USE_GPU) -#LGBM_CUDA CUDA-specific code if(USE_CUDA) find_package(CUDA REQUIRED) include_directories(${CUDA_INCLUDE_DIRS}) @@ -166,12 +164,10 @@ if(USE_CUDA) -DPOWER_FEATURE_WORKGROUPS=12 -DUSE_CONSTANT_BUF=0 ) - set(ALLFEATS_DEFINES ${BASE_DEFINES} -DENABLE_ALL_FEATURES ) - set(FULLDATA_DEFINES ${ALLFEATS_DEFINES} -DIGNORE_INDICES @@ -180,7 +176,6 @@ if(USE_CUDA) #string(REPLACE ";" " " BASE_DEFINES "${BASE_DEFINES}") #string(REPLACE ";" " " ALLFEATS_DEFINES "${ALLFEATS_DEFINES}") #string(REPLACE ";" " " FULLDATA_DEFINES "${FULLDATA_DEFINES}") - message(STATUS ALLFEATS_DEFINES: ${ALLFEATS_DEFINES}) message(STATUS FULLDATA_DEFINES: ${FULLDATA_DEFINES}) @@ -307,7 +302,6 @@ if(USE_MPI) include_directories(${MPI_CXX_INCLUDE_PATH}) endif(USE_MPI) -#LGBM_CUDA file(GLOB SOURCES src/application/*.cpp src/boosting/*.cpp @@ -390,7 +384,6 @@ if(USE_GPU) TARGET_LINK_LIBRARIES(_lightgbm ${OpenCL_LIBRARY} ${Boost_LIBRARIES}) endif(USE_GPU) -#LGBM_CUDA if(USE_CUDA) TARGET_LINK_LIBRARIES( lightgbm diff --git a/python-package/setup.py b/python-package/setup.py index 9d8853ddf94..f66475cb27d 100644 --- a/python-package/setup.py +++ b/python-package/setup.py @@ -86,10 +86,11 @@ def silent_call(cmd, raise_error=False, error_msg=''): return 1 -def compile_cpp(use_mingw=False, use_gpu=False, use_mpi=False, +def compile_cpp(use_mingw=False, use_gpu=False, use_cuda=False, use_mpi=False, use_hdfs=False, boost_root=None, boost_dir=None, boost_include_dir=None, boost_librarydir=None, opencl_include_dir=None, opencl_library=None, + openmp_include_dir=None, openmp_library=None, nomp=False, bit32=False): if os.path.exists(os.path.join(CURRENT_DIR, "build_cpp")): @@ -114,6 +115,12 @@ def compile_cpp(use_mingw=False, use_gpu=False, use_mpi=False, cmake_cmd.append("-DOpenCL_INCLUDE_DIR={0}".format(opencl_include_dir)) if 
opencl_library: cmake_cmd.append("-DOpenCL_LIBRARY={0}".format(opencl_library)) + elif use_cuda: + cmake_cmd.append("-DUSE_CUDA=ON") + if openmp_include_dir: + cmake_cmd.append("-DOpenMP_INCLUDE_DIR={0}".format(openmp_include_dir)) + if openmp_library: + cmake_cmd.append("-DOpenMP_LIBRARY={0}".format(openmp_library)) if use_mpi: cmake_cmd.append("-DUSE_MPI=ON") if nomp: @@ -187,6 +194,7 @@ class CustomInstall(install): user_options = install.user_options + [ ('mingw', 'm', 'Compile with MinGW'), ('gpu', 'g', 'Compile GPU version'), + ('cuda', 'c', 'Compile CUDA version'), ('mpi', None, 'Compile MPI version'), ('nomp', None, 'Compile version without OpenMP support'), ('hdfs', 'h', 'Compile HDFS version'), @@ -197,21 +205,27 @@ class CustomInstall(install): ('boost-include-dir=', None, 'Directory containing Boost headers'), ('boost-librarydir=', None, 'Preferred Boost library directory'), ('opencl-include-dir=', None, 'OpenCL include directory'), - ('opencl-library=', None, 'Path to OpenCL library') + ('opencl-library=', None, 'Path to OpenCL library'), + ('openmp-include-dir=', None, 'OpenMP include directory'), + ('openmp-library=', None, 'Path to OpenMP library') ] def initialize_options(self): install.initialize_options(self) self.mingw = 0 self.gpu = 0 + self.cuda = 0 self.boost_root = None self.boost_dir = None self.boost_include_dir = None self.boost_librarydir = None self.opencl_include_dir = None self.opencl_library = None + self.openmp_include_dir = None + self.openmp_library = None self.mpi = 0 self.hdfs = 0 + #self.precompile = 0 #TODO: revert this self.precompile = 1 self.nomp = 0 self.bit32 = 0 @@ -227,10 +241,11 @@ def run(self): open(LOG_PATH, 'wb').close() if not self.precompile: copy_files(use_gpu=self.gpu) - compile_cpp(use_mingw=self.mingw, use_gpu=self.gpu, use_mpi=self.mpi, + compile_cpp(use_mingw=self.mingw, use_gpu=self.gpu, use_cuda=self.cuda, use_mpi=self.mpi, use_hdfs=self.hdfs, boost_root=self.boost_root, boost_dir=self.boost_dir, boost_include_dir=self.boost_include_dir, boost_librarydir=self.boost_librarydir, opencl_include_dir=self.opencl_include_dir, opencl_library=self.opencl_library, + openmp_include_dir=self.openmp_include_dir, openmp_library=self.openmp_library, nomp=self.nomp, bit32=self.bit32) install.run(self) if os.path.isfile(LOG_PATH): From 64dbb6b844d6598d5db83d729e1067a17e39942f Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Thu, 2 Apr 2020 13:09:52 +0000 Subject: [PATCH 012/119] Initial CUDA work --- include/LightGBM/c_api.h | 6 ++ python-package/lightgbm/__init__.py | 3 +- python-package/lightgbm/basic.py | 5 ++ python-package/setup.py | 2 +- src/c_api.cpp | 10 +++ tests/python_package_test/test_basic.py | 2 + tests/python_package_test/test_engine.py | 89 +++++++++++++++++++++++- 7 files changed, 114 insertions(+), 3 deletions(-) diff --git a/include/LightGBM/c_api.h b/include/LightGBM/c_api.h index 9d7c6e61dd2..3fbccdac075 100644 --- a/include/LightGBM/c_api.h +++ b/include/LightGBM/c_api.h @@ -1076,6 +1076,12 @@ LIGHTGBM_C_EXPORT int LGBM_NetworkInitWithFunctions(int num_machines, #define THREAD_LOCAL thread_local /*!< \brief Thread local specifier. */ #endif +/*! + * * \brief Returns device type. + * * \return 0 = CPU, 1 = GPU / OCL, 2 = CUDA + * */ +LIGHTGBM_C_EXPORT int LGBM_GetDeviceType(); + /*! * \brief Handle of error message. 
* \return Error message diff --git a/python-package/lightgbm/__init__.py b/python-package/lightgbm/__init__.py index 390a6994a7a..44a56ae03f5 100644 --- a/python-package/lightgbm/__init__.py +++ b/python-package/lightgbm/__init__.py @@ -5,7 +5,7 @@ """ from __future__ import absolute_import -from .basic import Booster, Dataset +from .basic import Booster, Dataset, get_device_type from .callback import (early_stopping, print_evaluation, record_evaluation, reset_parameter) from .engine import cv, train @@ -30,6 +30,7 @@ __version__ = version_file.read().strip() __all__ = ['Dataset', 'Booster', + 'get_device_type', 'train', 'cv', 'LGBMModel', 'LGBMRegressor', 'LGBMClassifier', 'LGBMRanker', 'print_evaluation', 'record_evaluation', 'reset_parameter', 'early_stopping', diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 01a5f31e51b..9dace6b768c 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -432,6 +432,11 @@ def _load_pandas_categorical(file_name=None, model_str=None): return None +def get_device_type(): + """Get device type.""" + return _LIB.LGBM_GetDeviceType() + + class _InnerPredictor(object): """_InnerPredictor of LightGBM. diff --git a/python-package/setup.py b/python-package/setup.py index f66475cb27d..eca56783713 100644 --- a/python-package/setup.py +++ b/python-package/setup.py @@ -194,7 +194,7 @@ class CustomInstall(install): user_options = install.user_options + [ ('mingw', 'm', 'Compile with MinGW'), ('gpu', 'g', 'Compile GPU version'), - ('cuda', 'c', 'Compile CUDA version'), + ('cuda', None, 'Compile CUDA version'), ('mpi', None, 'Compile MPI version'), ('nomp', None, 'Compile version without OpenMP support'), ('hdfs', 'h', 'Compile HDFS version'), diff --git a/src/c_api.cpp b/src/c_api.cpp index f785bc74f19..03a3db597bb 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -647,6 +647,16 @@ int LGBM_RegisterLogCallback(void (*callback)(const char*)) { API_END(); } +int LGBM_GetDeviceType() { +#ifdef USE_GPU + return 1; +#elif USE_CUDA + return 2; +#else + return 0; // CPU +#endif +} + int LGBM_DatasetCreateFromFile(const char* filename, const char* parameters, const DatasetHandle reference, diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index 85e9e728d70..d984c25f65f 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -29,6 +29,8 @@ def test(self): "max_bin": 255, "gpu_use_dp": True } + if lgb.get_device_type() == 2: + params["device"] = "cuda" bst = lgb.Booster(params, train_data) bst.add_valid(valid_data, "valid_1") diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index dc48fc9d3a3..37894815f4a 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -61,6 +61,8 @@ def test_binary(self): 'verbose': -1, 'num_iteration': 50 # test num_iteration in dict here } + if lgb.get_device_type() == 2: + params["device"] = "cuda" lgb_train = lgb.Dataset(X_train, y_train) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) evals_result = {} @@ -87,6 +89,8 @@ def test_rf(self): 'metric': 'binary_logloss', 'verbose': -1 } + if lgb.get_device_type() == 2: + params["device"] = "cuda" lgb_train = lgb.Dataset(X_train, y_train) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) evals_result = {} @@ -106,6 +110,8 @@ def test_regression(self): 'metric': 'l2', 'verbose': -1 } + if lgb.get_device_type() == 2: + 
params["device"] = "cuda" lgb_train = lgb.Dataset(X_train, y_train) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) evals_result = {} @@ -133,6 +139,8 @@ def test_missing_value_handle(self): 'verbose': -1, 'boost_from_average': False } + if lgb.get_device_type() == 2: + params["device"] = "cuda" evals_result = {} gbm = lgb.train(params, lgb_train, num_boost_round=20, @@ -188,6 +196,8 @@ def test_missing_value_handle_na(self): 'min_data_in_bin': 1, 'zero_as_missing': False } + if lgb.get_device_type() == 2: + params["device"] = "cuda" evals_result = {} gbm = lgb.train(params, lgb_train, num_boost_round=1, @@ -220,6 +230,8 @@ def test_missing_value_handle_zero(self): 'min_data_in_bin': 1, 'zero_as_missing': True } + if lgb.get_device_type() == 2: + params["device"] = "cuda" evals_result = {} gbm = lgb.train(params, lgb_train, num_boost_round=1, @@ -252,6 +264,8 @@ def test_missing_value_handle_none(self): 'min_data_in_bin': 1, 'use_missing': False } + if lgb.get_device_type() == 2: + params["device"] = "cuda" evals_result = {} gbm = lgb.train(params, lgb_train, num_boost_round=1, @@ -290,6 +304,8 @@ def test_categorical_handle(self): 'zero_as_missing': True, 'categorical_column': 0 } + if lgb.get_device_type() == 2: + params["device"] = "cuda" evals_result = {} gbm = lgb.train(params, lgb_train, num_boost_round=1, @@ -327,6 +343,8 @@ def test_categorical_handle_na(self): 'zero_as_missing': False, 'categorical_column': 0 } + if lgb.get_device_type() == 2: + params["device"] = "cuda" evals_result = {} gbm = lgb.train(params, lgb_train, num_boost_round=1, @@ -385,6 +403,8 @@ def test_multiclass(self): 'num_class': 10, 'verbose': -1 } + if lgb.get_device_type() == 2: + params["device"] = "cuda" lgb_train = lgb.Dataset(X_train, y_train, params=params) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, params=params) evals_result = {} @@ -401,6 +421,7 @@ def test_multiclass_rf(self): X, y = load_digits(10, True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) params = { + 'device': 'cpu', 'boosting_type': 'rf', 'objective': 'multiclass', 'metric': 'multi_logloss', @@ -434,6 +455,8 @@ def test_multiclass_prediction_early_stopping(self): 'num_class': 10, 'verbose': -1 } + if lgb.get_device_type() == 2: + params["device"] = "cuda" lgb_train = lgb.Dataset(X_train, y_train, params=params) gbm = lgb.train(params, lgb_train, num_boost_round=50) @@ -455,6 +478,8 @@ def test_multi_class_error(self): X, y = load_digits(10, True) params = {'objective': 'multiclass', 'num_classes': 10, 'metric': 'multi_error', 'num_leaves': 4, 'verbose': -1} + if lgb.get_device_type() == 2: + params["device"] = "cuda" lgb_data = lgb.Dataset(X, label=y) est = lgb.train(params, lgb_data, num_boost_round=10) predict_default = est.predict(X) @@ -564,6 +589,8 @@ def test_early_stopping(self): 'metric': 'binary_logloss', 'verbose': -1 } + if lgb.get_device_type() == 2: + params["device"] = "cuda" X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) lgb_train = lgb.Dataset(X_train, y_train) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) @@ -597,6 +624,8 @@ def test_continue_train(self): 'metric': 'l1', 'verbose': -1 } + if lgb.get_device_type() == 2: + params["device"] = "cuda" lgb_train = lgb.Dataset(X_train, y_train, free_raw_data=False) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, free_raw_data=False) init_gbm = lgb.train(params, lgb_train, num_boost_round=20) @@ -662,6 +691,8 @@ def 
test_continue_train_multiclass(self): 'num_class': 3, 'verbose': -1 } + if lgb.get_device_type() == 2: + params["device"] = "cuda" lgb_train = lgb.Dataset(X_train, y_train, params=params, free_raw_data=False) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, params=params, free_raw_data=False) init_gbm = lgb.train(params, lgb_train, num_boost_round=20) @@ -718,6 +749,8 @@ def test_cv(self): q_train = np.loadtxt(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/lambdarank/rank.train.query')) params_lambdarank = {'objective': 'lambdarank', 'verbose': -1, 'eval_at': 3} + if lgb.get_device_type() == 2: + params_lambdarank["device"] = "cuda" lgb_train = lgb.Dataset(X_train, y_train, group=q_train) # ... with l2 metric cv_res_lambda = lgb.cv(params_lambdarank, lgb_train, num_boost_round=10, nfold=3, @@ -771,6 +804,8 @@ def train_and_predict(init_model=None, return_model=False): 'metric': 'l2', 'verbose': -1 } + if lgb.get_device_type() == 2: + params["device"] = "cuda" lgb_train = lgb.Dataset(X_train, y_train) gbm_template = lgb.train(params, lgb_train, num_boost_round=10, init_model=init_model) return gbm_template if return_model else mean_squared_error(y_test, gbm_template.predict(X_test)) @@ -824,6 +859,8 @@ def test_pandas_categorical(self): 'metric': 'binary_logloss', 'verbose': -1 } + if lgb.get_device_type() == 2: + params["device"] = "cuda" lgb_train = lgb.Dataset(X, y) gbm0 = lgb.train(params, lgb_train, num_boost_round=10) pred0 = gbm0.predict(X_test) @@ -920,6 +957,8 @@ def test_reference_chain(self): tmp_dat_train = tmp_dat.subset(np.arange(80)) tmp_dat_val = tmp_dat.subset(np.arange(80, 100)).subset(np.arange(18)) params = {'objective': 'regression_l2', 'metric': 'rmse'} + if lgb.get_device_type() == 2: + params["device"] = "cuda" evals_result = {} gbm = lgb.train(params, tmp_dat_train, num_boost_round=20, valid_sets=[tmp_dat_train, tmp_dat_val], @@ -935,6 +974,8 @@ def test_contribs(self): 'metric': 'binary_logloss', 'verbose': -1, } + if lgb.get_device_type() == 2: + params["device"] = "cuda" lgb_train = lgb.Dataset(X_train, y_train) gbm = lgb.train(params, lgb_train, num_boost_round=20) @@ -949,6 +990,8 @@ def train_and_get_predictions(features, labels): 'verbose': -1, 'min_data': 5, } + if lgb.get_device_type() == 2: + lgb_params["device"] = "cuda" gbm = lgb.train( params=lgb_params, train_set=dataset, @@ -1095,6 +1138,8 @@ def is_correctly_constrained(learner, x3_to_category=True): "monotone_constraints_method": monotone_constraints_method, "use_missing": False, } + if lgb.get_device_type() == 2: + params["device"] = "cuda" constrained_model = lgb.train(params, trainset) self.assertTrue(is_correctly_constrained(constrained_model, test_with_categorical_variable)) @@ -1215,6 +1260,7 @@ def test_refit(self): X, y = load_breast_cancer(True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) params = { + 'device': 'cpu', 'objective': 'binary', 'metric': 'binary_logloss', 'verbose': -1, @@ -1238,6 +1284,8 @@ def test_mape_rf(self): 'feature_fraction': 0.8, 'boost_from_average': True } + if lgb.get_device_type() == 2: + params["device"] = "cuda" lgb_train = lgb.Dataset(X, y) gbm = lgb.train(params, lgb_train, num_boost_round=20) pred = gbm.predict(X) @@ -1255,6 +1303,8 @@ def test_mape_dart(self): 'feature_fraction': 0.8, 'boost_from_average': False } + if lgb.get_device_type() == 2: + params["device"] = "cuda" lgb_train = lgb.Dataset(X, y) gbm = lgb.train(params, lgb_train, num_boost_round=40) pred = 
gbm.predict(X) @@ -1274,6 +1324,8 @@ def check_constant_features(self, y_true, expected_pred, more_params): 'min_data_in_bin': 1, 'boost_from_average': True } + if lgb.get_device_type() == 2: + params["device"] = "cuda" params.update(more_params) lgb_train = lgb.Dataset(X_train, y_train, params=params) gbm = lgb.train(params, lgb_train, num_boost_round=2) @@ -1284,6 +1336,8 @@ def test_constant_features_regression(self): params = { 'objective': 'regression' } + if lgb.get_device_type() == 2: + params["device"] = "cuda" self.check_constant_features([0.0, 10.0, 0.0, 10.0], 5.0, params) self.check_constant_features([0.0, 1.0, 2.0, 3.0], 1.5, params) self.check_constant_features([-1.0, 1.0, -2.0, 2.0], 0.0, params) @@ -1292,6 +1346,8 @@ def test_constant_features_binary(self): params = { 'objective': 'binary' } + if lgb.get_device_type() == 2: + params["device"] = "cuda" self.check_constant_features([0.0, 10.0, 0.0, 10.0], 0.5, params) self.check_constant_features([0.0, 1.0, 2.0, 3.0], 0.75, params) @@ -1300,6 +1356,8 @@ def test_constant_features_multiclass(self): 'objective': 'multiclass', 'num_class': 3 } + if lgb.get_device_type() == 2: + params["device"] = "cuda" self.check_constant_features([0.0, 1.0, 2.0, 0.0], [0.5, 0.25, 0.25], params) self.check_constant_features([0.0, 1.0, 2.0, 1.0], [0.25, 0.5, 0.25], params) @@ -1308,6 +1366,8 @@ def test_constant_features_multiclassova(self): 'objective': 'multiclassova', 'num_class': 3 } + if lgb.get_device_type() == 2: + params["device"] = "cuda" self.check_constant_features([0.0, 1.0, 2.0, 0.0], [0.5, 0.25, 0.25], params) self.check_constant_features([0.0, 1.0, 2.0, 1.0], [0.25, 0.5, 0.25], params) @@ -1327,6 +1387,8 @@ def preprocess_data(dtrain, dtest, params): X, y = load_iris(True) dataset = lgb.Dataset(X, y, free_raw_data=False) params = {'objective': 'multiclass', 'num_class': 3, 'verbose': -1} + if lgb.get_device_type() == 2: + params["device"] = "cuda" results = lgb.cv(params, dataset, num_boost_round=10, fpreproc=preprocess_data) self.assertIn('multi_logloss-mean', results) self.assertEqual(len(results['multi_logloss-mean']), 10) @@ -1339,14 +1401,28 @@ def test_metrics(self): evals_result = {} params_verbose = {'verbose': -1} + if lgb.get_device_type() == 2: + params_verbose["device"] = "cuda" params_obj_verbose = {'objective': 'binary', 'verbose': -1} + if lgb.get_device_type() == 2: + params_obj_verbose["device"] = "cuda" params_obj_metric_log_verbose = {'objective': 'binary', 'metric': 'binary_logloss', 'verbose': -1} + if lgb.get_device_type() == 2: + params_obj_metric_log_verbose["device"] = "cuda" params_obj_metric_err_verbose = {'objective': 'binary', 'metric': 'binary_error', 'verbose': -1} + if lgb.get_device_type() == 2: + params_obj_metric_err_verbose["device"] = "cuda" params_obj_metric_inv_verbose = {'objective': 'binary', 'metric': 'invalid_metric', 'verbose': -1} + if lgb.get_device_type() == 2: + params_obj_metric_inv_verbose["device"] = "cuda" params_obj_metric_multi_verbose = {'objective': 'binary', 'metric': ['binary_logloss', 'binary_error'], 'verbose': -1} + if lgb.get_device_type() == 2: + params_obj_metric_multi_verbose["device"] = "cuda" params_obj_metric_none_verbose = {'objective': 'binary', 'metric': 'None', 'verbose': -1} + if lgb.get_device_type() == 2: + params_obj_metric_none_verbose["device"] = "cuda" params_metric_log_verbose = {'metric': 'binary_logloss', 'verbose': -1} params_metric_err_verbose = {'metric': 'binary_error', 'verbose': -1} params_metric_inv_verbose = {'metric_types': 'invalid_metric', 
'verbose': -1} @@ -1355,7 +1431,6 @@ def test_metrics(self): def get_cv_result(params=params_obj_verbose, **kwargs): return lgb.cv(params, lgb_train, num_boost_round=2, verbose_eval=False, **kwargs) - def train_booster(params=params_obj_verbose, **kwargs): lgb.train(params, lgb_train, num_boost_round=2, @@ -1564,6 +1639,8 @@ def train_booster(params=params_obj_verbose, **kwargs): # remove default metric by 'None' aliases for na_alias in ('None', 'na', 'null', 'custom'): params = {'objective': 'binary', 'metric': na_alias, 'verbose': -1} + if lgb.get_device_type() == 2: + params["device"] = "cuda" train_booster(params=params) self.assertEqual(len(evals_result), 0) @@ -1644,8 +1721,14 @@ def train_booster(params=params_obj_verbose, **kwargs): obj_multi_aliases = ['multiclass', 'softmax', 'multiclassova', 'multiclass_ova', 'ova', 'ovr'] for obj_multi_alias in obj_multi_aliases: params_obj_class_3_verbose = {'objective': obj_multi_alias, 'num_class': 3, 'verbose': -1} + if lgb.get_device_type() == 2: + params_obj_class_3_verbose["device"] = "cuda" params_obj_class_1_verbose = {'objective': obj_multi_alias, 'num_class': 1, 'verbose': -1} + if lgb.get_device_type() == 2: + params_obj_class_1_verbose["device"] = "cuda" params_obj_verbose = {'objective': obj_multi_alias, 'verbose': -1} + if lgb.get_device_type() == 2: + params_obj_verbose["device"] = "cuda" # multiclass default metric res = get_cv_result(params_obj_class_3_verbose) self.assertEqual(len(res), 2) @@ -1686,6 +1769,8 @@ def train_booster(params=params_obj_verbose, **kwargs): self.assertRaises(lgb.basic.LightGBMError, get_cv_result, params_obj_class_3_verbose, metrics='binary_logloss') params_class_3_verbose = {'num_class': 3, 'verbose': -1} + if lgb.get_device_type() == 2: + params_class_3_verbose["device"] = "cuda" # non-default num_class for default objective self.assertRaises(lgb.basic.LightGBMError, get_cv_result, params_class_3_verbose) @@ -1820,6 +1905,8 @@ def metrics_combination_train_regression(valid_sets, metric_list, assumed_iterat 'verbose': -1, 'seed': 123 } + if lgb.get_device_type() == 2: + params["device"] = "cuda" gbm = lgb.train(dict(params, first_metric_only=first_metric_only), lgb_train, num_boost_round=25, valid_sets=valid_sets, feval=feval, early_stopping_rounds=5, verbose_eval=False) From e17b345c561461f09331c1e7bb31618dc83e676c Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Thu, 2 Apr 2020 20:01:39 +0000 Subject: [PATCH 013/119] Initial CUDA work --- src/io/dense_nbits_bin.hpp | 405 -------------------------- src/treelearner/cuda_tree_learner.cpp | 1 - 2 files changed, 406 deletions(-) delete mode 100644 src/io/dense_nbits_bin.hpp diff --git a/src/io/dense_nbits_bin.hpp b/src/io/dense_nbits_bin.hpp deleted file mode 100644 index 85ea5b311d2..00000000000 --- a/src/io/dense_nbits_bin.hpp +++ /dev/null @@ -1,405 +0,0 @@ -/*! - * Copyright (c) 2017 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. 
- */ -#ifndef LIGHTGBM_IO_DENSE_NBITS_BIN_HPP_ -#define LIGHTGBM_IO_DENSE_NBITS_BIN_HPP_ - -#include - -#include -#include -#include - -namespace LightGBM { - -class Dense4bitsBin; - -class Dense4bitsBinIterator : public BinIterator { - public: - explicit Dense4bitsBinIterator(const Dense4bitsBin* bin_data, uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) - : bin_data_(bin_data), min_bin_(static_cast(min_bin)), - max_bin_(static_cast(max_bin)), - default_bin_(static_cast(default_bin)) { - if (default_bin_ == 0) { - bias_ = 1; - } else { - bias_ = 0; - } - } - inline uint32_t RawGet(data_size_t idx) override; - inline uint32_t Get(data_size_t idx) override; - inline void Reset(data_size_t) override {} - - private: - const Dense4bitsBin* bin_data_; - uint8_t min_bin_; - uint8_t max_bin_; - uint8_t default_bin_; - uint8_t bias_; -}; - -class Dense4bitsBin : public Bin { - public: - friend Dense4bitsBinIterator; - Dense4bitsBin(data_size_t num_data) - : num_data_(num_data) { - int len = (num_data_ + 1) / 2; - data_ = std::vector(len, static_cast(0)); - buf_ = std::vector(len, static_cast(0)); - } - - ~Dense4bitsBin() { - } - - void Push(int, data_size_t idx, uint32_t value) override { - const int i1 = idx >> 1; - const int i2 = (idx & 1) << 2; - const uint8_t val = static_cast(value) << i2; - if (i2 == 0) { - data_[i1] = val; - } else { - buf_[i1] = val; - } - } - - void ReSize(data_size_t num_data) override { - if (num_data_ != num_data) { - num_data_ = num_data; - const int len = (num_data_ + 1) / 2; - data_.resize(len); - } - } - - inline BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const override; - - void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data, - const score_t* ordered_gradients, const score_t* ordered_hessians, - hist_t* out) const override { - const data_size_t rest = num_data & 0x3; - data_size_t i = 0; - for (; i < num_data - rest; i += 4) { - const data_size_t idx0 = data_indices[i]; - const auto bin0 = (data_[idx0 >> 1] >> ((idx0 & 1) << 2)) & 0xf; - - const data_size_t idx1 = data_indices[i + 1]; - const auto bin1 = (data_[idx1 >> 1] >> ((idx1 & 1) << 2)) & 0xf; - - const data_size_t idx2 = data_indices[i + 2]; - const auto bin2 = (data_[idx2 >> 1] >> ((idx2 & 1) << 2)) & 0xf; - - const data_size_t idx3 = data_indices[i + 3]; - const auto bin3 = (data_[idx3 >> 1] >> ((idx3 & 1) << 2)) & 0xf; - - out[bin0].sum_gradients += ordered_gradients[i]; - out[bin1].sum_gradients += ordered_gradients[i + 1]; - out[bin2].sum_gradients += ordered_gradients[i + 2]; - out[bin3].sum_gradients += ordered_gradients[i + 3]; - - out[bin0].sum_hessians += ordered_hessians[i]; - out[bin1].sum_hessians += ordered_hessians[i + 1]; - out[bin2].sum_hessians += ordered_hessians[i + 2]; - out[bin3].sum_hessians += ordered_hessians[i + 3]; - - ++out[bin0].cnt; - ++out[bin1].cnt; - ++out[bin2].cnt; - ++out[bin3].cnt; - } - - for (; i < num_data; ++i) { - const data_size_t idx = data_indices[i]; - const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; - out[bin].sum_gradients += ordered_gradients[i]; - out[bin].sum_hessians += ordered_hessians[i]; - ++out[bin].cnt; - } - } - - void ConstructHistogram(data_size_t num_data, - const score_t* ordered_gradients, const score_t* ordered_hessians, - hist_t* out) const override { - const data_size_t rest = num_data & 0x3; - data_size_t i = 0; - - for (; i < num_data - rest; i += 4) { - const auto bin0 = (data_[i >> 1]) & 0xf; - const auto bin1 = (data_[i >> 1] >> 4) & 0xf; - const 
auto bin2 = (data_[(i >> 1) + 1]) & 0xf; - const auto bin3 = (data_[(i >> 1) + 1] >> 4) & 0xf; - - out[bin0].sum_gradients += ordered_gradients[i]; - out[bin1].sum_gradients += ordered_gradients[i + 1]; - out[bin2].sum_gradients += ordered_gradients[i + 2]; - out[bin3].sum_gradients += ordered_gradients[i + 3]; - - out[bin0].sum_hessians += ordered_hessians[i]; - out[bin1].sum_hessians += ordered_hessians[i + 1]; - out[bin2].sum_hessians += ordered_hessians[i + 2]; - out[bin3].sum_hessians += ordered_hessians[i + 3]; - - ++out[bin0].cnt; - ++out[bin1].cnt; - ++out[bin2].cnt; - ++out[bin3].cnt; - } - for (; i < num_data; ++i) { - const auto bin = (data_[i >> 1] >> ((i & 1) << 2)) & 0xf; - out[bin].sum_gradients += ordered_gradients[i]; - out[bin].sum_hessians += ordered_hessians[i]; - ++out[bin].cnt; - } - } - - void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data, - const score_t* ordered_gradients, - hist_t* out) const override { - const data_size_t rest = num_data & 0x3; - data_size_t i = 0; - for (; i < num_data - rest; i += 4) { - const data_size_t idx0 = data_indices[i]; - const auto bin0 = (data_[idx0 >> 1] >> ((idx0 & 1) << 2)) & 0xf; - - const data_size_t idx1 = data_indices[i + 1]; - const auto bin1 = (data_[idx1 >> 1] >> ((idx1 & 1) << 2)) & 0xf; - - const data_size_t idx2 = data_indices[i + 2]; - const auto bin2 = (data_[idx2 >> 1] >> ((idx2 & 1) << 2)) & 0xf; - - const data_size_t idx3 = data_indices[i + 3]; - const auto bin3 = (data_[idx3 >> 1] >> ((idx3 & 1) << 2)) & 0xf; - - out[bin0].sum_gradients += ordered_gradients[i]; - out[bin1].sum_gradients += ordered_gradients[i + 1]; - out[bin2].sum_gradients += ordered_gradients[i + 2]; - out[bin3].sum_gradients += ordered_gradients[i + 3]; - - ++out[bin0].cnt; - ++out[bin1].cnt; - ++out[bin2].cnt; - ++out[bin3].cnt; - } - - for (; i < num_data; ++i) { - const data_size_t idx = data_indices[i]; - const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; - out[bin].sum_gradients += ordered_gradients[i]; - ++out[bin].cnt; - } - } - - void ConstructHistogram(data_size_t num_data, - const score_t* ordered_gradients, - hist_t* out) const override { - const data_size_t rest = num_data & 0x3; - data_size_t i = 0; - for (; i < num_data - rest; i += 4) { - const auto bin0 = (data_[i >> 1]) & 0xf; - const auto bin1 = (data_[i >> 1] >> 4) & 0xf; - const auto bin2 = (data_[(i >> 1) + 1]) & 0xf; - const auto bin3 = (data_[(i >> 1) + 1] >> 4) & 0xf; - - out[bin0].sum_gradients += ordered_gradients[i]; - out[bin1].sum_gradients += ordered_gradients[i + 1]; - out[bin2].sum_gradients += ordered_gradients[i + 2]; - out[bin3].sum_gradients += ordered_gradients[i + 3]; - - ++out[bin0].cnt; - ++out[bin1].cnt; - ++out[bin2].cnt; - ++out[bin3].cnt; - } - for (; i < num_data; ++i) { - const auto bin = (data_[i >> 1] >> ((i & 1) << 2)) & 0xf; - out[bin].sum_gradients += ordered_gradients[i]; - ++out[bin].cnt; - } - } - - virtual data_size_t Split( - uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, MissingType missing_type, bool default_left, - uint32_t threshold, data_size_t* data_indices, data_size_t num_data, - data_size_t* lte_indices, data_size_t* gt_indices) const override { - if (num_data <= 0) { return 0; } - uint8_t th = static_cast(threshold + min_bin); - const uint8_t minb = static_cast(min_bin); - const uint8_t maxb = static_cast(max_bin); - uint8_t t_default_bin = static_cast(min_bin + default_bin); - if (default_bin == 0) { - th -= 1; - t_default_bin -= 1; - } - data_size_t lte_count = 0; - data_size_t 
gt_count = 0; - data_size_t* default_indices = gt_indices; - data_size_t* default_count = >_count; - if (missing_type == MissingType::NaN) { - if (default_bin <= threshold) { - default_indices = lte_indices; - default_count = <e_count; - } - data_size_t* missing_default_indices = gt_indices; - data_size_t* missing_default_count = >_count; - if (default_left) { - missing_default_indices = lte_indices; - missing_default_count = <e_count; - } - for (data_size_t i = 0; i < num_data; ++i) { - const data_size_t idx = data_indices[i]; - const uint8_t bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; - if (bin < minb || bin > maxb || t_default_bin == bin) { - default_indices[(*default_count)++] = idx; - } else if (bin == maxb) { - missing_default_indices[(*missing_default_count)++] = idx; - } else if (bin > th) { - gt_indices[gt_count++] = idx; - } else { - lte_indices[lte_count++] = idx; - } - } - } else { - if ((default_left && missing_type == MissingType::Zero) || (default_bin <= threshold && missing_type != MissingType::Zero)) { - default_indices = lte_indices; - default_count = <e_count; - } - for (data_size_t i = 0; i < num_data; ++i) { - const data_size_t idx = data_indices[i]; - const uint8_t bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; - if (bin < minb || bin > maxb || t_default_bin == bin) { - default_indices[(*default_count)++] = idx; - } else if (bin > th) { - gt_indices[gt_count++] = idx; - } else { - lte_indices[lte_count++] = idx; - } - } - } - return lte_count; - } - - virtual data_size_t SplitCategorical( - uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, - const uint32_t* threshold, int num_threahold, data_size_t* data_indices, data_size_t num_data, - data_size_t* lte_indices, data_size_t* gt_indices) const override { - if (num_data <= 0) { return 0; } - data_size_t lte_count = 0; - data_size_t gt_count = 0; - data_size_t* default_indices = gt_indices; - data_size_t* default_count = >_count; - if (Common::FindInBitset(threshold, num_threahold, default_bin)) { - default_indices = lte_indices; - default_count = <e_count; - } - for (data_size_t i = 0; i < num_data; ++i) { - const data_size_t idx = data_indices[i]; - const uint32_t bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; - if (bin < min_bin || bin > max_bin) { - default_indices[(*default_count)++] = idx; - } else if (Common::FindInBitset(threshold, num_threahold, bin - min_bin)) { - lte_indices[lte_count++] = idx; - } else { - gt_indices[gt_count++] = idx; - } - } - return lte_count; - } - - data_size_t num_data() const override { return num_data_; } - - // LGBM_CUDA - void* get_data() override { return data_.data(); } - - /*! 
\brief not ordered bin for dense feature */ - Bin* CreateDenseBin() const { return nullptr; } - - void FinishLoad() override { - if (buf_.empty()) { return; } - int len = (num_data_ + 1) / 2; - for (int i = 0; i < len; ++i) { - data_[i] |= buf_[i]; - } - buf_.clear(); - } - - void LoadFromMemory(const void* memory, const std::vector& local_used_indices) override { - const uint8_t* mem_data = reinterpret_cast(memory); - if (!local_used_indices.empty()) { - const data_size_t rest = num_data_ & 1; - for (int i = 0; i < num_data_ - rest; i += 2) { - // get old bins - data_size_t idx = local_used_indices[i]; - const auto bin1 = static_cast((mem_data[idx >> 1] >> ((idx & 1) << 2)) & 0xf); - idx = local_used_indices[i + 1]; - const auto bin2 = static_cast((mem_data[idx >> 1] >> ((idx & 1) << 2)) & 0xf); - // add - const int i1 = i >> 1; - data_[i1] = (bin1 | (bin2 << 4)); - } - if (rest) { - data_size_t idx = local_used_indices[num_data_ - 1]; - data_[num_data_ >> 1] = (mem_data[idx >> 1] >> ((idx & 1) << 2)) & 0xf; - } - } else { - for (size_t i = 0; i < data_.size(); ++i) { - data_[i] = mem_data[i]; - } - } - } - - void CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override { - auto other_bin = dynamic_cast(full_bin); - const data_size_t rest = num_used_indices & 1; - for (int i = 0; i < num_used_indices - rest; i += 2) { - data_size_t idx = used_indices[i]; - const auto bin1 = static_cast((other_bin->data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf); - idx = used_indices[i + 1]; - const auto bin2 = static_cast((other_bin->data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf); - const int i1 = i >> 1; - data_[i1] = (bin1 | (bin2 << 4)); - } - if (rest) { - data_size_t idx = used_indices[num_used_indices - 1]; - data_[num_used_indices >> 1] = (other_bin->data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; - } - } - - void SaveBinaryToFile(const VirtualFileWriter* writer) const override { - writer->Write(data_.data(), sizeof(uint8_t) * data_.size()); - } - - size_t SizesInByte() const override { - return sizeof(uint8_t) * data_.size(); - } - - Dense4bitsBin* Clone() override { - return new Dense4bitsBin(*this); - } - - protected: - Dense4bitsBin(const Dense4bitsBin& other) - : num_data_(other.num_data_), data_(other.data_), buf_(other.buf_) {} - - data_size_t num_data_; - std::vector data_; - std::vector buf_; -}; - -uint32_t Dense4bitsBinIterator::Get(data_size_t idx) { - const auto bin = (bin_data_->data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; - if (bin >= min_bin_ && bin <= max_bin_) { - return bin - min_bin_ + bias_; - } else { - return default_bin_; - } -} - -uint32_t Dense4bitsBinIterator::RawGet(data_size_t idx) { - return (bin_data_->data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; -} - -inline BinIterator* Dense4bitsBin::GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const { - return new Dense4bitsBinIterator(this, min_bin, max_bin, default_bin); -} - -} // namespace LightGBM -#endif // LIGHTGBM_IO_DENSE_NBITS_BIN_HPP_ diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index d2064843ead..05c440a8e25 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -1,7 +1,6 @@ #ifdef USE_CUDA #include "cuda_tree_learner.h" #include "../io/dense_bin.hpp" -#include "../io/dense_nbits_bin.hpp" #include #include From 32825e525ca5758230275c8245b4abd9320dd89e Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Thu, 2 Apr 2020 13:44:01 +0000 Subject: [PATCH 014/119] Initial CUDA work 
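
Adds a small driver script, test_LGBM.232.sh, that runs each of the
Python test modules through "python -m unittest". It is presumably meant
to be run from the repository root after the package has been built and
installed (see python-package/setup.py), e.g.:

    ./test_LGBM.232.sh
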
--- test_LGBM.232.sh | 5 +++++ 1 file changed, 5 insertions(+) create mode 100755 test_LGBM.232.sh diff --git a/test_LGBM.232.sh b/test_LGBM.232.sh new file mode 100755 index 00000000000..cd5146f959f --- /dev/null +++ b/test_LGBM.232.sh @@ -0,0 +1,5 @@ +python -m unittest tests/python_package_test/test_basic.py +python -m unittest tests/python_package_test/test_consistency.py +python -m unittest tests/python_package_test/test_engine.py +python -m unittest tests/python_package_test/test_plotting.py +python -m unittest tests/python_package_test/test_sklearn.py From 169a7341147de02d1e854e4b00c6a6134a88808a Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Thu, 2 Apr 2020 18:32:22 +0000 Subject: [PATCH 015/119] Initial CUDA work --- include/LightGBM/cuda/cuda_utils.h | 16 +++------------- include/LightGBM/cuda/vector_cudahost.h | 16 +++------------- 2 files changed, 6 insertions(+), 26 deletions(-) diff --git a/include/LightGBM/cuda/cuda_utils.h b/include/LightGBM/cuda/cuda_utils.h index 6d9407613f6..e57d3746a21 100644 --- a/include/LightGBM/cuda/cuda_utils.h +++ b/include/LightGBM/cuda/cuda_utils.h @@ -1,17 +1,7 @@ -/* - * ibmGBT: IBM CUDA Accelerated LightGBM - * - * IBM Confidential - * (C) Copyright IBM Corp. 2019. All Rights Reserved. - * - * The source code for this program is not published or otherwise - * divested of its trade secrets, irrespective of what has been - * deposited with the U.S. Copyright Office. - * - * US Government Users Restricted Rights - Use, duplication or - * disclosure restricted by GSA ADP Schedule Contract with IBM Corp. +/*! + * Copyright (c) 2020 IBM Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. */ - #ifndef LGBM_CUDA_UTILS_H #define LGBM_CUDA_UTILS_H diff --git a/include/LightGBM/cuda/vector_cudahost.h b/include/LightGBM/cuda/vector_cudahost.h index b1a235e8a22..5159a01f030 100644 --- a/include/LightGBM/cuda/vector_cudahost.h +++ b/include/LightGBM/cuda/vector_cudahost.h @@ -1,17 +1,7 @@ -/* - * ibmGBT: IBM CUDA Accelerated LightGBM - * - * IBM Confidential - * (C) Copyright IBM Corp. 2019. All Rights Reserved. - * - * The source code for this program is not published or otherwise - * divested of its trade secrets, irrespective of what has been - * deposited with the U.S. Copyright Office. - * - * US Government Users Restricted Rights - Use, duplication or - * disclosure restricted by GSA ADP Schedule Contract with IBM Corp. +/*! + * Copyright (c) 2020 IBM Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. 
*/ - #ifndef LGBM_CUDA_VECTOR_CH_H #define LGBM_CUDA_VECTOR_CH_H From 3c83274fdcf80c48a579fee868112049503e8e82 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Fri, 3 Apr 2020 17:08:49 +0000 Subject: [PATCH 016/119] Initial CUDA work --- src/treelearner/cuda_tree_learner.cpp | 73 ++++++++++++++------------- 1 file changed, 39 insertions(+), 34 deletions(-) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 05c440a8e25..df8cefd5cea 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -81,6 +81,7 @@ void CUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, } // some functions used for debugging the GPU histogram construction +#if GPU_DEBUG > 0 void PrintHistograms(hist_t* h, size_t size) { size_t total = 0; @@ -103,7 +104,7 @@ union Float_t }; -void CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id) { +void CompareHistograms(HistogramBinEntry* h1, HistogramBinEntry* h2, size_t size, int feature_id) { size_t i; Float_t a, b; @@ -137,6 +138,8 @@ void CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id) { Log::Warning("Mismatched histograms found for feature %d at location %lu.", feature_id, i); } +#endif + int CUDATreeLearner::GetNumWorkgroupsPerFeature(data_size_t leaf_num_data) { // we roughly want 256 workgroups per device, and we have num_dense_feature4_ feature tuples. @@ -230,6 +233,7 @@ void CUDATreeLearner::WaitAndGetHistograms(hist_t* histograms) { // when the output is ready, the computation is done CUDASUCCESS_OR_FATAL(cudaEventSynchronize(histograms_wait_obj_[device_id])); + // LGBM_CUDA } #pragma omp parallel for schedule(static) @@ -242,12 +246,10 @@ void CUDATreeLearner::WaitAndGetHistograms(hist_t* histograms) { int bin_size = train_data_->FeatureGroupNumBin(dense_group_index); for (int j = 0; j < bin_size; ++j) { - old_histogram_array[j].sum_gradients = hist_outputs[i * device_bin_size_+ j].sum_gradients; - old_histogram_array[j].sum_hessians = hist_outputs[i * device_bin_size_ + j].sum_hessians; - old_histogram_array[j].cnt = (data_size_t)hist_outputs[i * device_bin_size_ + j].cnt; + GET_GRAD(old_histogram_array, j) = GET_GRAD(hist_outputs, i * device_bin_size_+ j); + GET_HESS(old_histogram_array, j) = GET_HESS(hist_outputs, i * device_bin_size_+ j); } } - } // LGBM_CUDA @@ -256,7 +258,7 @@ void CUDATreeLearner::CountDenseFeatureGroups() { num_dense_feature_groups_ = 0; for (int i = 0; i < num_feature_groups_; ++i) { - if (ordered_bins_[i] == nullptr) { + if (!train_data_->IsMultiGroup(i)) { num_dense_feature_groups_++; } } @@ -435,7 +437,7 @@ void CUDATreeLearner::copyDenseFeature() { for (int i = 0; i < num_feature_groups_; ++i) { // looking for dword_features_ non-sparse feature-groups - if (ordered_bins_[i] == nullptr) { + if (!train_data_->IsMultiGroup(i)) { dense_feature_group_map_.push_back(i); auto sizes_in_byte = train_data_->FeatureGroupSizesInByte(i); void* tmp_data = train_data_->FeatureGroupData(i); @@ -548,6 +550,7 @@ void CUDATreeLearner::InitGPU(int num_gpu) { // for debuging kernel_time_.resize(num_gpu_, 0); kernel_input_wait_time_.resize(num_gpu_, std::chrono::milliseconds(0)); + //kernel_output_wait_time_.resize(num_gpu_, std::chrono::milliseconds(0)); for(int i = 0; i < num_gpu_; ++i) { CUDASUCCESS_OR_FATAL(cudaSetDevice(i)); @@ -865,7 +868,7 @@ bool CUDATreeLearner::ConstructGPUHistogramsAsync( void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_used, bool use_subtract) { - //LGBM_CUDA 
+ // LGBM_CUDA auto start_time = std::chrono::steady_clock::now(); std::vector is_sparse_feature_used(num_features_, 0); @@ -875,9 +878,8 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ #pragma omp parallel for schedule(static) for (int feature_index = 0; feature_index < num_features_; ++feature_index) { - if (!is_feature_used_[feature_index]) continue; if (!is_feature_used[feature_index]) continue; - if (ordered_bins_[train_data_->Feature2Group(feature_index)]) { + if (train_data_->IsMultiGroup(train_data_->Feature2Group(feature_index))) { is_sparse_feature_used[feature_index] = 1; num_sparse_features++; } @@ -905,12 +907,19 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ // then construct sparse features on CPU // We set data_indices to null to avoid rebuilding ordered gradients/hessians if (num_sparse_features > 0){ +// train_data_->ConstructHistograms(is_sparse_feature_used, +// nullptr, smaller_leaf_splits_->num_data_in_leaf(), +// smaller_leaf_splits_->leaf_index(), +// ordered_bins_, gradients_, hessians_, +// ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_, +// ptr_smaller_leaf_hist_data); train_data_->ConstructHistograms(is_sparse_feature_used, - nullptr, smaller_leaf_splits_->num_data_in_leaf(), - smaller_leaf_splits_->LeafIndex(), - ordered_bins_, gradients_, hessians_, - ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_, + smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(), + gradients_, hessians_, + ordered_gradients_.data(), ordered_hessians_.data(), + share_state_.get(), ptr_smaller_leaf_hist_data); + } // wait for GPU to finish, only if GPU is actually used @@ -935,13 +944,13 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ continue; int dense_feature_group_index = dense_feature_group_map_[i]; size_t size = train_data_->FeatureGroupNumBin(dense_feature_group_index); - hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - 1; - hist_t* current_histogram = ptr_smaller_leaf_hist_data + train_data_->GroupBinBoundary(dense_feature_group_index); - hist_t* gpu_histogram = new hist_t[size]; + HistogramBinEntry* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - 1; + HistogramBinEntry* current_histogram = ptr_smaller_leaf_hist_data + train_data_->GroupBinBoundary(dense_feature_group_index); + HistogramBinEntry* gpu_histogram = new HistogramBinEntry[size]; data_size_t num_data = smaller_leaf_splits_->num_data_in_leaf(); std::copy(current_histogram, current_histogram + size, gpu_histogram); - std::memset(current_histogram, 0, train_data_->FeatureGroupNumBin(dense_feature_group_index) * sizeof(hist_t)); + std::memset(current_histogram, 0, train_data_->FeatureGroupNumBin(dense_feature_group_index) * sizeof(HistogramBinEntry)); if ( num_data == num_data_ ) { if ( is_constant_hessian_ ) { printf("ConstructHistogram(): num_data == num_data_ is_constant_hessian_"); @@ -999,12 +1008,18 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ // We set data_indices to null to avoid rebuilding ordered gradients/hessians if (num_sparse_features > 0){ + //train_data_->ConstructHistograms(is_sparse_feature_used, + // nullptr, larger_leaf_splits_->num_data_in_leaf(), + // larger_leaf_splits_->leaf_index(), + // ordered_bins_, gradients_, hessians_, + // ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_, + // ptr_larger_leaf_hist_data); 
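+    // larger leaf: histograms for the sparse feature groups are still built on the CPU here, while the GPU kernel launched earlier works on the dense groups asynchronously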
train_data_->ConstructHistograms(is_sparse_feature_used, - nullptr, larger_leaf_splits_->num_data_in_leaf(), - larger_leaf_splits_->LeafIndex(), - ordered_bins_, gradients_, hessians_, - ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_, - ptr_larger_leaf_hist_data); + smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(), + gradients_, hessians_, + ordered_gradients_.data(), ordered_hessians_.data(), + share_state_.get(), + ptr_smaller_leaf_hist_data); } // wait for GPU to finish, only if GPU is actually used @@ -1037,7 +1052,7 @@ void CUDATreeLearner::FindBestSplits() { size_t bin_size = train_data_->FeatureNumBin(feature_index) + 1; printf("CUDATreeLearner::FindBestSplits() Feature %d bin_size=%d smaller leaf:\n", feature_index, bin_size); PrintHistograms(smaller_leaf_histogram_array_[feature_index].RawData() - 1, bin_size); - if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->LeafIndex() < 0) { continue; } + if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->leaf_index() < 0) { continue; } printf("CUDATreeLearner::FindBestSplits() Feature %d bin_size=%d larger leaf:\n", feature_index, bin_size); PrintHistograms(larger_leaf_histogram_array_[feature_index].RawData() - 1, bin_size); @@ -1047,13 +1062,10 @@ void CUDATreeLearner::FindBestSplits() { void CUDATreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) { const SplitInfo& best_split_info = best_split_per_leaf_[best_Leaf]; - #if GPU_DEBUG >= 2 printf("Splitting leaf %d with feature %d thresh %d gain %f stat %f %f %f %f\n", best_Leaf, best_split_info.feature, best_split_info.threshold, best_split_info.gain, best_split_info.left_sum_gradient, best_split_info.right_sum_gradient, best_split_info.left_sum_hessian, best_split_info.right_sum_hessian); #endif - SerialTreeLearner::Split(tree, best_Leaf, left_leaf, right_leaf); - if (Network::num_machines() == 1) { // do some sanity check for the GPU algorithm if (best_split_info.left_count < best_split_info.right_count) { @@ -1062,21 +1074,14 @@ void CUDATreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* righ Log::Fatal("1 Bug in GPU histogram! split %d: %d, smaller_leaf: %d, larger_leaf: %d\n", best_split_info.left_count, best_split_info.right_count, smaller_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf()); } } else { - double smaller_min = smaller_leaf_splits_->min_constraint(); - double smaller_max = smaller_leaf_splits_->max_constraint(); - double larger_min = larger_leaf_splits_->min_constraint(); - double larger_max = larger_leaf_splits_->max_constraint(); smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), best_split_info.right_sum_gradient, best_split_info.right_sum_hessian); larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, best_split_info.left_sum_hessian); - smaller_leaf_splits_->SetValueConstraint(smaller_min, smaller_max); - larger_leaf_splits_->SetValueConstraint(larger_min, larger_max); if ((best_split_info.left_count != larger_leaf_splits_->num_data_in_leaf()) || (best_split_info.right_count!= smaller_leaf_splits_->num_data_in_leaf())) { Log::Fatal("2 Bug in GPU histogram! 
split %d: %d, smaller_leaf: %d, larger_leaf: %d\n", best_split_info.left_count, best_split_info.right_count, smaller_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf()); } } } - } } // namespace LightGBM From 5b3f36acf6a7bbb7deb3fe14776ae3d12d7f90a0 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Mon, 6 Apr 2020 02:11:44 +0000 Subject: [PATCH 017/119] Initial CUDA work --- src/boosting/gbdt.cpp | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 580c52fd889..06af0fcb1f1 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -93,20 +93,6 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective is_constant_hessian_ = GetIsConstHessian(objective_function); - tree_learner_ = std::unique_ptr(TreeLearner::CreateTreeLearner(config_->tree_learner, config_->device_type, config_.get())); - - // init tree learner - // LGBM_CUDA do not copy feature is is_use_subset for initialization - // LGBM_CUDA initialize training data info with bagging data size (tmp_subset_) - - if (config_->device_type == std::string("cuda")) { - tree_learner_->Init(tmp_subset_.get(), is_constant_hessian_, is_use_subset_); - } else { - tree_learner_->Init(train_data_, is_constant_hessian_, is_use_subset_); - } - - tree_learner_->SetForcedSplit(&forced_splits_json_); - // push training metrics training_metrics_.clear(); for (const auto& metric : training_metrics) { @@ -132,24 +118,31 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective feature_infos_ = train_data_->feature_infos(); monotone_constraints_ = config->monotone_constraints; - // if need bagging, create buffer - // LGBM_CUDA: for CUDA implementation, this function sets up the is_use_subset_ flag as TRUE and tmp_subset_ is allocated. - ResetBaggingConfig(config_.get(), true); - // LGBM_CUDA // Two key changes: position of the initializer is moved from the original code, and init() uses is_use_subset_ flag tree_learner_ = std::unique_ptr(TreeLearner::CreateTreeLearner(config_->tree_learner, config_->device_type, config_.get())); + // if need bagging, create buffer + // LGBM_CUDA: for CUDA implementation, this function sets up the is_use_subset_ flag as TRUE and tmp_subset_ is allocated. 
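+  // (Below, the CUDA path hands tmp_subset_ to the tree learner only when is_use_subset_ ends up TRUE; otherwise it falls back to the full train_data_, like the CPU path.)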
+ ResetBaggingConfig(config_.get(), true); + // init tree learner // LGBM_CUDA do not copy feature is is_use_subset for initialization // LGBM_CUDA initialize training data info with bagging data size (tmp_subset_) if (config_->device_type == std::string("cuda")) { - tree_learner_->Init(tmp_subset_.get(), is_constant_hessian_, is_use_subset_); + if (is_use_subset_) { + tree_learner_->Init(tmp_subset_.get(), is_constant_hessian_, is_use_subset_); + } + else { + tree_learner_->Init(train_data_, is_constant_hessian_, is_use_subset_); + } } else { tree_learner_->Init(train_data_, is_constant_hessian_, is_use_subset_); } + tree_learner_->SetForcedSplit(&forced_splits_json_); + class_need_train_ = std::vector(num_tree_per_iteration_, true); if (objective_function_ != nullptr && objective_function_->SkipEmptyClass()) { CHECK_EQ(num_tree_per_iteration_, num_class_); From 91a312fab265253e198dd7d9f90ecc48d52ca439 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Tue, 7 Apr 2020 14:52:32 +0000 Subject: [PATCH 018/119] Initial CUDA work --- tests/python_package_test/test_engine.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 37894815f4a..51ab32a239b 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1138,8 +1138,6 @@ def is_correctly_constrained(learner, x3_to_category=True): "monotone_constraints_method": monotone_constraints_method, "use_missing": False, } - if lgb.get_device_type() == 2: - params["device"] = "cuda" constrained_model = lgb.train(params, trainset) self.assertTrue(is_correctly_constrained(constrained_model, test_with_categorical_variable)) From b05afeb5754c54217722477c57eea052371bef56 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Wed, 15 Apr 2020 13:56:31 +0000 Subject: [PATCH 019/119] Initial CUDA work --- include/LightGBM/cuda/vector_cudahost.h | 1 + src/boosting/gbdt.cpp | 8 +++++++- src/io/config.cpp | 2 +- src/treelearner/cuda_tree_learner.cpp | 25 +++++++++++-------------- 4 files changed, 20 insertions(+), 16 deletions(-) diff --git a/include/LightGBM/cuda/vector_cudahost.h b/include/LightGBM/cuda/vector_cudahost.h index 5159a01f030..d95a353c246 100644 --- a/include/LightGBM/cuda/vector_cudahost.h +++ b/include/LightGBM/cuda/vector_cudahost.h @@ -43,6 +43,7 @@ struct CHAllocator { if (LightGBM::LGBM_config_::current_device == lgbm_device_cuda){ cudaError_t ret= cudaHostAlloc(&ptr, n*sizeof(T), cudaHostAllocPortable); if (ret != cudaSuccess){ +fprintf(stderr, " TROUBLE: defaulting to malloc in CHAllocator!!!\n"); fflush(stderr); ptr = (T*) malloc(n*sizeof(T)); } } diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 06af0fcb1f1..02f659e9d83 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -425,6 +425,12 @@ bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { auto hess = hessians + offset; // LGBM_CUDA + if (((tmp_gradients_.data() == 0) || (tmp_hessians_.data() == 0)) && (config_->device_type == std::string("cuda"))) { + size_t bag_gh_size = static_cast(bag_data_cnt_) * num_tree_per_iteration_; + tmp_gradients_.resize(bag_gh_size); + tmp_hessians_.resize(bag_gh_size); + } + auto tmp_grad = tmp_gradients_.data(); auto tmp_hess = tmp_hessians_.data(); @@ -436,7 +442,7 @@ bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { tmp_grad[i] = grad[bag_data_indices_[i]]; // LGBM_CUDA tmp_hess[i] = hess[bag_data_indices_[i]]; // LGBM_CUDA } - } + } // 
LGBM_CUDA new_tree.reset(tree_learner_->Train(tmp_grad, tmp_hess, is_constant_hessian_, forced_splits_json_)); diff --git a/src/io/config.cpp b/src/io/config.cpp index 5d2faba6133..963ef084578 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -322,7 +322,7 @@ void Config::CheckParamConflict() { } } // force col-wise for gpu - if (device_type == std::string("gpu")) { // GCF maybe need to add some cuda here? + if (device_type == std::string("gpu")) { force_col_wise = true; force_row_wise = false; } diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index df8cefd5cea..6ec4bf8b359 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -63,7 +63,6 @@ CUDATreeLearner::~CUDATreeLearner() { void CUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) { - // initialize SerialTreeLearner SerialTreeLearner::Init(train_data, is_constant_hessian, is_use_subset); @@ -212,13 +211,9 @@ void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featu size_t output_size = num_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_; size_t host_output_offset = offset_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_; - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync((char*)host_histogram_outputs_ + host_output_offset, device_histogram_outputs_[device_id], output_size, cudaMemcpyDeviceToHost, stream_[device_id])); - - CUDASUCCESS_OR_FATAL(cudaEventRecord(histograms_wait_obj_[device_id], stream_[device_id])); } - } @@ -337,7 +332,6 @@ void CUDATreeLearner::prevAllocateGPUMemory() { // LGBM_CUDA: allocate GPU memory for each GPU void CUDATreeLearner::AllocateGPUMemory() { - #pragma omp parallel for schedule(static, num_gpu_) for(int device_id = 0; device_id < num_gpu_; ++device_id) { @@ -400,7 +394,7 @@ void CUDATreeLearner::AllocateGPUMemory() { // create atomic counters for inter-group coordination CUDASUCCESS_OR_FATAL(cudaFree(sync_counters_[device_id])); CUDASUCCESS_OR_FATAL(cudaMalloc(&(sync_counters_[device_id]), (size_t) num_gpu_feature_groups * sizeof(int))); - CUDASUCCESS_OR_FATAL(cudaMemsetAsync(sync_counters_[device_id], 0, num_gpu_feature_groups * sizeof(int))); + CUDASUCCESS_OR_FATAL(cudaMemsetAsync(sync_counters_[device_id], 0, num_gpu_feature_groups * sizeof(int), stream_[device_id])); // The output buffer is allocated to host directly, to overlap compute and data transfer CUDASUCCESS_OR_FATAL(cudaFree(device_histogram_outputs_[device_id])); @@ -452,7 +446,7 @@ void CUDATreeLearner::copyDenseFeature() { copied_feature = 0; if(device_id < num_gpu_) { device_features = device_features_[device_id]; - //CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); + CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); } } } @@ -726,18 +720,22 @@ void CUDATreeLearner::BeforeTrain() { if (!is_constant_hessian_) { - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_hessians_[device_id], hessians_, num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); +void *foo = malloc(num_data_ * sizeof(score_t)); +memcpy(foo, &(hessians_[0]), num_data_ * sizeof(score_t)); + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_hessians_[device_id], foo, num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); +free(foo); CUDASUCCESS_OR_FATAL(cudaEventRecord(hessians_future_[device_id], stream_[device_id])); } - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_gradients_[device_id], gradients_, num_data_ * 
sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); +void *foo = malloc(num_data_ * sizeof(score_t)); +memcpy(foo, &(gradients_[0]), num_data_ * sizeof(score_t)); + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_gradients_[device_id], foo, num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); +free(foo); CUDASUCCESS_OR_FATAL(cudaEventRecord(gradients_future_[device_id], stream_[device_id])); } - } } - } bool CUDATreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) { @@ -855,8 +853,7 @@ bool CUDATreeLearner::ConstructGPUHistogramsAsync( //#pragma omp parallel for schedule(static, num_gpu_) for(int device_id = 0; device_id < num_gpu_; ++device_id) { int offset = offset_gpu_feature_groups_[device_id]; - CUDASUCCESS_OR_FATAL(cudaMemcpy(device_feature_masks_[device_id], ptr_pinned_feature_masks_ + offset, num_gpu_feature_groups_[device_id] , cudaMemcpyHostToDevice)); - //CUDASUCCESS_OR_FATAL(cudaMemcpy(device_feature_masks_[device_id], ptr_pinned_feature_masks_ + offset, num_gpu_feature_groups_[device_id] , cudaMemcpyHostToDevice, stream_[device_id])); + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_feature_masks_[device_id], ptr_pinned_feature_masks_ + offset, num_gpu_feature_groups_[device_id] , cudaMemcpyHostToDevice, stream_[device_id])); } // All data have been prepared, now run the GPU kernel From 013631787a246e8177e21f3279ab0c88609c1f39 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Thu, 16 Apr 2020 19:40:26 +0000 Subject: [PATCH 020/119] Initial CUDA work --- src/boosting/gbdt.cpp | 18 +++++++++--------- src/treelearner/cuda_tree_learner.cpp | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 02f659e9d83..b2fb636ac4e 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -270,9 +270,9 @@ void GBDT::Bagging(int iter) { bool resized= tmp_subset_->ReSize(bag_data_cnt_); if (resized && (config_->device_type == std::string("cuda"))) { - size_t bag_gh_size = static_cast(bag_data_cnt_) * num_tree_per_iteration_; - tmp_gradients_.resize(bag_gh_size); - tmp_hessians_.resize(bag_gh_size); + size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; + tmp_gradients_.resize(total_size); + tmp_hessians_.resize(total_size); } tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(), bag_data_cnt_, false); @@ -426,9 +426,9 @@ bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { // LGBM_CUDA if (((tmp_gradients_.data() == 0) || (tmp_hessians_.data() == 0)) && (config_->device_type == std::string("cuda"))) { - size_t bag_gh_size = static_cast(bag_data_cnt_) * num_tree_per_iteration_; - tmp_gradients_.resize(bag_gh_size); - tmp_hessians_.resize(bag_gh_size); + size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; + tmp_gradients_.resize(total_size); + tmp_hessians_.resize(total_size); } auto tmp_grad = tmp_gradients_.data(); @@ -971,9 +971,9 @@ void GBDT::ResetBaggingConfig(const Config* config, bool is_change_dataset) { if (tmp_subset_ == nullptr){ tmp_subset_.reset(new Dataset(bag_data_cnt_)); tmp_subset_->CopyFeatureMapperFrom(train_data_); - size_t bag_gh_size = static_cast(bag_data_cnt_) * num_tree_per_iteration_; - tmp_gradients_.resize(bag_gh_size); - tmp_hessians_.resize(bag_gh_size); + size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; + tmp_gradients_.resize(total_size); + tmp_hessians_.resize(total_size); is_use_subset_ = false; bag_data_indices_.clear(); } diff --git 
a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 6ec4bf8b359..f2ca4717598 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -317,7 +317,7 @@ void CUDATreeLearner::prevAllocateGPUMemory() { memset(ptr_pinned_feature_masks_, 0, num_dense_feature_groups_); // histogram bin entry size depends on the precision (single/double) - hist_bin_entry_sz_ = config_->gpu_use_dp ? sizeof(hist_t) : sizeof(gpu_hist_t); + hist_bin_entry_sz_ = 2 * (config_->gpu_use_dp ? sizeof(hist_t) : sizeof(gpu_hist_t)); // two elements in this "size" // host_size histogram outputs // host_histogram_outputs_ = malloc(num_dense_feature_groups_ * device_bin_size_ * hist_bin_entry_sz_); From f4b10571bb539103e0d9711c918c053af0ec211f Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Mon, 20 Apr 2020 14:59:54 +0000 Subject: [PATCH 021/119] Initial CUDA work --- src/treelearner/cuda_tree_learner.cpp | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index f2ca4717598..e789462fe13 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -720,18 +720,12 @@ void CUDATreeLearner::BeforeTrain() { if (!is_constant_hessian_) { -void *foo = malloc(num_data_ * sizeof(score_t)); -memcpy(foo, &(hessians_[0]), num_data_ * sizeof(score_t)); - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_hessians_[device_id], foo, num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); -free(foo); + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_hessians_[device_id], (void *) &(hessians_[0]), num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); CUDASUCCESS_OR_FATAL(cudaEventRecord(hessians_future_[device_id], stream_[device_id])); } -void *foo = malloc(num_data_ * sizeof(score_t)); -memcpy(foo, &(gradients_[0]), num_data_ * sizeof(score_t)); - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_gradients_[device_id], foo, num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); -free(foo); + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_gradients_[device_id], (void *) &(gradients_[0]), num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); CUDASUCCESS_OR_FATAL(cudaEventRecord(gradients_future_[device_id], stream_[device_id])); } } From 82e496877456ca2a57d5592448d96e3952b32c2b Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Mon, 20 Apr 2020 17:27:15 +0000 Subject: [PATCH 022/119] Initial CUDA work --- CMakeLists.txt | 2 +- src/treelearner/cuda_kernel_launcher.h | 2 +- .../kernels/{histogram256.cu => histogram_16_64_256.cu} | 2 +- .../kernels/{histogram256.hu => histogram_16_64_256.hu} | 0 4 files changed, 3 insertions(+), 3 deletions(-) rename src/treelearner/kernels/{histogram256.cu => histogram_16_64_256.cu} (99%) rename src/treelearner/kernels/{histogram256.hu => histogram_16_64_256.hu} (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index dc87f65bd67..7bcd068f3ca 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -194,7 +194,7 @@ if(USE_CUDA) endfunction() #foreach (hsize 16 64 256) - foreach (hsize 256) + foreach (hsize _16_64_256) add_histogram("${hsize}" "_sp_const" "True" "1" "${BASE_DEFINES}") add_histogram("${hsize}" "_sp" "True" "0" "${BASE_DEFINES}") add_histogram("${hsize}" "-allfeats_sp_const" "False" "1" "${ALLFEATS_DEFINES}") diff --git a/src/treelearner/cuda_kernel_launcher.h b/src/treelearner/cuda_kernel_launcher.h index ae7d3498e83..f63cfa97a08 
100644 --- a/src/treelearner/cuda_kernel_launcher.h +++ b/src/treelearner/cuda_kernel_launcher.h @@ -3,7 +3,7 @@ #ifdef USE_CUDA // what should I include?? -#include "kernels/histogram256.hu" // kernel, acc_type, data_size_t, uchar, score_t +#include "kernels/histogram_16_64_256.hu" // kernel, acc_type, data_size_t, uchar, score_t #include struct ThreadData { diff --git a/src/treelearner/kernels/histogram256.cu b/src/treelearner/kernels/histogram_16_64_256.cu similarity index 99% rename from src/treelearner/kernels/histogram256.cu rename to src/treelearner/kernels/histogram_16_64_256.cu index 5d659f8e2cf..08195d855de 100644 --- a/src/treelearner/kernels/histogram256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -12,7 +12,7 @@ * disclosure restricted by GSA ADP Schedule Contract with IBM Corp. */ -#include "histogram256.hu" +#include "histogram_16_64_256.hu" #include "stdio.h" #define PRINT(b,t,fmt,...) \ diff --git a/src/treelearner/kernels/histogram256.hu b/src/treelearner/kernels/histogram_16_64_256.hu similarity index 100% rename from src/treelearner/kernels/histogram256.hu rename to src/treelearner/kernels/histogram_16_64_256.hu From 512a0a3600e81991a2b19db759bcb8964748100c Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Tue, 21 Apr 2020 14:06:54 +0000 Subject: [PATCH 023/119] Initial CUDA work --- src/treelearner/cuda_kernel_launcher.cu | 292 ++++---- src/treelearner/cuda_kernel_launcher.h | 2 + src/treelearner/cuda_tree_learner.cpp | 9 +- src/treelearner/cuda_tree_learner.h | 4 +- .../kernels/histogram_16_64_256.cu | 707 +++++++++++++++++- .../kernels/histogram_16_64_256.hu | 21 +- 6 files changed, 877 insertions(+), 158 deletions(-) diff --git a/src/treelearner/cuda_kernel_launcher.cu b/src/treelearner/cuda_kernel_launcher.cu index d084abe4f23..4906ca7e02d 100644 --- a/src/treelearner/cuda_kernel_launcher.cu +++ b/src/treelearner/cuda_kernel_launcher.cu @@ -1,13 +1,14 @@ -#ifdef USE_CUDA - -#include "cuda_kernel_launcher.h" -#include -#include -#include - -using namespace LightGBM; - -void cuda_histogram( + #ifdef USE_CUDA + + #include "cuda_kernel_launcher.h" + #include + #include + #include + + using namespace LightGBM; + + void cuda_histogram( + int histogram_size, data_size_t leaf_num_data, data_size_t num_data, bool use_all_features, @@ -26,141 +27,148 @@ void cuda_histogram( volatile int* arg8, void* arg9, size_t exp_workgroups_per_feature) { - - - if (leaf_num_data == num_data) { - - if (use_all_features){ - if (!is_constant_hessian) { - histogram256<<>>( - arg0, - arg1, - arg2, - reinterpret_cast(arg3), - arg4, - arg5, - static_cast(arg6), - arg7, - arg8, - static_cast(arg9), - exp_workgroups_per_feature - ); - } - else { - histogram256<<>>( - arg0, - arg1, - arg2, - reinterpret_cast(arg3), - arg4, - arg5, - arg6_const, - arg7, - arg8, - static_cast(arg9), - exp_workgroups_per_feature - ); - } + + if (histogram_size == 16) { + if (leaf_num_data == num_data) { + if (use_all_features) { + if (!is_constant_hessian) + histogram16<<>>( arg0, arg1, arg2, + reinterpret_cast(arg3), arg4, arg5, + static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + else + histogram16<<>>( arg0, arg1, arg2, + reinterpret_cast(arg3), arg4, arg5, + arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + } + else { + if (!is_constant_hessian) + histogram16_fulldata<<>>( arg0, arg1, arg2, + reinterpret_cast(arg3), arg4, arg5, + static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + else + histogram16_fulldata<<>>( 
arg0, arg1, arg2, + reinterpret_cast(arg3), arg4, arg5, + arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + } + } + else { + if (use_all_features) { + // seems all features is always enabled, so this should be the same as fulldata + if (!is_constant_hessian) + histogram16<<>>( arg0, arg1, arg2, + reinterpret_cast(arg3), arg4, arg5, + static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + else + histogram16<<>>( arg0, arg1, arg2, + reinterpret_cast(arg3), arg4, arg5, + arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + } + else { + if (!is_constant_hessian) + histogram16<<>>( arg0, arg1, arg2, + reinterpret_cast(arg3), arg4, arg5, + static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + else + histogram16<<>>( arg0, arg1, arg2, + reinterpret_cast(arg3), arg4, arg5, + arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + } + } } - else{ - if (!is_constant_hessian) { - histogram256_fulldata<<>>( - arg0, - arg1, - arg2, - reinterpret_cast(arg3), - arg4, - arg5, - static_cast(arg6), - arg7, - arg8, - static_cast(arg9), - exp_workgroups_per_feature); - } - else { - histogram256_fulldata<<>>( - arg0, - arg1, - arg2, - reinterpret_cast(arg3), - arg4, - arg5, - arg6_const, - arg7, - arg8, - static_cast(arg9), - exp_workgroups_per_feature); - } - } - } - else { - if (use_all_features) { - // seems all features is always enabled, so this should be the same as fulldata - if (!is_constant_hessian) { - - histogram256<<>>( - arg0, - arg1, - arg2, - reinterpret_cast(arg3), - arg4, - arg5, - static_cast(arg6), - arg7, - arg8, - static_cast(arg9), - exp_workgroups_per_feature - ); - } - else { - histogram256<<>>( - arg0, - arg1, - arg2, - reinterpret_cast(arg3), - arg4, - arg5, - arg6_const, - arg7, - arg8, - static_cast(arg9), - exp_workgroups_per_feature - ); - } + else if (histogram_size == 64) { + if (leaf_num_data == num_data) { + if (use_all_features) { + if (!is_constant_hessian) + histogram64<<>>( arg0, arg1, arg2, + reinterpret_cast(arg3), arg4, arg5, + static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + else + histogram64<<>>( arg0, arg1, arg2, + reinterpret_cast(arg3), arg4, arg5, + arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + } + else { + if (!is_constant_hessian) + histogram64_fulldata<<>>( arg0, arg1, arg2, + reinterpret_cast(arg3), arg4, arg5, + static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + else + histogram64_fulldata<<>>( arg0, arg1, arg2, + reinterpret_cast(arg3), arg4, arg5, + arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + } + } + else { + if (use_all_features) { + // seems all features is always enabled, so this should be the same as fulldata + if (!is_constant_hessian) + histogram64<<>>( arg0, arg1, arg2, + reinterpret_cast(arg3), arg4, arg5, + static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + else + histogram64<<>>( arg0, arg1, arg2, + reinterpret_cast(arg3), arg4, arg5, + arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + } + else { + if (!is_constant_hessian) + histogram64<<>>( arg0, arg1, arg2, + reinterpret_cast(arg3), arg4, arg5, + static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + else + histogram64<<>>( arg0, arg1, arg2, + reinterpret_cast(arg3), arg4, arg5, + arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + } + } } else { - if 
(!is_constant_hessian) { - histogram256<<>>( - arg0, - arg1, - arg2, - reinterpret_cast(arg3), - arg4, - arg5, - static_cast(arg6), - arg7, - arg8, - static_cast(arg9), - exp_workgroups_per_feature - ); - } - else { - histogram256<<>>( - arg0, - arg1, - arg2, - reinterpret_cast(arg3), - arg4, - arg5, - arg6_const, - arg7, - arg8, - static_cast(arg9), - exp_workgroups_per_feature - ); - } + if (leaf_num_data == num_data) { + if (use_all_features) { + if (!is_constant_hessian) + histogram256<<>>( arg0, arg1, arg2, + reinterpret_cast(arg3), arg4, arg5, + static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + else + histogram256<<>>( arg0, arg1, arg2, + reinterpret_cast(arg3), arg4, arg5, + arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + } + else { + if (!is_constant_hessian) + histogram256_fulldata<<>>( arg0, arg1, arg2, + reinterpret_cast(arg3), arg4, arg5, + static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + else + histogram256_fulldata<<>>( arg0, arg1, arg2, + reinterpret_cast(arg3), arg4, arg5, + arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + } + } + else { + if (use_all_features) { + // seems all features is always enabled, so this should be the same as fulldata + if (!is_constant_hessian) + histogram256<<>>( arg0, arg1, arg2, + reinterpret_cast(arg3), arg4, arg5, + static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + else + histogram256<<>>( arg0, arg1, arg2, + reinterpret_cast(arg3), arg4, arg5, + arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + } + else { + if (!is_constant_hessian) + histogram256<<>>( arg0, arg1, arg2, + reinterpret_cast(arg3), arg4, arg5, + static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + else + histogram256<<>>( arg0, arg1, arg2, + reinterpret_cast(arg3), arg4, arg5, + arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + } + } } - } } - + #endif // USE_CUDA diff --git a/src/treelearner/cuda_kernel_launcher.h b/src/treelearner/cuda_kernel_launcher.h index f63cfa97a08..1241a9cafb9 100644 --- a/src/treelearner/cuda_kernel_launcher.h +++ b/src/treelearner/cuda_kernel_launcher.h @@ -10,6 +10,7 @@ struct ThreadData { // device id int device_id; // parameters for cuda_histogram + int histogram_size; data_size_t leaf_num_data; data_size_t num_data; bool use_all_features; @@ -40,6 +41,7 @@ struct ThreadData { void cuda_histogram( + int histogram_size, data_size_t leaf_num_data, data_size_t num_data, bool use_all_features, diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index e789462fe13..0e630389490 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -24,7 +24,8 @@ static void *launch_cuda_histogram(void *thread_data) { CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); // launch cuda kernel - cuda_histogram(td.leaf_num_data, td.num_data, td.use_all_features, + cuda_histogram(td.histogram_size, + td.leaf_num_data, td.num_data, td.use_all_features, td.is_constant_hessian, td.num_workgroups, td.stream, td.device_features, td.device_feature_masks, @@ -183,7 +184,7 @@ void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featu cudaMalloc(&(device_subhistograms_[device_id]), (size_t) num_workgroups * dword_features_ * device_bin_size_ * hist_bin_entry_sz_); } //set thread_data - SetThreadData(thread_data, device_id, leaf_num_data, use_all_features, + 
SetThreadData(thread_data, device_id, histogram_size_, leaf_num_data, use_all_features, num_workgroups, exp_workgroups_per_feature); } @@ -479,17 +480,21 @@ void CUDATreeLearner::InitGPU(int num_gpu) { max_num_bin_ = std::max(max_num_bin_, train_data_->FeatureGroupNumBin(i)); } + // GCF XXX: resolving device_bin_size_ and histogram_size_ is the remaining work if (max_num_bin_ <= 16) { device_bin_size_ = 256; //LGBM_CUDA + histogram_size_ = 16; dword_features_ = 1; // LGBM_CUDA } else if (max_num_bin_ <= 64) { device_bin_size_ = 256; //LGBM_CUDA + histogram_size_ = 64; dword_features_ = 1; // LGBM_CUDA } else if ( max_num_bin_ <= 256) { Log::Debug("device_bin_size_ = 256"); device_bin_size_ = 256; + histogram_size_ = 256; dword_features_ = 1; // LGBM_CUDA } else { diff --git a/src/treelearner/cuda_tree_learner.h b/src/treelearner/cuda_tree_learner.h index 93bbdc483b7..a84d6b6662f 100644 --- a/src/treelearner/cuda_tree_learner.h +++ b/src/treelearner/cuda_tree_learner.h @@ -109,11 +109,12 @@ class CUDATreeLearner: public SerialTreeLearner { */ void GPUHistogram(data_size_t leaf_num_data, bool use_all_features); - void SetThreadData(ThreadData* thread_data, int device_id, + void SetThreadData(ThreadData* thread_data, int device_id, int histogram_size, int leaf_num_data, bool use_all_features, int num_workgroups, int exp_workgroups_per_feature) { ThreadData* td = &thread_data[device_id]; td->device_id = device_id; + td->histogram_size = histogram_size; td->leaf_num_data = leaf_num_data; td->num_data = num_data_; td->use_all_features = use_all_features; @@ -208,6 +209,7 @@ class CUDATreeLearner: public SerialTreeLearner { * which GPU kernel to use */ int max_num_bin_; /*! \brief Used GPU kernel bin size (64, 256) */ + int histogram_size_; int device_bin_size_; /*! 
\brief Size of histogram bin entry, depending if single or double precision is used */ size_t hist_bin_entry_sz_; diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index 08195d855de..33761a2c2a4 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -20,7 +20,703 @@ if (b == gtid && t == ltid) { \ printf(fmt, __VA_ARGS__); \ } +// atomic add for float number in local memory +inline __device__ void atomic_local_add_f(acc_type *addr, const float val) +{ + atomicAdd(addr, static_cast(val)); +} + +// histogram16 stuff +#ifdef ENABLE_ALL_FEATURES +#ifdef IGNORE_INDICES +#define KERNEL_NAME histogram16_fulldata +#else // IGNORE_INDICES +#define KERNEL_NAME histogram16 // seems like ENABLE_ALL_FEATURES is set to 1 in the header if its disabled +//#define KERNEL_NAME histogram16_allfeats +#endif // IGNORE_INDICES +#else // ENABLE_ALL_FEATURES +#error "ENABLE_ALL_FEATURES should always be 1" +#define KERNEL_NAME histogram16 +#endif // ENABLE_ALL_FEATURES + +// this function will be called by histogram16 +// we have one sub-histogram of one feature in local memory, and need to read others +inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ feature_sub_hist, + const uint skip_id, + const uint old_val_cont_bin0, + const ushort num_sub_hist, + acc_type* __restrict__ output_buf, + acc_type* __restrict__ local_hist, + const size_t power_feature_workgroups) { + const ushort ltid = threadIdx.x; + // TODO: try to avoid bank conflict here + acc_type grad_bin = local_hist[ltid * 2]; + acc_type hess_bin = local_hist[ltid * 2 + 1]; + uint* __restrict__ local_cnt = (uint *)(local_hist + 2 * NUM_BINS); + + uint cont_bin; + if (power_feature_workgroups != 0) { + cont_bin = ltid ? 
local_cnt[ltid] : old_val_cont_bin0; + } else { + cont_bin = local_cnt[ltid]; + } + ushort i; + + if (power_feature_workgroups != 0) { + // add all sub-histograms for feature + const acc_type* __restrict__ p = feature_sub_hist + ltid; + for (i = 0; i < skip_id; ++i) { + grad_bin += *p; p += NUM_BINS; + hess_bin += *p; p += NUM_BINS; + cont_bin += as_acc_int_type(*p); p += NUM_BINS; + } + + // skip the counters we already have + p += 3 * NUM_BINS; + + for (i = i + 1; i < num_sub_hist; ++i) { + grad_bin += *p; p += NUM_BINS; + hess_bin += *p; p += NUM_BINS; + cont_bin += as_acc_int_type(*p); p += NUM_BINS; + } + } + __syncthreads(); + + + output_buf[ltid * 3 + 0] = grad_bin; + output_buf[ltid * 3 + 1] = hess_bin; + output_buf[ltid * 3 + 2] = as_acc_type((acc_int_type)cont_bin); +} + +#if USE_CONSTANT_BUF == 1 +__kernel void KERNEL_NAME(__global const uchar* restrict feature_data_base, + __constant const uchar* restrict feature_masks __attribute__((max_constant_size(65536))), + const data_size_t feature_size, + __constant const data_size_t* restrict data_indices __attribute__((max_constant_size(65536))), + const data_size_t num_data, + __constant const score_t* restrict ordered_gradients __attribute__((max_constant_size(65536))), +#if CONST_HESSIAN == 0 + __constant const score_t* restrict ordered_hessians __attribute__((max_constant_size(65536))), +#else + const score_t const_hessian, +#endif + char* __restrict__ output_buf, + volatile int * sync_counters, + acc_type* __restrict__ hist_buf_base, + const size_t power_feature_workgroups) { +#else +__global__ void KERNEL_NAME(const uchar* feature_data_base, + // FIXME: how to handle this __constant + const uchar* __restrict__ feature_masks, + const data_size_t feature_size, + const data_size_t* data_indices, + const data_size_t num_data, + const score_t* ordered_gradients, +#if CONST_HESSIAN == 0 + const score_t* ordered_hessians, +#else + const score_t const_hessian, +#endif + char* __restrict__ output_buf, + volatile int * sync_counters, + acc_type* __restrict__ hist_buf_base, + const size_t power_feature_workgroups) { +#endif + // allocate the local memory array aligned with float2, to guarantee correct alignment on NVIDIA platforms + // otherwise a "Misaligned Address" exception may occur + __shared__ float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)]; + const uint gtid = blockIdx.x * blockDim.x + threadIdx.x; + const ushort ltid = threadIdx.x; + const ushort lsize = NUM_BINS; // get_local_size(0); + const ushort group_id = blockIdx.x; + + // local memory per workgroup is 3 KB + // clear local memory + uint *ptr = (uint *) shared_array; + for (int i = ltid; i < LOCAL_MEM_SIZE/sizeof(uint); i += lsize) { + ptr[i] = 0; + } + __syncthreads(); + // gradient/hessian histograms + // assume this starts at 32 * 4 = 128-byte boundary // LGBM_CUDA: What does it mean? boundary?? + // total size: 2 * 256 * size_of(float) = 2 KB + // organization: each feature/grad/hessian is at a different bank, + // as indepedent of the feature value as possible + acc_type *gh_hist = (acc_type *)shared_array; + + // counter histogram + // total size: 256 * size_of(uint) = 1 KB + uint *cnt_hist = (uint *)(gh_hist + 2 * NUM_BINS); + + // odd threads (1, 3, ...) compute histograms for hessians first + // even thread (0, 2, ...) compute histograms for gradients first + // etc. 
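+  // A rough sketch of the interleaving driven by is_hessian_first = ltid & 1 below:
+  //   ltid 0: gradient slot first, then hessian slot, ...
+  //   ltid 1: hessian slot first, then gradient slot, ...
+  // so adjacent threads touch different shared-memory words on each flush, which
+  // seems intended to spread the atomic_local_add_f traffic across banks.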
+ uchar is_hessian_first = ltid & 1; + + ushort feature_id = group_id >> power_feature_workgroups; + + // each 2^POWER_FEATURE_WORKGROUPS workgroups process on one feature (compile-time constant) + // feature_size is the number of examples per feature + const uchar *feature_data = feature_data_base + feature_id * feature_size; + + // size of threads that process this feature4 + const uint subglobal_size = lsize * (1 << power_feature_workgroups); + + // equavalent thread ID in this subgroup for this feature4 + const uint subglobal_tid = gtid - feature_id * subglobal_size; + + + data_size_t ind; + data_size_t ind_next; + #ifdef IGNORE_INDICES + ind = subglobal_tid; + #else + ind = data_indices[subglobal_tid]; + #endif + + // extract feature mask, when a byte is set to 0, that feature is disabled + uchar feature_mask = feature_masks[feature_id]; + // exit if the feature is masked + if (!feature_mask) { + return; + } else { + feature_mask = feature_mask - 1; // LGBM_CUDA: feature_mask is used for get feature (1: 4bit feature, 0: 8bit feature) + } + + // STAGE 1: read feature data, and gradient and hessian + // first half of the threads read feature data from global memory + // We will prefetch data into the "next" variable at the beginning of each iteration + uchar feature; + uchar feature_next; + //uint8_t bin; + ushort bin; + + feature = feature_data[ind >> feature_mask]; + if (feature_mask) { + feature = (feature >> ((ind & 1) << 2)) & 0xf; + } + bin = feature; + acc_type grad_bin = 0.0f, hess_bin = 0.0f; + acc_type *addr_bin; + + // store gradient and hessian + score_t grad, hess; + score_t grad_next, hess_next; + // LGBM_CUDA v5.1 + grad = ordered_gradients[ind]; + #if CONST_HESSIAN == 0 + hess = ordered_hessians[ind]; + #endif + + + // there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4 + for (uint i = subglobal_tid; i < num_data; i += subglobal_size) { + // prefetch the next iteration variables + // we don't need bondary check because we have made the buffer large + #ifdef IGNORE_INDICES + // we need to check to bounds here + ind_next = i + subglobal_size < num_data ? i + subglobal_size : i; + #else + ind_next = data_indices[i + subglobal_size]; + #endif + + // imbGBT v5.1 + grad_next = ordered_gradients[ind_next]; + #if CONST_HESSIAN == 0 + hess_next = ordered_hessians[ind_next]; + #endif + + // STAGE 2: accumulate gradient and hessian + if (bin != feature) { + addr_bin = gh_hist + bin * 2 + is_hessian_first; + #if CONST_HESSIAN == 0 + acc_type acc_bin = is_hessian_first? hess_bin : grad_bin; + atomic_local_add_f(addr_bin, acc_bin); + + addr_bin = addr_bin + 1 - 2 * is_hessian_first; + acc_bin = is_hessian_first? grad_bin : hess_bin; + atomic_local_add_f(addr_bin, acc_bin); + + #elif CONST_HESSIAN == 1 + atomic_local_add_f(addr_bin, grad_bin); + #endif + + bin = feature; + grad_bin = grad; + hess_bin = hess; + } + else { + grad_bin += grad; + hess_bin += hess; + } + + // prefetch the next iteration variables + feature_next = feature_data[ind_next >> feature_mask]; + + // STAGE 3: accumulate counter + atomicAdd(cnt_hist + feature, 1); + + // STAGE 4: update next stat + grad = grad_next; + hess = hess_next; + // LGBM_CUDA: v4.2 + if (!feature_mask) { + feature = feature_next; + } else { + feature = (feature_next >> ((ind_next & 1) << 2)) & 0xf; + } + } + + + addr_bin = gh_hist + bin * 2 + is_hessian_first; + #if CONST_HESSIAN == 0 + acc_type acc_bin = is_hessian_first? 
hess_bin : grad_bin; + atomic_local_add_f(addr_bin, acc_bin); + + addr_bin = addr_bin + 1 - 2 * is_hessian_first; + acc_bin = is_hessian_first? grad_bin : hess_bin; + atomic_local_add_f(addr_bin, acc_bin); + + #elif CONST_HESSIAN == 1 + atomic_local_add_f(addr_bin, grad_bin); + #endif + __syncthreads(); + + #if CONST_HESSIAN == 1 + // make a final reduction + gh_hist[ltid * 2] += gh_hist[ltid * 2 + 1]; + gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // LGBM_CUDA: counter move to this position + __syncthreads(); + #endif + +#if POWER_FEATURE_WORKGROUPS != 0 + acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 3 * NUM_BINS; + // write gradients and hessians + acc_type *__restrict__ ptr_f = output; + for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { + // even threads read gradients, odd threads read hessians + // FIXME: 2-way bank conflict + acc_type value = gh_hist[i]; + ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; + } + // write counts + acc_int_type *__restrict__ ptr_i = (acc_int_type *)(output + 2 * NUM_BINS); + for (ushort i = ltid; i < NUM_BINS; i += lsize) { + // FIXME: 2-way bank conflict + uint value = cnt_hist[i]; + ptr_i[i] = value; + } + // FIXME: is this right + __syncthreads(); + __threadfence(); + // To avoid the cost of an extra reducting kernel, we have to deal with some + // gray area in OpenCL. We want the last work group that process this feature to + // make the final reduction, and other threads will just quit. + // This requires that the results written by other workgroups available to the + // last workgroup (memory consistency) + #if NVIDIA == 1 + // this is equavalent to CUDA __threadfence(); + // ensure the writes above goes to main memory and other workgroups can see it + asm volatile("{\n\tmembar.gl;\n\t}\n\t" :::"memory"); + #else + // FIXME: how to do the above on AMD GPUs?? + // GCN ISA says that the all writes will bypass L1 cache (write through), + // however when the last thread is reading sub-histogram data we have to + // make sure that no part of data is modified in local L1 cache of other workgroups. + // Otherwise reading can be a problem (atomic operations to get consistency). + // But in our case, the sub-histogram of this workgroup cannot be in the cache + // of another workgroup, so the following trick will work just fine. + #endif + // Now, we want one workgroup to do the final reduction. + // Other workgroups processing the same feature quit. + // The is done by using an global atomic counter. + // On AMD GPUs ideally this should be done in GDS, + // but currently there is no easy way to access it via OpenCL. + uint * counter_val = cnt_hist; + // backup the old value + uint old_val = *counter_val; + if (ltid == 0) { + // all workgroups processing the same feature add this counter + *counter_val = atomicAdd(const_cast(sync_counters + feature_id), 1); + } + // make sure everyone in this workgroup is here + __syncthreads(); + // everyone in this wrokgroup: if we are the last workgroup, then do reduction! 
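+  // In other words (sketch of the protocol used here): each of the
+  // 2^power_feature_workgroups workgroups for this feature atomically increments
+  // sync_counters[feature_id] once; atomicAdd returns the old value, so only the
+  // last workgroup to arrive reads back (1 << power_feature_workgroups) - 1. That
+  // workgroup resets the counter and performs within_kernel_reduction16x4 below,
+  // while every other workgroup skips the block and exits.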
+ if (*counter_val == (1 << power_feature_workgroups) - 1) { + if (ltid == 0) { + sync_counters[feature_id] = 0; + } + //} + #else + } + // only 1 work group, no need to increase counter + // the reduction will become a simple copy + if (1) { + uint old_val; // dummy + #endif + // locate our feature's block in output memory + uint output_offset = (feature_id << power_feature_workgroups); + acc_type const * __restrict__ feature_subhists = + (acc_type *)output_buf + output_offset * 3 * NUM_BINS; + // skip reading the data already in local memory + //uint skip_id = feature_id ^ output_offset; + uint skip_id = group_id - output_offset; + // locate output histogram location for this feature4 + acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 3 * NUM_BINS; + + + within_kernel_reduction16x4(feature_subhists, skip_id, old_val, 1 << power_feature_workgroups, hist_buf, (acc_type *)shared_array, power_feature_workgroups); + } +} + +// end of histogram16 stuff + +// histogram64 stuff +#ifdef ENABLE_ALL_FEATURES +#ifdef IGNORE_INDICES +#define KERNEL_NAME histogram64_fulldata +#else // IGNORE_INDICES +#define KERNEL_NAME histogram64 // seems like ENABLE_ALL_FEATURES is set to 1 in the header if its disabled +//#define KERNEL_NAME histogram64_allfeats +#endif // IGNORE_INDICES +#else // ENABLE_ALL_FEATURES +#error "ENABLE_ALL_FEATURES should always be 1" +#define KERNEL_NAME histogram64 +#endif // ENABLE_ALL_FEATURES + +// this function will be called by histogram64 +// we have one sub-histogram of one feature in local memory, and need to read others +inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ feature_sub_hist, + const uint skip_id, + const uint old_val_cont_bin0, + const ushort num_sub_hist, + acc_type* __restrict__ output_buf, + acc_type* __restrict__ local_hist, + const size_t power_feature_workgroups) { + const ushort ltid = threadIdx.x; + // TODO: try to avoid bank conflict here + acc_type grad_bin = local_hist[ltid * 2]; + acc_type hess_bin = local_hist[ltid * 2 + 1]; + uint* __restrict__ local_cnt = (uint *)(local_hist + 2 * NUM_BINS); + + uint cont_bin; + if (power_feature_workgroups != 0) { + cont_bin = ltid ? 
local_cnt[ltid] : old_val_cont_bin0; + } else { + cont_bin = local_cnt[ltid]; + } + ushort i; + + if (power_feature_workgroups != 0) { + // add all sub-histograms for feature + const acc_type* __restrict__ p = feature_sub_hist + ltid; + for (i = 0; i < skip_id; ++i) { + grad_bin += *p; p += NUM_BINS; + hess_bin += *p; p += NUM_BINS; + cont_bin += as_acc_int_type(*p); p += NUM_BINS; + } + + // skip the counters we already have + p += 3 * NUM_BINS; + + for (i = i + 1; i < num_sub_hist; ++i) { + grad_bin += *p; p += NUM_BINS; + hess_bin += *p; p += NUM_BINS; + cont_bin += as_acc_int_type(*p); p += NUM_BINS; + } + } + __syncthreads(); + + + output_buf[ltid * 3 + 0] = grad_bin; + output_buf[ltid * 3 + 1] = hess_bin; + output_buf[ltid * 3 + 2] = as_acc_type((acc_int_type)cont_bin); +} + +#if USE_CONSTANT_BUF == 1 +__kernel void KERNEL_NAME(__global const uchar* restrict feature_data_base, + __constant const uchar* restrict feature_masks __attribute__((max_constant_size(65536))), + const data_size_t feature_size, + __constant const data_size_t* restrict data_indices __attribute__((max_constant_size(65536))), + const data_size_t num_data, + __constant const score_t* restrict ordered_gradients __attribute__((max_constant_size(65536))), +#if CONST_HESSIAN == 0 + __constant const score_t* restrict ordered_hessians __attribute__((max_constant_size(65536))), +#else + const score_t const_hessian, +#endif + char* __restrict__ output_buf, + volatile int * sync_counters, + acc_type* __restrict__ hist_buf_base, + const size_t power_feature_workgroups) { +#else +__global__ void KERNEL_NAME(const uchar* feature_data_base, + // FIXME: how to handle this __constant + const uchar* __restrict__ feature_masks, + const data_size_t feature_size, + const data_size_t* data_indices, + const data_size_t num_data, + const score_t* ordered_gradients, +#if CONST_HESSIAN == 0 + const score_t* ordered_hessians, +#else + const score_t const_hessian, +#endif + char* __restrict__ output_buf, + volatile int * sync_counters, + acc_type* __restrict__ hist_buf_base, + const size_t power_feature_workgroups) { +#endif + // allocate the local memory array aligned with float2, to guarantee correct alignment on NVIDIA platforms + // otherwise a "Misaligned Address" exception may occur + __shared__ float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)]; + const uint gtid = blockIdx.x * blockDim.x + threadIdx.x; + const ushort ltid = threadIdx.x; + const ushort lsize = NUM_BINS; // get_local_size(0); + const ushort group_id = blockIdx.x; + + // local memory per workgroup is 3 KB + // clear local memory + uint *ptr = (uint *) shared_array; + for (int i = ltid; i < LOCAL_MEM_SIZE/sizeof(uint); i += lsize) { + ptr[i] = 0; + } + __syncthreads(); + // gradient/hessian histograms + // assume this starts at 32 * 4 = 128-byte boundary // LGBM_CUDA: What does it mean? boundary?? + // total size: 2 * 256 * size_of(float) = 2 KB + // organization: each feature/grad/hessian is at a different bank, + // as indepedent of the feature value as possible + acc_type *gh_hist = (acc_type *)shared_array; + + // counter histogram + // total size: 256 * size_of(uint) = 1 KB + uint *cnt_hist = (uint *)(gh_hist + 2 * NUM_BINS); + + // odd threads (1, 3, ...) compute histograms for hessians first + // even thread (0, 2, ...) compute histograms for gradients first + // etc. 
+ uchar is_hessian_first = ltid & 1; + + ushort feature_id = group_id >> power_feature_workgroups; + + // each 2^POWER_FEATURE_WORKGROUPS workgroups process on one feature (compile-time constant) + // feature_size is the number of examples per feature + const uchar *feature_data = feature_data_base + feature_id * feature_size; + + // size of threads that process this feature4 + const uint subglobal_size = lsize * (1 << power_feature_workgroups); + + // equavalent thread ID in this subgroup for this feature4 + const uint subglobal_tid = gtid - feature_id * subglobal_size; + + + data_size_t ind; + data_size_t ind_next; + #ifdef IGNORE_INDICES + ind = subglobal_tid; + #else + ind = data_indices[subglobal_tid]; + #endif + + // extract feature mask, when a byte is set to 0, that feature is disabled + uchar feature_mask = feature_masks[feature_id]; + // exit if the feature is masked + if (!feature_mask) { + return; + } else { + feature_mask = feature_mask - 1; // LGBM_CUDA: feature_mask is used for get feature (1: 4bit feature, 0: 8bit feature) + } + + // STAGE 1: read feature data, and gradient and hessian + // first half of the threads read feature data from global memory + // We will prefetch data into the "next" variable at the beginning of each iteration + uchar feature; + uchar feature_next; + //uint8_t bin; + ushort bin; + + feature = feature_data[ind >> feature_mask]; + if (feature_mask) { + feature = (feature >> ((ind & 1) << 2)) & 0xf; + } + bin = feature; + acc_type grad_bin = 0.0f, hess_bin = 0.0f; + acc_type *addr_bin; + + // store gradient and hessian + score_t grad, hess; + score_t grad_next, hess_next; + // LGBM_CUDA v5.1 + grad = ordered_gradients[ind]; + #if CONST_HESSIAN == 0 + hess = ordered_hessians[ind]; + #endif + + + // there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4 + for (uint i = subglobal_tid; i < num_data; i += subglobal_size) { + // prefetch the next iteration variables + // we don't need bondary check because we have made the buffer large + #ifdef IGNORE_INDICES + // we need to check to bounds here + ind_next = i + subglobal_size < num_data ? i + subglobal_size : i; + #else + ind_next = data_indices[i + subglobal_size]; + #endif + + // imbGBT v5.1 + grad_next = ordered_gradients[ind_next]; + #if CONST_HESSIAN == 0 + hess_next = ordered_hessians[ind_next]; + #endif + + // STAGE 2: accumulate gradient and hessian + if (bin != feature) { + addr_bin = gh_hist + bin * 2 + is_hessian_first; + #if CONST_HESSIAN == 0 + acc_type acc_bin = is_hessian_first? hess_bin : grad_bin; + atomic_local_add_f(addr_bin, acc_bin); + + addr_bin = addr_bin + 1 - 2 * is_hessian_first; + acc_bin = is_hessian_first? grad_bin : hess_bin; + atomic_local_add_f(addr_bin, acc_bin); + + #elif CONST_HESSIAN == 1 + atomic_local_add_f(addr_bin, grad_bin); + #endif + + bin = feature; + grad_bin = grad; + hess_bin = hess; + } + else { + grad_bin += grad; + hess_bin += hess; + } + + // prefetch the next iteration variables + feature_next = feature_data[ind_next >> feature_mask]; + + // STAGE 3: accumulate counter + atomicAdd(cnt_hist + feature, 1); + + // STAGE 4: update next stat + grad = grad_next; + hess = hess_next; + // LGBM_CUDA: v4.2 + if (!feature_mask) { + feature = feature_next; + } else { + feature = (feature_next >> ((ind_next & 1) << 2)) & 0xf; + } + } + + + addr_bin = gh_hist + bin * 2 + is_hessian_first; + #if CONST_HESSIAN == 0 + acc_type acc_bin = is_hessian_first? 
hess_bin : grad_bin; + atomic_local_add_f(addr_bin, acc_bin); + + addr_bin = addr_bin + 1 - 2 * is_hessian_first; + acc_bin = is_hessian_first? grad_bin : hess_bin; + atomic_local_add_f(addr_bin, acc_bin); + + #elif CONST_HESSIAN == 1 + atomic_local_add_f(addr_bin, grad_bin); + #endif + __syncthreads(); + + #if CONST_HESSIAN == 1 + // make a final reduction + gh_hist[ltid * 2] += gh_hist[ltid * 2 + 1]; + gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // LGBM_CUDA: counter move to this position + __syncthreads(); + #endif + +#if POWER_FEATURE_WORKGROUPS != 0 + acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 3 * NUM_BINS; + // write gradients and hessians + acc_type *__restrict__ ptr_f = output; + for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { + // even threads read gradients, odd threads read hessians + // FIXME: 2-way bank conflict + acc_type value = gh_hist[i]; + ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; + } + // write counts + acc_int_type *__restrict__ ptr_i = (acc_int_type *)(output + 2 * NUM_BINS); + for (ushort i = ltid; i < NUM_BINS; i += lsize) { + // FIXME: 2-way bank conflict + uint value = cnt_hist[i]; + ptr_i[i] = value; + } + // FIXME: is this right + __syncthreads(); + __threadfence(); + // To avoid the cost of an extra reducting kernel, we have to deal with some + // gray area in OpenCL. We want the last work group that process this feature to + // make the final reduction, and other threads will just quit. + // This requires that the results written by other workgroups available to the + // last workgroup (memory consistency) + #if NVIDIA == 1 + // this is equavalent to CUDA __threadfence(); + // ensure the writes above goes to main memory and other workgroups can see it + asm volatile("{\n\tmembar.gl;\n\t}\n\t" :::"memory"); + #else + // FIXME: how to do the above on AMD GPUs?? + // GCN ISA says that the all writes will bypass L1 cache (write through), + // however when the last thread is reading sub-histogram data we have to + // make sure that no part of data is modified in local L1 cache of other workgroups. + // Otherwise reading can be a problem (atomic operations to get consistency). + // But in our case, the sub-histogram of this workgroup cannot be in the cache + // of another workgroup, so the following trick will work just fine. + #endif + // Now, we want one workgroup to do the final reduction. + // Other workgroups processing the same feature quit. + // The is done by using an global atomic counter. + // On AMD GPUs ideally this should be done in GDS, + // but currently there is no easy way to access it via OpenCL. + uint * counter_val = cnt_hist; + // backup the old value + uint old_val = *counter_val; + if (ltid == 0) { + // all workgroups processing the same feature add this counter + *counter_val = atomicAdd(const_cast(sync_counters + feature_id), 1); + } + // make sure everyone in this workgroup is here + __syncthreads(); + // everyone in this wrokgroup: if we are the last workgroup, then do reduction! 
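+  // Note on old_val (sketch, same pattern as the 16-bin kernel above): counter_val
+  // aliases cnt_hist[0], so the atomicAdd result written by ltid 0 clobbers the bin-0
+  // counter. The original cnt_hist[0] is therefore snapshotted into old_val first and
+  // passed to within_kernel_reduction64x4 as old_val_cont_bin0, so bin 0 still sees
+  // its true count during the final reduction.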
+ if (*counter_val == (1 << power_feature_workgroups) - 1) { + if (ltid == 0) { + sync_counters[feature_id] = 0; + } + //} + #else + } + // only 1 work group, no need to increase counter + // the reduction will become a simple copy + if (1) { + uint old_val; // dummy + #endif + // locate our feature's block in output memory + uint output_offset = (feature_id << power_feature_workgroups); + acc_type const * __restrict__ feature_subhists = + (acc_type *)output_buf + output_offset * 3 * NUM_BINS; + // skip reading the data already in local memory + //uint skip_id = feature_id ^ output_offset; + uint skip_id = group_id - output_offset; + // locate output histogram location for this feature4 + acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 3 * NUM_BINS; + + + within_kernel_reduction64x4(feature_subhists, skip_id, old_val, 1 << power_feature_workgroups, hist_buf, (acc_type *)shared_array, power_feature_workgroups); + } +} + +// end of histogram64 stuff +// histogram256 stuff #ifdef ENABLE_ALL_FEATURES #ifdef IGNORE_INDICES #define KERNEL_NAME histogram256_fulldata @@ -33,13 +729,6 @@ if (b == gtid && t == ltid) { \ #define KERNEL_NAME histogram256 #endif // ENABLE_ALL_FEATURES - -// atomic add for float number in local memory -inline __device__ void atomic_local_add_f(acc_type *addr, const float val) -{ - atomicAdd(addr, static_cast(val)); -} - // this function will be called by histogram256 // we have one sub-histogram of one feature in local memory, and need to read others inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ feature_sub_hist, @@ -128,7 +817,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, __shared__ float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)]; const uint gtid = blockIdx.x * blockDim.x + threadIdx.x; const ushort ltid = threadIdx.x; - const ushort lsize = LOCAL_SIZE_0; // get_local_size(0); + const ushort lsize = NUM_BINS; // get_local_size(0); const ushort group_id = blockIdx.x; // local memory per workgroup is 3 KB @@ -370,3 +1059,5 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, } } +// end of histogram256 stuff + diff --git a/src/treelearner/kernels/histogram_16_64_256.hu b/src/treelearner/kernels/histogram_16_64_256.hu index 145a85367f0..eff3e51c460 100644 --- a/src/treelearner/kernels/histogram_16_64_256.hu +++ b/src/treelearner/kernels/histogram_16_64_256.hu @@ -12,8 +12,8 @@ * disclosure restricted by GSA ADP Schedule Contract with IBM Corp. */ -#ifndef _HISTOGRAM_256_KERNEL_ -#define _HISTOGRAM_256_KERNEL_ +#ifndef _HISTOGRAM_16_64_256_KERNEL_ +#define _HISTOGRAM_16_64_256_KERNEL_ //#pragma once @@ -65,8 +65,6 @@ __device__ uchar4 as_uchar4(const T t) { return u; } - -#define LOCAL_SIZE_0 256 #define NUM_BINS 256 #if USE_DP_FLOAT == 1 typedef double acc_type; @@ -79,7 +77,6 @@ typedef uint acc_int_type; #define as_acc_type as_float #define as_acc_int_type as_uint #endif -//#define LOCAL_MEM_SIZE (4 * (sizeof(uint) + 2 * sizeof(acc_type)) * NUM_BINS) #define LOCAL_MEM_SIZE ((sizeof(uint) + 2 * sizeof(acc_type)) * NUM_BINS) // unroll the atomic operation for a few times. 
Takes more code space, @@ -169,6 +166,20 @@ __global__ void name(const uchar* feature_data_base, \ const size_t power_feature_workgroups); +DECLARE_CONST_HES(histogram16_allfeats); +DECLARE_CONST_HES(histogram16_fulldata); +DECLARE_CONST_HES(histogram16); +DECLARE(histogram16_allfeats); +DECLARE(histogram16_fulldata); +DECLARE(histogram16); + +DECLARE_CONST_HES(histogram64_allfeats); +DECLARE_CONST_HES(histogram64_fulldata); +DECLARE_CONST_HES(histogram64); +DECLARE(histogram64_allfeats); +DECLARE(histogram64_fulldata); +DECLARE(histogram64); + DECLARE_CONST_HES(histogram256_allfeats); DECLARE_CONST_HES(histogram256_fulldata); DECLARE_CONST_HES(histogram256); From 29f6979b40aee2e7700a38eef9de1a7c13679d27 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Tue, 21 Apr 2020 21:35:43 +0000 Subject: [PATCH 024/119] Initial CUDA work --- src/treelearner/kernels/histogram_16_64_256.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index 33761a2c2a4..020ca5453af 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -372,6 +372,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // end of histogram16 stuff // histogram64 stuff +#undef KERNEL_NAME #ifdef ENABLE_ALL_FEATURES #ifdef IGNORE_INDICES #define KERNEL_NAME histogram64_fulldata @@ -717,6 +718,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // end of histogram64 stuff // histogram256 stuff +#undef KERNEL_NAME #ifdef ENABLE_ALL_FEATURES #ifdef IGNORE_INDICES #define KERNEL_NAME histogram256_fulldata From 4d89fd7534023b4fd25377409709473f949ff39c Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Wed, 22 Apr 2020 18:00:07 +0000 Subject: [PATCH 025/119] Initial CUDA work --- tests/python_package_test/test_consistency.py | 4 ++++ tests/python_package_test/test_plotting.py | 2 ++ tests/python_package_test/test_sklearn.py | 6 ++++++ 3 files changed, 12 insertions(+) diff --git a/tests/python_package_test/test_consistency.py b/tests/python_package_test/test_consistency.py index 63a5834cf61..f6e955ee48d 100644 --- a/tests/python_package_test/test_consistency.py +++ b/tests/python_package_test/test_consistency.py @@ -68,6 +68,8 @@ class TestEngine(unittest.TestCase): def test_binary(self): fd = FileLoader('../../examples/binary_classification', 'binary') + if lgb.get_device_type() == 2: + fd.params["device"] = "cuda" X_train, y_train, _ = fd.load_dataset('.train') X_test, _, X_test_fn = fd.load_dataset('.test') weight_train = fd.load_field('.train.weight') @@ -91,6 +93,8 @@ def test_multiclass(self): def test_regression(self): fd = FileLoader('../../examples/regression', 'regression') + if lgb.get_device_type() == 2: + fd.params["device"] = "cuda" X_train, y_train, _ = fd.load_dataset('.train') X_test, _, X_test_fn = fd.load_dataset('.test') init_score_train = fd.load_field('.train.init') diff --git a/tests/python_package_test/test_plotting.py b/tests/python_package_test/test_plotting.py index 72915914fe1..13ba9859d97 100644 --- a/tests/python_package_test/test_plotting.py +++ b/tests/python_package_test/test_plotting.py @@ -24,6 +24,8 @@ def setUp(self): "verbose": -1, "num_leaves": 3 } + if lgb.get_device_type() == 2: + self.params["device"] = "cuda" @unittest.skipIf(not MATPLOTLIB_INSTALLED, 'matplotlib is not installed') def test_plot_importance(self): diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index 
cd50805a70b..350f3c8f486 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -453,6 +453,8 @@ def test_evaluate_train_set(self): def test_metrics(self): X, y = load_boston(True) params = {'n_estimators': 2, 'verbose': -1} + if lgb.get_device_type() == 2: + params["device"] = "cuda" params_fit = {'X': X, 'y': y, 'eval_set': (X, y), 'verbose': False} # no custom objective, no custom metric @@ -709,6 +711,8 @@ def test_inf_handle(self): y = np.random.randn(nrows) + np.full(nrows, 1e30) weight = np.full(nrows, 1e10) params = {'n_estimators': 20, 'verbose': -1} + if lgb.get_device_type() == 2: + params["device"] = "cuda" params_fit = {'X': X, 'y': y, 'sample_weight': weight, 'eval_set': (X, y), 'verbose': False, 'early_stopping_rounds': 5} gbm = lgb.LGBMRegressor(**params).fit(**params_fit) @@ -721,6 +725,8 @@ def test_nan_handle(self): y = np.random.randn(nrows) + np.full(nrows, 1e30) weight = np.zeros(nrows) params = {'n_estimators': 20, 'verbose': -1} + if lgb.get_device_type() == 2: + params["device"] = "cuda" params_fit = {'X': X, 'y': y, 'sample_weight': weight, 'eval_set': (X, y), 'verbose': False, 'early_stopping_rounds': 5} gbm = lgb.LGBMRegressor(**params).fit(**params_fit) From e10a46784136099c2400f8957a3b95adf2984ebc Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Wed, 22 Apr 2020 18:03:14 +0000 Subject: [PATCH 026/119] Initial CUDA work --- include/LightGBM/cuda/vector_cudahost.h | 1 + src/boosting/gbdt.cpp | 8 ++++---- src/treelearner/cuda_tree_learner.cpp | 4 ++-- src/treelearner/serial_tree_learner.cpp | 2 ++ 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/include/LightGBM/cuda/vector_cudahost.h b/include/LightGBM/cuda/vector_cudahost.h index d95a353c246..61d6e464970 100644 --- a/include/LightGBM/cuda/vector_cudahost.h +++ b/include/LightGBM/cuda/vector_cudahost.h @@ -58,6 +58,7 @@ fprintf(stderr, " TROUBLE: defaulting to malloc in CHAllocator!!!\n"); fflush( void deallocate(T* p, std::size_t n) { + (void)n; // UNUSED if (p==NULL) return; #ifdef USE_CUDA if (LightGBM::LGBM_config_::current_device == lgbm_device_cuda){ diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index b2fb636ac4e..55e11312235 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -385,7 +385,7 @@ bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { // LGBM_CUDA invoke baggging during the first iteration if ((config_->device_type == std::string("cuda")) && (iter_ == 0)) { - auto start_time = std::chrono::steady_clock::now(); +// auto start_time = std::chrono::steady_clock::now(); Bagging(0); } @@ -399,7 +399,7 @@ bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { } // LGBM_CUDA - auto start_time = std::chrono::steady_clock::now(); +// auto start_time = std::chrono::steady_clock::now(); Boosting(); @@ -414,7 +414,7 @@ bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { // LGBM_CUDA - auto start_time = std::chrono::steady_clock::now(); +// auto start_time = std::chrono::steady_clock::now(); const size_t offset = static_cast(cur_tree_id) * num_data_; std::unique_ptr new_tree(new Tree(2)); @@ -484,7 +484,7 @@ bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { int iter_next = iter_ + 1; if (iter_next < config_->num_iterations) { - auto start_time = std::chrono::steady_clock::now(); +// auto start_time = 
std::chrono::steady_clock::now(); // bagging logic Bagging(iter_next); diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 0e630389490..aa061bc289f 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -225,7 +225,7 @@ void CUDATreeLearner::WaitAndGetHistograms(hist_t* histograms) { //#pragma omp parallel for schedule(static, num_gpu_) for(int device_id = 0; device_id < num_gpu_; ++device_id) { - auto start_time = std::chrono::steady_clock::now(); +// auto start_time = std::chrono::steady_clock::now(); // when the output is ready, the computation is done CUDASUCCESS_OR_FATAL(cudaEventSynchronize(histograms_wait_obj_[device_id])); @@ -865,7 +865,7 @@ bool CUDATreeLearner::ConstructGPUHistogramsAsync( void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_used, bool use_subtract) { // LGBM_CUDA - auto start_time = std::chrono::steady_clock::now(); +// auto start_time = std::chrono::steady_clock::now(); std::vector is_sparse_feature_used(num_features_, 0); std::vector is_dense_feature_used(num_features_, 0); diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 5d2b9afff50..3c9faa84ff5 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -27,6 +27,7 @@ SerialTreeLearner::~SerialTreeLearner() { //LGBM_CUDA void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) { + (void)is_use_subset; // UNUSED train_data_ = train_data; num_data_ = train_data_->num_data(); num_features_ = train_data_->num_features(); @@ -444,6 +445,7 @@ void SerialTreeLearner::FindBestSplitsFromHistograms( int32_t SerialTreeLearner::ForceSplits(Tree* tree, Json& forced_split_json, int* left_leaf, int* right_leaf, int *cur_depth, bool *aborted_last_force_split) { + (void)aborted_last_force_split; bool abort_last_forced_split = false; if (forced_split_json_ == nullptr) { return 0; From 911c1b398da1d2c6f3688fe763121b1b22ce0b93 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Wed, 22 Apr 2020 21:31:20 +0000 Subject: [PATCH 027/119] Initial CUDA work --- src/treelearner/cuda_kernel_launcher.cu | 32 +++++++++---------- src/treelearner/cuda_tree_learner.cpp | 4 +-- .../kernels/histogram_16_64_256.cu | 16 ++++++++++ .../kernels/histogram_16_64_256.hu | 2 -- 4 files changed, 34 insertions(+), 20 deletions(-) diff --git a/src/treelearner/cuda_kernel_launcher.cu b/src/treelearner/cuda_kernel_launcher.cu index 4906ca7e02d..dad8b6c563b 100644 --- a/src/treelearner/cuda_kernel_launcher.cu +++ b/src/treelearner/cuda_kernel_launcher.cu @@ -32,21 +32,21 @@ if (leaf_num_data == num_data) { if (use_all_features) { if (!is_constant_hessian) - histogram16<<>>( arg0, arg1, arg2, + histogram16<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else - histogram16<<>>( arg0, arg1, arg2, + histogram16<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } else { if (!is_constant_hessian) - histogram16_fulldata<<>>( arg0, arg1, arg2, + histogram16_fulldata<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else - histogram16_fulldata<<>>( arg0, arg1, arg2, + 
histogram16_fulldata<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } @@ -55,21 +55,21 @@ if (use_all_features) { // seems all features is always enabled, so this should be the same as fulldata if (!is_constant_hessian) - histogram16<<>>( arg0, arg1, arg2, + histogram16<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else - histogram16<<>>( arg0, arg1, arg2, + histogram16<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } else { if (!is_constant_hessian) - histogram16<<>>( arg0, arg1, arg2, + histogram16<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else - histogram16<<>>( arg0, arg1, arg2, + histogram16<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } @@ -79,21 +79,21 @@ if (leaf_num_data == num_data) { if (use_all_features) { if (!is_constant_hessian) - histogram64<<>>( arg0, arg1, arg2, + histogram64<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else - histogram64<<>>( arg0, arg1, arg2, + histogram64<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } else { if (!is_constant_hessian) - histogram64_fulldata<<>>( arg0, arg1, arg2, + histogram64_fulldata<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else - histogram64_fulldata<<>>( arg0, arg1, arg2, + histogram64_fulldata<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } @@ -102,21 +102,21 @@ if (use_all_features) { // seems all features is always enabled, so this should be the same as fulldata if (!is_constant_hessian) - histogram64<<>>( arg0, arg1, arg2, + histogram64<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else - histogram64<<>>( arg0, arg1, arg2, + histogram64<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } else { if (!is_constant_hessian) - histogram64<<>>( arg0, arg1, arg2, + histogram64<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else - histogram64<<>>( arg0, arg1, arg2, + histogram64<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index aa061bc289f..50d0541ffe9 100644 --- 
a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -482,12 +482,12 @@ void CUDATreeLearner::InitGPU(int num_gpu) { // GCF XXX: resolving device_bin_size_ and histogram_size_ is the remaining work if (max_num_bin_ <= 16) { - device_bin_size_ = 256; //LGBM_CUDA + device_bin_size_ = 16; //LGBM_CUDA histogram_size_ = 16; dword_features_ = 1; // LGBM_CUDA } else if (max_num_bin_ <= 64) { - device_bin_size_ = 256; //LGBM_CUDA + device_bin_size_ = 64; //LGBM_CUDA histogram_size_ = 64; dword_features_ = 1; // LGBM_CUDA } diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index 020ca5453af..a02bc1dd79f 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -38,6 +38,8 @@ inline __device__ void atomic_local_add_f(acc_type *addr, const float val) #error "ENABLE_ALL_FEATURES should always be 1" #define KERNEL_NAME histogram16 #endif // ENABLE_ALL_FEATURES +#define NUM_BINS 16 +#define LOCAL_MEM_SIZE ((sizeof(uint) + 2 * sizeof(acc_type)) * NUM_BINS) // this function will be called by histogram16 // we have one sub-histogram of one feature in local memory, and need to read others @@ -130,6 +132,8 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, const ushort lsize = NUM_BINS; // get_local_size(0); const ushort group_id = blockIdx.x; +//if (gtid == 0) printf("Entering the 16-bucket kernel, NUM_BINS = %d, block size = %d\n", NUM_BINS, blockDim.x); + // local memory per workgroup is 3 KB // clear local memory uint *ptr = (uint *) shared_array; @@ -373,6 +377,8 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // histogram64 stuff #undef KERNEL_NAME +#undef NUM_BINS +#undef LOCAL_MEM_SIZE #ifdef ENABLE_ALL_FEATURES #ifdef IGNORE_INDICES #define KERNEL_NAME histogram64_fulldata @@ -384,6 +390,8 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #error "ENABLE_ALL_FEATURES should always be 1" #define KERNEL_NAME histogram64 #endif // ENABLE_ALL_FEATURES +#define NUM_BINS 64 +#define LOCAL_MEM_SIZE ((sizeof(uint) + 2 * sizeof(acc_type)) * NUM_BINS) // this function will be called by histogram64 // we have one sub-histogram of one feature in local memory, and need to read others @@ -476,6 +484,8 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, const ushort lsize = NUM_BINS; // get_local_size(0); const ushort group_id = blockIdx.x; +//if (gtid == 0) printf("Entering the 64-bucket kernel, NUM_BINS = %d, block size = %d\n", NUM_BINS, blockDim.x); + // local memory per workgroup is 3 KB // clear local memory uint *ptr = (uint *) shared_array; @@ -719,6 +729,8 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // histogram256 stuff #undef KERNEL_NAME +#undef NUM_BINS +#undef LOCAL_MEM_SIZE #ifdef ENABLE_ALL_FEATURES #ifdef IGNORE_INDICES #define KERNEL_NAME histogram256_fulldata @@ -730,6 +742,8 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #error "ENABLE_ALL_FEATURES should always be 1" #define KERNEL_NAME histogram256 #endif // ENABLE_ALL_FEATURES +#define NUM_BINS 256 +#define LOCAL_MEM_SIZE ((sizeof(uint) + 2 * sizeof(acc_type)) * NUM_BINS) // this function will be called by histogram256 // we have one sub-histogram of one feature in local memory, and need to read others @@ -822,6 +836,8 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, const ushort lsize = NUM_BINS; // get_local_size(0); const ushort group_id = blockIdx.x; +//if (gtid == 0) printf("Entering the 
256-bucket kernel, NUM_BINS = %d, block size = %d\n", NUM_BINS, blockDim.x); + // local memory per workgroup is 3 KB // clear local memory uint *ptr = (uint *) shared_array; diff --git a/src/treelearner/kernels/histogram_16_64_256.hu b/src/treelearner/kernels/histogram_16_64_256.hu index eff3e51c460..1a875588cc4 100644 --- a/src/treelearner/kernels/histogram_16_64_256.hu +++ b/src/treelearner/kernels/histogram_16_64_256.hu @@ -65,7 +65,6 @@ __device__ uchar4 as_uchar4(const T t) { return u; } -#define NUM_BINS 256 #if USE_DP_FLOAT == 1 typedef double acc_type; typedef ulong acc_int_type; @@ -77,7 +76,6 @@ typedef uint acc_int_type; #define as_acc_type as_float #define as_acc_int_type as_uint #endif -#define LOCAL_MEM_SIZE ((sizeof(uint) + 2 * sizeof(acc_type)) * NUM_BINS) // unroll the atomic operation for a few times. Takes more code space, // but compiler can generate better code for faster atomics. From 18704804f9f90793731296ba9c97ce960c692406 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Fri, 24 Apr 2020 23:41:52 +0000 Subject: [PATCH 028/119] Initial CUDA work --- src/treelearner/cuda_tree_learner.cpp | 4 +- .../kernels/histogram_16_64_256.cu | 45 ++++++++----------- 2 files changed, 19 insertions(+), 30 deletions(-) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 50d0541ffe9..2c7114aed0a 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -188,7 +188,6 @@ void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featu num_workgroups, exp_workgroups_per_feature); } - for(int device_id = 0; device_id < num_gpu_; ++device_id) { if (pthread_create(cpu_threads_[device_id], NULL, launch_cuda_histogram, (void *)(&thread_data[device_id]))){ fprintf(stderr, "Error in creating threads. 
Exiting\n"); @@ -238,7 +237,7 @@ void CUDATreeLearner::WaitAndGetHistograms(hist_t* histograms) { continue; } int dense_group_index = dense_feature_group_map_[i]; - auto old_histogram_array = histograms + train_data_->GroupBinBoundary(dense_group_index); + auto old_histogram_array = histograms + train_data_->GroupBinBoundary(dense_group_index) * 2; int bin_size = train_data_->FeatureGroupNumBin(dense_group_index); for (int j = 0; j < bin_size; ++j) { @@ -480,7 +479,6 @@ void CUDATreeLearner::InitGPU(int num_gpu) { max_num_bin_ = std::max(max_num_bin_, train_data_->FeatureGroupNumBin(i)); } - // GCF XXX: resolving device_bin_size_ and histogram_size_ is the remaining work if (max_num_bin_ <= 16) { device_bin_size_ = 16; //LGBM_CUDA histogram_size_ = 16; diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index a02bc1dd79f..d8d1b626c8f 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -74,7 +74,7 @@ inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ } // skip the counters we already have - p += 3 * NUM_BINS; + p += 2 * NUM_BINS; for (i = i + 1; i < num_sub_hist; ++i) { grad_bin += *p; p += NUM_BINS; @@ -85,9 +85,8 @@ inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ __syncthreads(); - output_buf[ltid * 3 + 0] = grad_bin; - output_buf[ltid * 3 + 1] = hess_bin; - output_buf[ltid * 3 + 2] = as_acc_type((acc_int_type)cont_bin); + output_buf[ltid * 2 + 0] = grad_bin; + output_buf[ltid * 2 + 1] = hess_bin; } #if USE_CONSTANT_BUF == 1 @@ -132,8 +131,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, const ushort lsize = NUM_BINS; // get_local_size(0); const ushort group_id = blockIdx.x; -//if (gtid == 0) printf("Entering the 16-bucket kernel, NUM_BINS = %d, block size = %d\n", NUM_BINS, blockDim.x); - // local memory per workgroup is 3 KB // clear local memory uint *ptr = (uint *) shared_array; @@ -294,7 +291,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #endif #if POWER_FEATURE_WORKGROUPS != 0 - acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 3 * NUM_BINS; + acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 2 * NUM_BINS; // write gradients and hessians acc_type *__restrict__ ptr_f = output; for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { @@ -361,12 +358,12 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // locate our feature's block in output memory uint output_offset = (feature_id << power_feature_workgroups); acc_type const * __restrict__ feature_subhists = - (acc_type *)output_buf + output_offset * 3 * NUM_BINS; + (acc_type *)output_buf + output_offset * 2 * NUM_BINS; // skip reading the data already in local memory //uint skip_id = feature_id ^ output_offset; uint skip_id = group_id - output_offset; // locate output histogram location for this feature4 - acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 3 * NUM_BINS; + acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS; within_kernel_reduction16x4(feature_subhists, skip_id, old_val, 1 << power_feature_workgroups, hist_buf, (acc_type *)shared_array, power_feature_workgroups); @@ -426,7 +423,7 @@ inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ } // skip the counters we already have - p += 3 * NUM_BINS; + p += 2 * NUM_BINS; for (i = i + 1; i < num_sub_hist; ++i) { grad_bin += *p; p += NUM_BINS; @@ -437,9 
+434,8 @@ inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ __syncthreads(); - output_buf[ltid * 3 + 0] = grad_bin; - output_buf[ltid * 3 + 1] = hess_bin; - output_buf[ltid * 3 + 2] = as_acc_type((acc_int_type)cont_bin); + output_buf[ltid * 2 + 0] = grad_bin; + output_buf[ltid * 2 + 1] = hess_bin; } #if USE_CONSTANT_BUF == 1 @@ -484,8 +480,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, const ushort lsize = NUM_BINS; // get_local_size(0); const ushort group_id = blockIdx.x; -//if (gtid == 0) printf("Entering the 64-bucket kernel, NUM_BINS = %d, block size = %d\n", NUM_BINS, blockDim.x); - // local memory per workgroup is 3 KB // clear local memory uint *ptr = (uint *) shared_array; @@ -646,7 +640,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #endif #if POWER_FEATURE_WORKGROUPS != 0 - acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 3 * NUM_BINS; + acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 2 * NUM_BINS; // write gradients and hessians acc_type *__restrict__ ptr_f = output; for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { @@ -713,12 +707,12 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // locate our feature's block in output memory uint output_offset = (feature_id << power_feature_workgroups); acc_type const * __restrict__ feature_subhists = - (acc_type *)output_buf + output_offset * 3 * NUM_BINS; + (acc_type *)output_buf + output_offset * 2 * NUM_BINS; // skip reading the data already in local memory //uint skip_id = feature_id ^ output_offset; uint skip_id = group_id - output_offset; // locate output histogram location for this feature4 - acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 3 * NUM_BINS; + acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS; within_kernel_reduction64x4(feature_subhists, skip_id, old_val, 1 << power_feature_workgroups, hist_buf, (acc_type *)shared_array, power_feature_workgroups); @@ -778,7 +772,7 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ } // skip the counters we already have - p += 3 * NUM_BINS; + p += 2 * NUM_BINS; for (i = i + 1; i < num_sub_hist; ++i) { grad_bin += *p; p += NUM_BINS; @@ -789,9 +783,8 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ __syncthreads(); - output_buf[ltid * 3 + 0] = grad_bin; - output_buf[ltid * 3 + 1] = hess_bin; - output_buf[ltid * 3 + 2] = as_acc_type((acc_int_type)cont_bin); + output_buf[ltid * 2 + 0] = grad_bin; + output_buf[ltid * 2 + 1] = hess_bin; } #if USE_CONSTANT_BUF == 1 @@ -836,8 +829,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, const ushort lsize = NUM_BINS; // get_local_size(0); const ushort group_id = blockIdx.x; -//if (gtid == 0) printf("Entering the 256-bucket kernel, NUM_BINS = %d, block size = %d\n", NUM_BINS, blockDim.x); - // local memory per workgroup is 3 KB // clear local memory uint *ptr = (uint *) shared_array; @@ -998,7 +989,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #endif #if POWER_FEATURE_WORKGROUPS != 0 - acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 3 * NUM_BINS; + acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 2 * NUM_BINS; // write gradients and hessians acc_type *__restrict__ ptr_f = output; for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { @@ -1065,12 +1056,12 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // locate our feature's block in 
output memory uint output_offset = (feature_id << power_feature_workgroups); acc_type const * __restrict__ feature_subhists = - (acc_type *)output_buf + output_offset * 3 * NUM_BINS; + (acc_type *)output_buf + output_offset * 2 * NUM_BINS; // skip reading the data already in local memory //uint skip_id = feature_id ^ output_offset; uint skip_id = group_id - output_offset; // locate output histogram location for this feature4 - acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 3 * NUM_BINS; + acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS; within_kernel_reduction256x4(feature_subhists, skip_id, old_val, 1 << power_feature_workgroups, hist_buf, (acc_type *)shared_array, power_feature_workgroups); From 37a1a61d9ddad4bc547d51fe69d0447cf3d9311e Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Mon, 27 Apr 2020 01:37:01 +0000 Subject: [PATCH 029/119] Initial CUDA work --- src/treelearner/cuda_tree_learner.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 2c7114aed0a..f1e7241a52f 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -237,7 +237,7 @@ void CUDATreeLearner::WaitAndGetHistograms(hist_t* histograms) { continue; } int dense_group_index = dense_feature_group_map_[i]; - auto old_histogram_array = histograms + train_data_->GroupBinBoundary(dense_group_index) * 2; + auto old_histogram_array = histograms + train_data_->GroupBinBoundary(dense_group_index); int bin_size = train_data_->FeatureGroupNumBin(dense_group_index); for (int j = 0; j < bin_size; ++j) { From fcf031cac8870432054ad8e67491bab6e94034a7 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Mon, 27 Apr 2020 02:26:52 +0000 Subject: [PATCH 030/119] Initial CUDA work --- .../kernels/histogram_16_64_256.cu | 45 ++----------------- 1 file changed, 3 insertions(+), 42 deletions(-) diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index d8d1b626c8f..bb8bed2db79 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -45,7 +45,6 @@ inline __device__ void atomic_local_add_f(acc_type *addr, const float val) // we have one sub-histogram of one feature in local memory, and need to read others inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ feature_sub_hist, const uint skip_id, - const uint old_val_cont_bin0, const ushort num_sub_hist, acc_type* __restrict__ output_buf, acc_type* __restrict__ local_hist, @@ -54,14 +53,7 @@ inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ // TODO: try to avoid bank conflict here acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; - uint* __restrict__ local_cnt = (uint *)(local_hist + 2 * NUM_BINS); - uint cont_bin; - if (power_feature_workgroups != 0) { - cont_bin = ltid ? 
local_cnt[ltid] : old_val_cont_bin0; - } else { - cont_bin = local_cnt[ltid]; - } ushort i; if (power_feature_workgroups != 0) { @@ -70,7 +62,6 @@ inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ for (i = 0; i < skip_id; ++i) { grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; - cont_bin += as_acc_int_type(*p); p += NUM_BINS; } // skip the counters we already have @@ -79,12 +70,10 @@ inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ for (i = i + 1; i < num_sub_hist; ++i) { grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; - cont_bin += as_acc_int_type(*p); p += NUM_BINS; } } __syncthreads(); - output_buf[ltid * 2 + 0] = grad_bin; output_buf[ltid * 2 + 1] = hess_bin; } @@ -335,7 +324,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // but currently there is no easy way to access it via OpenCL. uint * counter_val = cnt_hist; // backup the old value - uint old_val = *counter_val; if (ltid == 0) { // all workgroups processing the same feature add this counter *counter_val = atomicAdd(const_cast(sync_counters + feature_id), 1); @@ -353,7 +341,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // only 1 work group, no need to increase counter // the reduction will become a simple copy if (1) { - uint old_val; // dummy #endif // locate our feature's block in output memory uint output_offset = (feature_id << power_feature_workgroups); @@ -366,7 +353,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS; - within_kernel_reduction16x4(feature_subhists, skip_id, old_val, 1 << power_feature_workgroups, hist_buf, (acc_type *)shared_array, power_feature_workgroups); + within_kernel_reduction16x4(feature_subhists, skip_id, 1 << power_feature_workgroups, hist_buf, (acc_type *)shared_array, power_feature_workgroups); } } @@ -394,7 +381,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // we have one sub-histogram of one feature in local memory, and need to read others inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ feature_sub_hist, const uint skip_id, - const uint old_val_cont_bin0, const ushort num_sub_hist, acc_type* __restrict__ output_buf, acc_type* __restrict__ local_hist, @@ -403,14 +389,7 @@ inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ // TODO: try to avoid bank conflict here acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; - uint* __restrict__ local_cnt = (uint *)(local_hist + 2 * NUM_BINS); - uint cont_bin; - if (power_feature_workgroups != 0) { - cont_bin = ltid ? 
local_cnt[ltid] : old_val_cont_bin0; - } else { - cont_bin = local_cnt[ltid]; - } ushort i; if (power_feature_workgroups != 0) { @@ -419,7 +398,6 @@ inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ for (i = 0; i < skip_id; ++i) { grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; - cont_bin += as_acc_int_type(*p); p += NUM_BINS; } // skip the counters we already have @@ -428,12 +406,10 @@ inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ for (i = i + 1; i < num_sub_hist; ++i) { grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; - cont_bin += as_acc_int_type(*p); p += NUM_BINS; } } __syncthreads(); - output_buf[ltid * 2 + 0] = grad_bin; output_buf[ltid * 2 + 1] = hess_bin; } @@ -684,7 +660,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // but currently there is no easy way to access it via OpenCL. uint * counter_val = cnt_hist; // backup the old value - uint old_val = *counter_val; if (ltid == 0) { // all workgroups processing the same feature add this counter *counter_val = atomicAdd(const_cast(sync_counters + feature_id), 1); @@ -702,7 +677,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // only 1 work group, no need to increase counter // the reduction will become a simple copy if (1) { - uint old_val; // dummy #endif // locate our feature's block in output memory uint output_offset = (feature_id << power_feature_workgroups); @@ -715,7 +689,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS; - within_kernel_reduction64x4(feature_subhists, skip_id, old_val, 1 << power_feature_workgroups, hist_buf, (acc_type *)shared_array, power_feature_workgroups); + within_kernel_reduction64x4(feature_subhists, skip_id, 1 << power_feature_workgroups, hist_buf, (acc_type *)shared_array, power_feature_workgroups); } } @@ -743,7 +717,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // we have one sub-histogram of one feature in local memory, and need to read others inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ feature_sub_hist, const uint skip_id, - const uint old_val_cont_bin0, const ushort num_sub_hist, acc_type* __restrict__ output_buf, acc_type* __restrict__ local_hist, @@ -752,14 +725,7 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ // TODO: try to avoid bank conflict here acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; - uint* __restrict__ local_cnt = (uint *)(local_hist + 2 * NUM_BINS); - uint cont_bin; - if (power_feature_workgroups != 0) { - cont_bin = ltid ? 
local_cnt[ltid] : old_val_cont_bin0; - } else { - cont_bin = local_cnt[ltid]; - } ushort i; if (power_feature_workgroups != 0) { @@ -768,7 +734,6 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ for (i = 0; i < skip_id; ++i) { grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; - cont_bin += as_acc_int_type(*p); p += NUM_BINS; } // skip the counters we already have @@ -777,12 +742,10 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ for (i = i + 1; i < num_sub_hist; ++i) { grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; - cont_bin += as_acc_int_type(*p); p += NUM_BINS; } } __syncthreads(); - output_buf[ltid * 2 + 0] = grad_bin; output_buf[ltid * 2 + 1] = hess_bin; } @@ -1033,7 +996,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // but currently there is no easy way to access it via OpenCL. uint * counter_val = cnt_hist; // backup the old value - uint old_val = *counter_val; if (ltid == 0) { // all workgroups processing the same feature add this counter *counter_val = atomicAdd(const_cast(sync_counters + feature_id), 1); @@ -1051,7 +1013,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // only 1 work group, no need to increase counter // the reduction will become a simple copy if (1) { - uint old_val; // dummy #endif // locate our feature's block in output memory uint output_offset = (feature_id << power_feature_workgroups); @@ -1064,7 +1025,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS; - within_kernel_reduction256x4(feature_subhists, skip_id, old_val, 1 << power_feature_workgroups, hist_buf, (acc_type *)shared_array, power_feature_workgroups); + within_kernel_reduction256x4(feature_subhists, skip_id, 1 << power_feature_workgroups, hist_buf, (acc_type *)shared_array, power_feature_workgroups); } } From 0fb433f9a7ce14c8363ccd03a4e55bac1d6a5a46 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Mon, 27 Apr 2020 03:55:04 +0000 Subject: [PATCH 031/119] Initial CUDA work --- .../kernels/histogram_16_64_256.cu | 21 ------------------- 1 file changed, 21 deletions(-) diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index bb8bed2db79..488b82f20f1 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -289,13 +289,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type value = gh_hist[i]; ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; } - // write counts - acc_int_type *__restrict__ ptr_i = (acc_int_type *)(output + 2 * NUM_BINS); - for (ushort i = ltid; i < NUM_BINS; i += lsize) { - // FIXME: 2-way bank conflict - uint value = cnt_hist[i]; - ptr_i[i] = value; - } // FIXME: is this right __syncthreads(); __threadfence(); @@ -625,13 +618,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type value = gh_hist[i]; ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; } - // write counts - acc_int_type *__restrict__ ptr_i = (acc_int_type *)(output + 2 * NUM_BINS); - for (ushort i = ltid; i < NUM_BINS; i += lsize) { - // FIXME: 2-way bank conflict - uint value = cnt_hist[i]; - ptr_i[i] = value; - } // FIXME: is this right __syncthreads(); __threadfence(); @@ -961,13 +947,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type value = gh_hist[i]; ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; } - // write counts - 
acc_int_type *__restrict__ ptr_i = (acc_int_type *)(output + 2 * NUM_BINS); - for (ushort i = ltid; i < NUM_BINS; i += lsize) { - // FIXME: 2-way bank conflict - uint value = cnt_hist[i]; - ptr_i[i] = value; - } // FIXME: is this right __syncthreads(); __threadfence(); From 44819a4a3360968384fd0b55f7de26ccdf2598fe Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Tue, 28 Apr 2020 14:25:32 +0000 Subject: [PATCH 032/119] Initial CUDA work --- src/treelearner/cuda_tree_learner.cpp | 55 ++++++++++++--------------- 1 file changed, 24 insertions(+), 31 deletions(-) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index f1e7241a52f..f9c309394d7 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -84,60 +84,53 @@ void CUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, #if GPU_DEBUG > 0 void PrintHistograms(hist_t* h, size_t size) { - size_t total = 0; + double total_hess = 0; for (size_t i = 0; i < size; ++i) { - printf("%03lu=%9.3g,%9.3g,%7d\t", i, h[i].sum_gradients, h[i].sum_hessians, h[i].cnt); - total += h[i].cnt; - if ((i & 3) == 3) + printf("%03lu=%9.3g,%9.3g\t", i, GET_GRAD(h, i), GET_HESS(h, i)); + if ((i & 2) == 2) printf("\n"); + total_hess += GET_HESS(h, i); } - printf("\nTotal examples: %lu\n", total); + printf("\nSum hessians: %9.3g\n", total_hess); } -union Float_t -{ +union Float_t { int64_t i; double f; static int64_t ulp_diff(Float_t a, Float_t b) { return abs(a.i - b.i); } }; - -void CompareHistograms(HistogramBinEntry* h1, HistogramBinEntry* h2, size_t size, int feature_id) { +void CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id) { size_t i; Float_t a, b; for (i = 0; i < size; ++i) { - a.f = h1[i].sum_gradients; - b.f = h2[i].sum_gradients; + a.f = GET_GRAD(h1, i); + b.f = GET_GRAD(h2, i); int32_t ulps = Float_t::ulp_diff(a, b); - if (fabs(h1[i].cnt - h2[i].cnt != 0)) { - printf("idx: %lu, %d != %d, (diff: %d, err_rate: %f)\n", i, h1[i].cnt, h2[i].cnt, h1[i].cnt - h2[i].cnt, (float)(h1[i].cnt - h2[i].cnt)/h2[i].cnt); - goto err; - } else { - printf("idx: %lu, %d == %d\n", i, h1[i].cnt, h2[i].cnt); - printf("idx: %lu, pass\n", i); - } if (ulps > 0) { - printf("idx: %ld, grad %g != %g\n", i, h1[i].sum_gradients, h2[i].sum_gradients); - //printf("idx: %ld, grad %g != %g (%d ULPs)\n", i, h1[i].sum_gradients, h2[i].sum_gradients, ulps); - goto err; + // printf("grad %g != %g (%d ULPs)\n", GET_GRAD(h1, i), GET_GRAD(h2, i), ulps); + // goto err; } - a.f = h1[i].sum_hessians; - b.f = h2[i].sum_hessians; + a.f = GET_HESS(h1, i); + b.f = GET_HESS(h2, i); ulps = Float_t::ulp_diff(a, b); - if (ulps > 0) { - printf("idx: %ld, hessian %g != %g\n", i, h1[i].sum_hessians, h2[i].sum_hessians); - //printf("idx: %ld, hessian %g != %g (%d ULPs)\n", i, h1[i].sum_hessians, h2[i].sum_hessians, ulps); - // goto err; + if (std::fabs(a.f - b.f) >= 1e-20) { + printf("hessian %g != %g (%d ULPs)\n", GET_HESS(h1, i), GET_HESS(h2, i), ulps); + goto err; } } return; err: Log::Warning("Mismatched histograms found for feature %d at location %lu.", feature_id, i); + std::cin.get(); + PrintHistograms(h1, size); + printf("\n"); + PrintHistograms(h2, size); + std::cin.get(); } - #endif int CUDATreeLearner::GetNumWorkgroupsPerFeature(data_size_t leaf_num_data) { @@ -1037,17 +1030,17 @@ void CUDATreeLearner::FindBestSplits() { #if GPU_DEBUG >= 3 for (int feature_index = 0; feature_index < num_features_; ++feature_index) { - if (!is_feature_used_[feature_index]) continue; + if 
(!col_sampler_.is_feature_used_bytree()[feature_index]) continue; if (parent_leaf_histogram_array_ != nullptr && !parent_leaf_histogram_array_[feature_index].is_splittable()) { smaller_leaf_histogram_array_[feature_index].set_is_splittable(false); continue; } size_t bin_size = train_data_->FeatureNumBin(feature_index) + 1; - printf("CUDATreeLearner::FindBestSplits() Feature %d bin_size=%d smaller leaf:\n", feature_index, bin_size); + printf("CUDATreeLearner::FindBestSplits() Feature %d bin_size=%zd smaller leaf:\n", feature_index, bin_size); PrintHistograms(smaller_leaf_histogram_array_[feature_index].RawData() - 1, bin_size); if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->leaf_index() < 0) { continue; } - printf("CUDATreeLearner::FindBestSplits() Feature %d bin_size=%d larger leaf:\n", feature_index, bin_size); + printf("CUDATreeLearner::FindBestSplits() Feature %d bin_size=%zd larger leaf:\n", feature_index, bin_size); PrintHistograms(larger_leaf_histogram_array_[feature_index].RawData() - 1, bin_size); } From a668c8e822ca41b419b1107c8d396f8070faec29 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Tue, 28 Apr 2020 16:06:19 +0000 Subject: [PATCH 033/119] Initial CUDA work --- src/treelearner/cuda_tree_learner.cpp | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index f9c309394d7..42a45a55fac 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -931,23 +931,28 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ continue; int dense_feature_group_index = dense_feature_group_map_[i]; size_t size = train_data_->FeatureGroupNumBin(dense_feature_group_index); - HistogramBinEntry* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - 1; - HistogramBinEntry* current_histogram = ptr_smaller_leaf_hist_data + train_data_->GroupBinBoundary(dense_feature_group_index); - HistogramBinEntry* gpu_histogram = new HistogramBinEntry[size]; + hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - kHistOffset; + hist_t* current_histogram = ptr_smaller_leaf_hist_data + train_data_->GroupBinBoundary(dense_feature_group_index) * 2; + hist_t* gpu_histogram = new hist_t[size * 2]; data_size_t num_data = smaller_leaf_splits_->num_data_in_leaf(); - - std::copy(current_histogram, current_histogram + size, gpu_histogram); - std::memset(current_histogram, 0, train_data_->FeatureGroupNumBin(dense_feature_group_index) * sizeof(HistogramBinEntry)); + printf("Comparing histogram for feature %d size %d, %lu bins\n", dense_feature_group_index, num_data, size); + std::copy(current_histogram, current_histogram + size * 2, gpu_histogram); + std::memset(current_histogram, 0, size * sizeof(hist_t) * 2); + if (train_data_->FeatureGroupBin(dense_feature_group_index) == nullptr) { + continue; + } if ( num_data == num_data_ ) { if ( is_constant_hessian_ ) { printf("ConstructHistogram(): num_data == num_data_ is_constant_hessian_"); train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( + 0, num_data, gradients_, current_histogram); } else { printf("ConstructHistogram(): num_data == num_data_ "); train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( + 0, num_data, gradients_, hessians_, current_histogram); @@ -957,6 +962,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ printf("ConstructHistogram(): 
is_constant_hessian_"); train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( smaller_leaf_splits_->data_indices(), + 0, num_data, ordered_gradients_.data(), current_histogram); @@ -964,6 +970,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ printf("ConstructHistogram(): 4"); train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( smaller_leaf_splits_->data_indices(), + 0, num_data, ordered_gradients_.data(), ordered_hessians_.data(), current_histogram); @@ -974,7 +981,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ compare = false; } CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index); - std::copy(gpu_histogram, gpu_histogram + size, current_histogram); + std::copy(gpu_histogram, gpu_histogram + size * 2, current_histogram); delete [] gpu_histogram; //break; // LGBM_CUDA: see only first feature info } From 0e8cd92553f0a0f8ceb4bdf07f39be632402f880 Mon Sep 17 00:00:00 2001 From: ChipKerchner Date: Tue, 28 Apr 2020 16:11:59 -0400 Subject: [PATCH 034/119] Initial CUDA work --- src/treelearner/gpu_tree_learner.cpp | 9 +++++---- src/treelearner/gpu_tree_learner.h | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/treelearner/gpu_tree_learner.cpp b/src/treelearner/gpu_tree_learner.cpp index 43ccadfd176..7f336b075d4 100644 --- a/src/treelearner/gpu_tree_learner.cpp +++ b/src/treelearner/gpu_tree_learner.cpp @@ -36,9 +36,9 @@ GPUTreeLearner::~GPUTreeLearner() { } } -void GPUTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) { +void GPUTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) { // initialize SerialTreeLearner - SerialTreeLearner::Init(train_data, is_constant_hessian); + SerialTreeLearner::Init(train_data, is_constant_hessian, is_use_subset); // some additional variables needed for GPU trainer num_feature_groups_ = train_data_->num_feature_groups(); // Initialize GPU buffers and kernels @@ -734,8 +734,9 @@ void GPUTreeLearner::InitGPU(int platform_id, int device_id) { SetupKernelArguments(); } -Tree* GPUTreeLearner::Train(const score_t* gradients, const score_t *hessians) { - return SerialTreeLearner::Train(gradients, hessians); +Tree* GPUTreeLearner::Train(const score_t* gradients, const score_t *hessians, + bool is_constant_hessian, Json& forced_split_json) { + return SerialTreeLearner::Train(gradients, hessians, is_constant_hessian, forced_split_json); } void GPUTreeLearner::ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) { diff --git a/src/treelearner/gpu_tree_learner.h b/src/treelearner/gpu_tree_learner.h index ba48f030441..598e8d40ac9 100644 --- a/src/treelearner/gpu_tree_learner.h +++ b/src/treelearner/gpu_tree_learner.h @@ -45,7 +45,7 @@ class GPUTreeLearner: public SerialTreeLearner { public: explicit GPUTreeLearner(const Config* tree_config); ~GPUTreeLearner(); - void Init(const Dataset* train_data, bool is_constant_hessian) override; + void Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) override; void ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) override; void ResetIsConstantHessian(bool is_constant_hessian) override; Tree* Train(const score_t* gradients, const score_t *hessians, From 0cbe79d70e0fb9f96fa1d1eb33b174237201f904 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Sat, 2 May 2020 02:21:12 +0000 Subject: [PATCH 
035/119] Initial CUDA work --- src/treelearner/cuda_tree_learner.cpp | 88 ++++++++++++++----------- src/treelearner/feature_histogram.hpp | 3 +- src/treelearner/serial_tree_learner.cpp | 6 +- 3 files changed, 54 insertions(+), 43 deletions(-) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 42a45a55fac..a11255eba9e 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -103,33 +103,41 @@ union Float_t { }; -void CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id) { - size_t i; - Float_t a, b; - for (i = 0; i < size; ++i) { - a.f = GET_GRAD(h1, i); - b.f = GET_GRAD(h2, i); - int32_t ulps = Float_t::ulp_diff(a, b); - if (ulps > 0) { - // printf("grad %g != %g (%d ULPs)\n", GET_GRAD(h1, i), GET_GRAD(h2, i), ulps); - // goto err; +void CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int dp_flag) { + int i; + printf("Comparing Histograms, feature_id = %d, size = %d\n", feature_id, (int) size); + if (dp_flag) { // double precision + double a, b; + for (i = 0; i < (int) size; ++i) { + a = GET_GRAD(h1, i); + b = GET_GRAD(h2, i); + if (((std::fabs(a - b))/a) >= 1e-6) { + printf("i = %5d, h1.grad %13.6lf, h2.grad %13.6lf\n", i, a, b); + } + a = GET_HESS(h1, i); + b = (double) GET_HESS(((long long int *) h2), i); // GCF HACK becuse CPU hessians are apparently stored as long long ints + if (((std::fabs(a - b))/a) >= 1e-6) { + printf("i = %5d, h1.hess %13.6lf, h2.hess %13.6lf\n", i, a, b); + } } - a.f = GET_HESS(h1, i); - b.f = GET_HESS(h2, i); - ulps = Float_t::ulp_diff(a, b); - if (std::fabs(a.f - b.f) >= 1e-20) { - printf("hessian %g != %g (%d ULPs)\n", GET_HESS(h1, i), GET_HESS(h2, i), ulps); - goto err; + } + else { // single precision + float a, b; + for (i = 0; i < (int) size; ++i) { + a = GET_GRAD(h1, i); + b = GET_GRAD(h2, i); + if (((std::fabs(a - b))/a) >= 1e-5) { + printf("i = %5d, h1.grad %13.6f, h2.grad %13.6f\n", i, a, b); + } + a = GET_HESS(h1, i); + b = GET_HESS(h2, i); + if (((std::fabs(a - b))/a) >= 1e-5) { + printf("i = %5d, h1.hess %13.6f, h2.hess %13.6f\n", i, a, b); + } } } + printf("DONE Comparing Histograms...\n"); return; -err: - Log::Warning("Mismatched histograms found for feature %d at location %lu.", feature_id, i); - std::cin.get(); - PrintHistograms(h1, size); - printf("\n"); - PrintHistograms(h2, size); - std::cin.get(); } #endif @@ -204,7 +212,8 @@ void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featu size_t output_size = num_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_; size_t host_output_offset = offset_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_; - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync((char*)host_histogram_outputs_ + host_output_offset, device_histogram_outputs_[device_id], output_size, cudaMemcpyDeviceToHost, stream_[device_id])); + //CUDASUCCESS_OR_FATAL(cudaMemcpyAsync((char*)host_histogram_outputs_ + host_output_offset, device_histogram_outputs_[device_id], output_size, cudaMemcpyDeviceToHost, stream_[device_id])); + CUDASUCCESS_OR_FATAL(cudaMemcpy((char*)host_histogram_outputs_ + host_output_offset, device_histogram_outputs_[device_id], output_size, cudaMemcpyDeviceToHost)); CUDASUCCESS_OR_FATAL(cudaEventRecord(histograms_wait_obj_[device_id], stream_[device_id])); } } @@ -230,7 +239,7 @@ void CUDATreeLearner::WaitAndGetHistograms(hist_t* histograms) { continue; } int dense_group_index = dense_feature_group_map_[i]; 
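The `* 2` applied to `GroupBinBoundary()` just below reflects the histogram layout this patch series assumes: each bin occupies two consecutive `hist_t` slots, gradient sum then hessian sum, so a feature group's block starts at `GroupBinBoundary(group) * 2`. A minimal host-side sketch of that indexing; the names `kHistEntriesPerBin`, `GradAt` and `HessAt` are hypothetical and only mirror the arithmetic, they are not part of the patch:

    // Illustrative sketch of the interleaved hist_t layout; names are
    // hypothetical and only mirror the indexing used in WaitAndGetHistograms.
    #include <cstddef>
    #include <vector>

    typedef double hist_t;
    const int kHistEntriesPerBin = 2;  // one gradient slot + one hessian slot per bin

    // Start of a feature group's block, mirroring
    // "histograms + train_data_->GroupBinBoundary(group) * 2".
    inline std::size_t GroupHistOffset(std::size_t group_bin_boundary) {
      return group_bin_boundary * kHistEntriesPerBin;
    }

    inline hist_t GradAt(const std::vector<hist_t>& h, std::size_t base, int bin) {
      return h[base + kHistEntriesPerBin * bin];      // even slot: gradient sum
    }
    inline hist_t HessAt(const std::vector<hist_t>& h, std::size_t base, int bin) {
      return h[base + kHistEntriesPerBin * bin + 1];  // odd slot: hessian sum
    }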
- auto old_histogram_array = histograms + train_data_->GroupBinBoundary(dense_group_index); + auto old_histogram_array = histograms + train_data_->GroupBinBoundary(dense_group_index) * 2; int bin_size = train_data_->FeatureGroupNumBin(dense_group_index); for (int j = 0; j < bin_size; ++j) { @@ -471,6 +480,9 @@ void CUDATreeLearner::InitGPU(int num_gpu) { #endif max_num_bin_ = std::max(max_num_bin_, train_data_->FeatureGroupNumBin(i)); } + #if GPU_DEBUG >= 1 + printf("\n"); + #endif if (max_num_bin_ <= 16) { device_bin_size_ = 16; //LGBM_CUDA @@ -831,7 +843,7 @@ bool CUDATreeLearner::ConstructGPUHistogramsAsync( printf("%d ", feature_masks_[i]); } printf("\n"); - printf("CudaTreeLearner::ConstructGPUHistogramsAsync() %d feature groups, %d used, %d\n", num_dense_feature_groups_, used_dense_feature_groups, use_all_features); + printf("CudaTreeLearner::ConstructGPUHistogramsAsync() %d feature groups, %d used, %d use_all_features\n", num_dense_feature_groups_, used_dense_feature_groups, use_all_features); #endif // if not all feature groups are used, we need to transfer the feature mask to GPU @@ -877,7 +889,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ } // construct smaller leaf - hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - 1; + hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - kHistOffset; // Check workgroups per feature4 tuple.. int exp_workgroups_per_feature = GetNumWorkgroupsPerFeature(smaller_leaf_splits_->num_data_in_leaf()); @@ -924,7 +936,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ // Compare GPU histogram with CPU histogram, useful for debuggin GPU code problem // #define GPU_DEBUG_COMPARE #ifdef GPU_DEBUG_COMPARE - printf("Start Comparing_Histogram between GPU and CPU num_dense_feature_groups_=%d\n",num_dense_feature_groups_); + printf("Start Comparing_Histogram between GPU and CPU, num_dense_feature_groups_ = %d\n",num_dense_feature_groups_); bool compare = true; for (int i = 0; i < num_dense_feature_groups_; ++i) { if (!feature_masks_[i]) @@ -935,7 +947,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ hist_t* current_histogram = ptr_smaller_leaf_hist_data + train_data_->GroupBinBoundary(dense_feature_group_index) * 2; hist_t* gpu_histogram = new hist_t[size * 2]; data_size_t num_data = smaller_leaf_splits_->num_data_in_leaf(); - printf("Comparing histogram for feature %d size %d, %lu bins\n", dense_feature_group_index, num_data, size); + printf("Comparing histogram for feature %d, size %d, %lu bins\n", dense_feature_group_index, num_data, size); std::copy(current_histogram, current_histogram + size * 2, gpu_histogram); std::memset(current_histogram, 0, size * sizeof(hist_t) * 2); if (train_data_->FeatureGroupBin(dense_feature_group_index) == nullptr) { @@ -943,14 +955,14 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ } if ( num_data == num_data_ ) { if ( is_constant_hessian_ ) { - printf("ConstructHistogram(): num_data == num_data_ is_constant_hessian_"); + printf("ConstructHistogram(): num_data == num_data_ is_constant_hessian_\n"); train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( 0, num_data, gradients_, current_histogram); } else { - printf("ConstructHistogram(): num_data == num_data_ "); + printf("ConstructHistogram(): num_data == num_data_\n"); train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( 0, num_data, @@ -959,7 +971,7 @@ void 
CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ } } else { if ( is_constant_hessian_ ) { - printf("ConstructHistogram(): is_constant_hessian_"); + printf("ConstructHistogram(): is_constant_hessian_\n"); train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( smaller_leaf_splits_->data_indices(), 0, @@ -967,7 +979,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ ordered_gradients_.data(), current_histogram); } else { - printf("ConstructHistogram(): 4"); + printf("ConstructHistogram(): 4\n"); train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( smaller_leaf_splits_->data_indices(), 0, @@ -977,10 +989,10 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ } } if ( (num_data != num_data_) && compare ) { - CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index); + CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index, config_->gpu_use_dp); compare = false; } - CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index); + CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index, config_->gpu_use_dp); std::copy(gpu_histogram, gpu_histogram + size * 2, current_histogram); delete [] gpu_histogram; //break; // LGBM_CUDA: see only first feature info @@ -993,7 +1005,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ // construct larger leaf - hist_t* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - 1; + hist_t* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - kHistOffset; is_gpu_used = ConstructGPUHistogramsAsync(is_feature_used, larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf()); @@ -1009,11 +1021,11 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ // ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_, // ptr_larger_leaf_hist_data); train_data_->ConstructHistograms(is_sparse_feature_used, - smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(), + larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(), gradients_, hessians_, ordered_gradients_.data(), ordered_hessians_.data(), share_state_.get(), - ptr_smaller_leaf_hist_data); + ptr_larger_leaf_hist_data); } // wait for GPU to finish, only if GPU is actually used diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp index 8916ee48fd4..bf3d81c53d8 100644 --- a/src/treelearner/feature_histogram.hpp +++ b/src/treelearner/feature_histogram.hpp @@ -1201,7 +1201,8 @@ class HistogramPool { for (int i = old_cache_size; i < cache_size; ++i) { OMP_LOOP_EX_BEGIN(); pool_[i].reset(new FeatureHistogram[train_data->num_features()]); - data_[i].resize(num_total_bin * 2); + //data_[i].resize(num_total_bin * 2); + data_[i].resize(num_total_bin * 3); // GCF HACK to avoid mysterious core dumps for (int j = 0; j < train_data->num_features(); ++j) { pool_[i][j].Init(data_[i].data() + offsets[j] * 2, &feature_metas_[j]); } diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 3c9faa84ff5..e5d2a64ceaf 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -353,8 +353,7 @@ void SerialTreeLearner::ConstructHistograms( Common::FunctionTimer fun_timer("SerialTreeLearner::ConstructHistograms", global_timer); // construct smaller 
leaf - hist_t* ptr_smaller_leaf_hist_data = - smaller_leaf_histogram_array_[0].RawData() - kHistOffset; + hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - kHistOffset; train_data_->ConstructHistograms( is_feature_used, smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(), gradients_, hessians_, @@ -363,8 +362,7 @@ void SerialTreeLearner::ConstructHistograms( if (larger_leaf_histogram_array_ != nullptr && !use_subtract) { // construct larger leaf - hist_t* ptr_larger_leaf_hist_data = - larger_leaf_histogram_array_[0].RawData() - kHistOffset; + hist_t* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - kHistOffset; train_data_->ConstructHistograms( is_feature_used, larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(), gradients_, hessians_, From 252f465fbcb61a5458fe126ede4e0f76be6aaadb Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 4 May 2020 16:44:32 +0000 Subject: [PATCH 036/119] Initial CUDA work --- src/treelearner/cuda_tree_learner.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index a11255eba9e..85e90e402e5 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -877,6 +877,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ #pragma omp parallel for schedule(static) for (int feature_index = 0; feature_index < num_features_; ++feature_index) { + if (!col_sampler_.is_feature_used_bytree()[feature_index]) continue; if (!is_feature_used[feature_index]) continue; if (train_data_->IsMultiGroup(train_data_->Feature2Group(feature_index))) { is_sparse_feature_used[feature_index] = 1; From f7d8fb4400257426a646f1736260cbe412e5992e Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Mon, 4 May 2020 21:24:11 +0000 Subject: [PATCH 037/119] Initial CUDA work --- src/treelearner/cuda_tree_learner.cpp | 88 +++++++++++++------ .../kernels/histogram_16_64_256.cu | 83 ++++++++++++++++- 2 files changed, 140 insertions(+), 31 deletions(-) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 85e90e402e5..405cbc89022 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -102,42 +102,68 @@ union Float_t { } }; - -void CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int dp_flag) { +int CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int dp_flag, int const_flag) { int i; + int retval = 0; printf("Comparing Histograms, feature_id = %d, size = %d\n", feature_id, (int) size); if (dp_flag) { // double precision - double a, b; + double af, bf; + long long int ai, bi; for (i = 0; i < (int) size; ++i) { - a = GET_GRAD(h1, i); - b = GET_GRAD(h2, i); - if (((std::fabs(a - b))/a) >= 1e-6) { - printf("i = %5d, h1.grad %13.6lf, h2.grad %13.6lf\n", i, a, b); + af = GET_GRAD(h1, i); + bf = GET_GRAD(h2, i); + if (((std::fabs(af - bf))/af) >= 1e-6) { + printf("i = %5d, h1.grad %13.6lf, h2.grad %13.6lf\n", i, af, bf); + ++retval; + } + if (const_flag) { + ai = GET_HESS(((long long int *) h1), i); + bi = GET_HESS(((long long int *) h2), i); + if (ai != bi) { + printf("i = %5d, h1.hess %lld, h2.hess %lld\n", i, ai, bi); + ++retval; + } } - a = GET_HESS(h1, i); - b = (double) GET_HESS(((long long int *) h2), i); // GCF HACK becuse CPU hessians are apparently stored as long long ints - if (((std::fabs(a - b))/a) >= 1e-6) { - printf("i = %5d, 
h1.hess %13.6lf, h2.hess %13.6lf\n", i, a, b); + else { + af = GET_HESS(h1, i); + bf = GET_HESS(h2, i); + if (((std::fabs(af - bf))/af) >= 1e-6) { + printf("i = %5d, h1.hess %13.6lf, h2.hess %13.6lf\n", i, af, bf); + ++retval; + } } } } else { // single precision - float a, b; + float af, bf; + int ai, bi; for (i = 0; i < (int) size; ++i) { - a = GET_GRAD(h1, i); - b = GET_GRAD(h2, i); - if (((std::fabs(a - b))/a) >= 1e-5) { - printf("i = %5d, h1.grad %13.6f, h2.grad %13.6f\n", i, a, b); + af = GET_GRAD(h1, i); + bf = GET_GRAD(h2, i); + if (((std::fabs(af - bf))/af) >= 1e-5) { + printf("i = %5d, h1.grad %13.6f, h2.grad %13.6f\n", i, af, bf); + ++retval; + } + if (const_flag) { + ai = GET_HESS(h1, i); + bi = GET_HESS(h2, i); + if (ai != bi) { + printf("i = %5d, h1.hess %d, h2.hess %d\n", i, ai, bi); + ++retval; + } } - a = GET_HESS(h1, i); - b = GET_HESS(h2, i); - if (((std::fabs(a - b))/a) >= 1e-5) { - printf("i = %5d, h1.hess %13.6f, h2.hess %13.6f\n", i, a, b); + else { + af = GET_HESS(h1, i); + bf = GET_HESS(h2, i); + if (((std::fabs(af - bf))/af) >= 1e-5) { + printf("i = %5d, h1.hess %13.6f, h2.hess %13.6f\n", i, af, bf); + ++retval; + } } } } printf("DONE Comparing Histograms...\n"); - return; + return retval; } #endif @@ -948,7 +974,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ hist_t* current_histogram = ptr_smaller_leaf_hist_data + train_data_->GroupBinBoundary(dense_feature_group_index) * 2; hist_t* gpu_histogram = new hist_t[size * 2]; data_size_t num_data = smaller_leaf_splits_->num_data_in_leaf(); - printf("Comparing histogram for feature %d, size %d, %lu bins\n", dense_feature_group_index, num_data, size); + printf("Comparing histogram for feature %d, num_data %d, num_data_ = %d, %lu bins\n", dense_feature_group_index, num_data, num_data_, size); std::copy(current_histogram, current_histogram + size * 2, gpu_histogram); std::memset(current_histogram, 0, size * sizeof(hist_t) * 2); if (train_data_->FeatureGroupBin(dense_feature_group_index) == nullptr) { @@ -980,7 +1006,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ ordered_gradients_.data(), current_histogram); } else { - printf("ConstructHistogram(): 4\n"); + printf("ConstructHistogram(): 4, num_data = %d, num_data_ = %d\n", num_data, num_data_); train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( smaller_leaf_splits_->data_indices(), 0, @@ -989,11 +1015,19 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ current_histogram); } } + int retval; if ( (num_data != num_data_) && compare ) { - CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index, config_->gpu_use_dp); + retval = CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index, config_->gpu_use_dp, is_constant_hessian_); + if (retval < 4) printf("CompareHistograms reports only %d errors\n", retval); compare = false; } - CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index, config_->gpu_use_dp); + retval = CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index, config_->gpu_use_dp, is_constant_hessian_); + if (num_data == num_data_) { + if (retval > 1) printf("CompareHistograms reports %d errors\n", retval); + } + else { + if (retval < 3) printf("CompareHistograms reports only %d errors\n", retval); + } std::copy(gpu_histogram, gpu_histogram + size * 2, current_histogram); delete [] gpu_histogram; //break; // LGBM_CUDA: see only first feature 
info @@ -1058,11 +1092,11 @@ void CUDATreeLearner::FindBestSplits() { } size_t bin_size = train_data_->FeatureNumBin(feature_index) + 1; printf("CUDATreeLearner::FindBestSplits() Feature %d bin_size=%zd smaller leaf:\n", feature_index, bin_size); - PrintHistograms(smaller_leaf_histogram_array_[feature_index].RawData() - 1, bin_size); + PrintHistograms(smaller_leaf_histogram_array_[feature_index].RawData() - kHistOffset, bin_size); if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->leaf_index() < 0) { continue; } printf("CUDATreeLearner::FindBestSplits() Feature %d bin_size=%zd larger leaf:\n", feature_index, bin_size); - PrintHistograms(larger_leaf_histogram_array_[feature_index].RawData() - 1, bin_size); + PrintHistograms(larger_leaf_histogram_array_[feature_index].RawData() - kHistOffset1, bin_size); } #endif } diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index 488b82f20f1..5ea8721cf99 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -45,6 +45,7 @@ inline __device__ void atomic_local_add_f(acc_type *addr, const float val) // we have one sub-histogram of one feature in local memory, and need to read others inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ feature_sub_hist, const uint skip_id, + const uint old_val_cont_bin0, const ushort num_sub_hist, acc_type* __restrict__ output_buf, acc_type* __restrict__ local_hist, @@ -53,7 +54,14 @@ inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ // TODO: try to avoid bank conflict here acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; + uint* __restrict__ local_cnt = (uint *)(local_hist + 2 * NUM_BINS); + uint cont_bin; + if (power_feature_workgroups != 0) { + cont_bin = ltid ? local_cnt[ltid] : old_val_cont_bin0; + } else { + cont_bin = local_cnt[ltid]; + } ushort i; if (power_feature_workgroups != 0) { @@ -62,6 +70,7 @@ inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ for (i = 0; i < skip_id; ++i) { grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; + cont_bin += as_acc_int_type(*p); p += NUM_BINS; } // skip the counters we already have @@ -70,12 +79,19 @@ inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ for (i = i + 1; i < num_sub_hist; ++i) { grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; + cont_bin += as_acc_int_type(*p); p += NUM_BINS; } } __syncthreads(); + output_buf[ltid * 2 + 0] = grad_bin; +#if CONST_HESSIAN == 0 output_buf[ltid * 2 + 1] = hess_bin; +#else + output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); +#endif + output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); } #if USE_CONSTANT_BUF == 1 @@ -289,6 +305,13 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type value = gh_hist[i]; ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; } + // write counts + acc_int_type *__restrict__ ptr_i = (acc_int_type *)(output + 2 * NUM_BINS); + for (ushort i = ltid; i < NUM_BINS; i += lsize) { + // FIXME: 2-way bank conflict + uint value = cnt_hist[i]; + ptr_i[i] = value; + } // FIXME: is this right __syncthreads(); __threadfence(); @@ -317,6 +340,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // but currently there is no easy way to access it via OpenCL. 
uint * counter_val = cnt_hist; // backup the old value + uint old_val = *counter_val; if (ltid == 0) { // all workgroups processing the same feature add this counter *counter_val = atomicAdd(const_cast(sync_counters + feature_id), 1); @@ -334,6 +358,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // only 1 work group, no need to increase counter // the reduction will become a simple copy if (1) { + uint old_val; // dummy #endif // locate our feature's block in output memory uint output_offset = (feature_id << power_feature_workgroups); @@ -346,7 +371,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS; - within_kernel_reduction16x4(feature_subhists, skip_id, 1 << power_feature_workgroups, hist_buf, (acc_type *)shared_array, power_feature_workgroups); + within_kernel_reduction16x4(feature_subhists, skip_id, old_val, 1 << power_feature_workgroups, hist_buf, (acc_type *)shared_array, power_feature_workgroups); } } @@ -374,6 +399,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // we have one sub-histogram of one feature in local memory, and need to read others inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ feature_sub_hist, const uint skip_id, + const uint old_val_cont_bin0, const ushort num_sub_hist, acc_type* __restrict__ output_buf, acc_type* __restrict__ local_hist, @@ -382,7 +408,14 @@ inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ // TODO: try to avoid bank conflict here acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; + uint* __restrict__ local_cnt = (uint *)(local_hist + 2 * NUM_BINS); + uint cont_bin; + if (power_feature_workgroups != 0) { + cont_bin = ltid ? local_cnt[ltid] : old_val_cont_bin0; + } else { + cont_bin = local_cnt[ltid]; + } ushort i; if (power_feature_workgroups != 0) { @@ -391,6 +424,7 @@ inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ for (i = 0; i < skip_id; ++i) { grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; + cont_bin += as_acc_int_type(*p); p += NUM_BINS; } // skip the counters we already have @@ -399,12 +433,19 @@ inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ for (i = i + 1; i < num_sub_hist; ++i) { grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; + cont_bin += as_acc_int_type(*p); p += NUM_BINS; } } __syncthreads(); + output_buf[ltid * 2 + 0] = grad_bin; - output_buf[ltid * 2 + 1] = hess_bin; +#if CONST_HESSIAN == 0 + output_buf[ltid * 2 + 1] = hess_bin; +#else + output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); +#endif + output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); } #if USE_CONSTANT_BUF == 1 @@ -618,6 +659,13 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type value = gh_hist[i]; ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; } + // write counts + acc_int_type *__restrict__ ptr_i = (acc_int_type *)(output + 2 * NUM_BINS); + for (ushort i = ltid; i < NUM_BINS; i += lsize) { + // FIXME: 2-way bank conflict + uint value = cnt_hist[i]; + ptr_i[i] = value; + } // FIXME: is this right __syncthreads(); __threadfence(); @@ -646,6 +694,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // but currently there is no easy way to access it via OpenCL. 
uint * counter_val = cnt_hist; // backup the old value + uint old_val = *counter_val; if (ltid == 0) { // all workgroups processing the same feature add this counter *counter_val = atomicAdd(const_cast(sync_counters + feature_id), 1); @@ -663,6 +712,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // only 1 work group, no need to increase counter // the reduction will become a simple copy if (1) { + uint old_val; // dummy #endif // locate our feature's block in output memory uint output_offset = (feature_id << power_feature_workgroups); @@ -675,7 +725,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS; - within_kernel_reduction64x4(feature_subhists, skip_id, 1 << power_feature_workgroups, hist_buf, (acc_type *)shared_array, power_feature_workgroups); + within_kernel_reduction64x4(feature_subhists, skip_id, old_val, 1 << power_feature_workgroups, hist_buf, (acc_type *)shared_array, power_feature_workgroups); } } @@ -703,6 +753,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // we have one sub-histogram of one feature in local memory, and need to read others inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ feature_sub_hist, const uint skip_id, + const uint old_val_cont_bin0, const ushort num_sub_hist, acc_type* __restrict__ output_buf, acc_type* __restrict__ local_hist, @@ -711,7 +762,14 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ // TODO: try to avoid bank conflict here acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; + uint* __restrict__ local_cnt = (uint *)(local_hist + 2 * NUM_BINS); + uint cont_bin; + if (power_feature_workgroups != 0) { + cont_bin = ltid ? local_cnt[ltid] : old_val_cont_bin0; + } else { + cont_bin = local_cnt[ltid]; + } ushort i; if (power_feature_workgroups != 0) { @@ -720,6 +778,7 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ for (i = 0; i < skip_id; ++i) { grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; + cont_bin += as_acc_int_type(*p); p += NUM_BINS; } // skip the counters we already have @@ -728,12 +787,19 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ for (i = i + 1; i < num_sub_hist; ++i) { grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; + cont_bin += as_acc_int_type(*p); p += NUM_BINS; } } __syncthreads(); + output_buf[ltid * 2 + 0] = grad_bin; +#if CONST_HESSIAN == 0 output_buf[ltid * 2 + 1] = hess_bin; +#else + output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); +#endif + output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); } #if USE_CONSTANT_BUF == 1 @@ -947,6 +1013,13 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type value = gh_hist[i]; ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; } + // write counts + acc_int_type *__restrict__ ptr_i = (acc_int_type *)(output + 2 * NUM_BINS); + for (ushort i = ltid; i < NUM_BINS; i += lsize) { + // FIXME: 2-way bank conflict + uint value = cnt_hist[i]; + ptr_i[i] = value; + } // FIXME: is this right __syncthreads(); __threadfence(); @@ -975,6 +1048,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // but currently there is no easy way to access it via OpenCL. 
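The CONST_HESSIAN branches above bit-cast the per-bin counter into the odd hist_t slot instead of a hessian sum; when every sample has the same hessian, the hessian column can be rebuilt afterwards as count times that constant. A host-side sketch of such a reconstruction for the double-precision (USE_DP_FLOAT) case; the function name, and the assumption that the count was bit-cast from a 64-bit integer, are illustrative rather than taken from the patch:

    // Sketch: turn bit-cast bin counts back into hessian sums when the
    // hessian is constant (CONST_HESSIAN == 1). Double-precision case only.
    #include <cstdint>
    #include <cstring>

    typedef double hist_t;

    void ReconstructConstHessian(hist_t* hist, int num_bins, double const_hessian) {
      for (int bin = 0; bin < num_bins; ++bin) {
        // The device stored the raw count bit-cast into the odd slot, i.e.
        // output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin).
        std::uint64_t cnt_bits;
        std::memcpy(&cnt_bits, &hist[2 * bin + 1], sizeof(cnt_bits));
        hist[2 * bin + 1] = static_cast<double>(cnt_bits) * const_hessian;
      }
    }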
uint * counter_val = cnt_hist; // backup the old value + uint old_val = *counter_val; if (ltid == 0) { // all workgroups processing the same feature add this counter *counter_val = atomicAdd(const_cast(sync_counters + feature_id), 1); @@ -992,6 +1066,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // only 1 work group, no need to increase counter // the reduction will become a simple copy if (1) { + uint old_val; // dummy #endif // locate our feature's block in output memory uint output_offset = (feature_id << power_feature_workgroups); @@ -1004,7 +1079,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS; - within_kernel_reduction256x4(feature_subhists, skip_id, 1 << power_feature_workgroups, hist_buf, (acc_type *)shared_array, power_feature_workgroups); + within_kernel_reduction256x4(feature_subhists, skip_id, old_val, 1 << power_feature_workgroups, hist_buf, (acc_type *)shared_array, power_feature_workgroups); } } From be09b8f64e71db9466b5014eba7430b564196544 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Tue, 5 May 2020 03:55:33 +0000 Subject: [PATCH 038/119] Initial CUDA work --- src/treelearner/cuda_tree_learner.cpp | 4 ++-- .../kernels/histogram_16_64_256.cu | 24 +++++++++---------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 405cbc89022..4c1f57cf998 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -208,7 +208,7 @@ void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featu if (num_workgroups > preallocd_max_num_wg_[device_id]) { preallocd_max_num_wg_.at(device_id) = num_workgroups; CUDASUCCESS_OR_FATAL(cudaFree(device_subhistograms_[device_id])); - cudaMalloc(&(device_subhistograms_[device_id]), (size_t) num_workgroups * dword_features_ * device_bin_size_ * hist_bin_entry_sz_); + cudaMalloc(&(device_subhistograms_[device_id]), (size_t) num_workgroups * dword_features_ * device_bin_size_ * (3 * hist_bin_entry_sz_ / 2)); } //set thread_data SetThreadData(thread_data, device_id, histogram_size_, leaf_num_data, use_all_features, @@ -413,7 +413,7 @@ void CUDATreeLearner::AllocateGPUMemory() { if (!device_subhistograms_[device_id]) { // only initialize once here, as this will not need to change when ResetTrainingData() is called - CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_subhistograms_[device_id]), (size_t) preallocd_max_num_wg_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_)); + CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_subhistograms_[device_id]), (size_t) preallocd_max_num_wg_[device_id] * dword_features_ * device_bin_size_ * (3 * hist_bin_entry_sz_ / 2))); Log::Debug("created device_subhistograms_: %p", device_subhistograms_[device_id]); diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index 5ea8721cf99..02a02fed745 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -74,7 +74,7 @@ inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ } // skip the counters we already have - p += 2 * NUM_BINS; + p += 3 * NUM_BINS; for (i = i + 1; i < num_sub_hist; ++i) { grad_bin += *p; p += NUM_BINS; @@ -91,7 +91,7 @@ inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ #else output_buf[ltid * 2 + 1] = 
as_acc_type((acc_int_type)cont_bin); #endif - output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); + //output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); } #if USE_CONSTANT_BUF == 1 @@ -296,7 +296,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #endif #if POWER_FEATURE_WORKGROUPS != 0 - acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 2 * NUM_BINS; + acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 3 * NUM_BINS; // write gradients and hessians acc_type *__restrict__ ptr_f = output; for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { @@ -363,7 +363,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // locate our feature's block in output memory uint output_offset = (feature_id << power_feature_workgroups); acc_type const * __restrict__ feature_subhists = - (acc_type *)output_buf + output_offset * 2 * NUM_BINS; + (acc_type *)output_buf + output_offset * 3 * NUM_BINS; // skip reading the data already in local memory //uint skip_id = feature_id ^ output_offset; uint skip_id = group_id - output_offset; @@ -428,7 +428,7 @@ inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ } // skip the counters we already have - p += 2 * NUM_BINS; + p += 3 * NUM_BINS; for (i = i + 1; i < num_sub_hist; ++i) { grad_bin += *p; p += NUM_BINS; @@ -445,7 +445,7 @@ inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ #else output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); #endif - output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); +// output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); } #if USE_CONSTANT_BUF == 1 @@ -650,7 +650,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #endif #if POWER_FEATURE_WORKGROUPS != 0 - acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 2 * NUM_BINS; + acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 3 * NUM_BINS; // write gradients and hessians acc_type *__restrict__ ptr_f = output; for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { @@ -717,7 +717,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // locate our feature's block in output memory uint output_offset = (feature_id << power_feature_workgroups); acc_type const * __restrict__ feature_subhists = - (acc_type *)output_buf + output_offset * 2 * NUM_BINS; + (acc_type *)output_buf + output_offset * 3 * NUM_BINS; // skip reading the data already in local memory //uint skip_id = feature_id ^ output_offset; uint skip_id = group_id - output_offset; @@ -782,7 +782,7 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ } // skip the counters we already have - p += 2 * NUM_BINS; + p += 3 * NUM_BINS; for (i = i + 1; i < num_sub_hist; ++i) { grad_bin += *p; p += NUM_BINS; @@ -799,7 +799,7 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ #else output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); #endif - output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); +// output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); } #if USE_CONSTANT_BUF == 1 @@ -1004,7 +1004,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #endif #if POWER_FEATURE_WORKGROUPS != 0 - acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 2 * NUM_BINS; + acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 3 * NUM_BINS; // write gradients and hessians acc_type *__restrict__ 
ptr_f = output; for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { @@ -1071,7 +1071,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // locate our feature's block in output memory uint output_offset = (feature_id << power_feature_workgroups); acc_type const * __restrict__ feature_subhists = - (acc_type *)output_buf + output_offset * 2 * NUM_BINS; + (acc_type *)output_buf + output_offset * 3 * NUM_BINS; // skip reading the data already in local memory //uint skip_id = feature_id ^ output_offset; uint skip_id = group_id - output_offset; From 3d6addd279c7990c0a06b3d18264ee28b6b91754 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Tue, 5 May 2020 15:46:05 +0000 Subject: [PATCH 039/119] Initial CUDA work --- src/treelearner/cuda_tree_learner.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 4c1f57cf998..d84175f3377 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -1115,8 +1115,8 @@ void CUDATreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* righ Log::Fatal("1 Bug in GPU histogram! split %d: %d, smaller_leaf: %d, larger_leaf: %d\n", best_split_info.left_count, best_split_info.right_count, smaller_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf()); } } else { - smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), best_split_info.right_sum_gradient, best_split_info.right_sum_hessian); - larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, best_split_info.left_sum_hessian); + smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), best_split_info.right_sum_gradient, best_split_info.right_sum_hessian, best_split_info.right_output); + larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, best_split_info.left_sum_hessian, best_split_info.left_output); if ((best_split_info.left_count != larger_leaf_splits_->num_data_in_leaf()) || (best_split_info.right_count!= smaller_leaf_splits_->num_data_in_leaf())) { Log::Fatal("2 Bug in GPU histogram! 
split %d: %d, smaller_leaf: %d, larger_leaf: %d\n", best_split_info.left_count, best_split_info.right_count, smaller_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf()); From 0025bed6b90de37e5d1c6b5b60911c7c5309a8fb Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Fri, 8 May 2020 00:36:07 +0000 Subject: [PATCH 040/119] Initial CUDA work --- src/treelearner/cuda_tree_learner.cpp | 8 +- .../kernels/histogram_16_64_256.cu | 105 ++++-------------- 2 files changed, 28 insertions(+), 85 deletions(-) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index d84175f3377..7e2d7418a2b 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -208,7 +208,7 @@ void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featu if (num_workgroups > preallocd_max_num_wg_[device_id]) { preallocd_max_num_wg_.at(device_id) = num_workgroups; CUDASUCCESS_OR_FATAL(cudaFree(device_subhistograms_[device_id])); - cudaMalloc(&(device_subhistograms_[device_id]), (size_t) num_workgroups * dword_features_ * device_bin_size_ * (3 * hist_bin_entry_sz_ / 2)); + cudaMalloc(&(device_subhistograms_[device_id]), (size_t) num_workgroups * dword_features_ * device_bin_size_ * hist_bin_entry_sz_); } //set thread_data SetThreadData(thread_data, device_id, histogram_size_, leaf_num_data, use_all_features, @@ -413,7 +413,7 @@ void CUDATreeLearner::AllocateGPUMemory() { if (!device_subhistograms_[device_id]) { // only initialize once here, as this will not need to change when ResetTrainingData() is called - CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_subhistograms_[device_id]), (size_t) preallocd_max_num_wg_[device_id] * dword_features_ * device_bin_size_ * (3 * hist_bin_entry_sz_ / 2))); + CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_subhistograms_[device_id]), (size_t) preallocd_max_num_wg_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_)); Log::Debug("created device_subhistograms_: %p", device_subhistograms_[device_id]); @@ -1003,7 +1003,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ smaller_leaf_splits_->data_indices(), 0, num_data, - ordered_gradients_.data(), + gradients_, current_histogram); } else { printf("ConstructHistogram(): 4, num_data = %d, num_data_ = %d\n", num_data, num_data_); @@ -1011,7 +1011,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ smaller_leaf_splits_->data_indices(), 0, num_data, - ordered_gradients_.data(), ordered_hessians_.data(), + gradients_, hessians_, current_histogram); } } diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index 02a02fed745..e0ac3abfc2d 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -54,14 +54,6 @@ inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ // TODO: try to avoid bank conflict here acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; - uint* __restrict__ local_cnt = (uint *)(local_hist + 2 * NUM_BINS); - - uint cont_bin; - if (power_feature_workgroups != 0) { - cont_bin = ltid ? 
local_cnt[ltid] : old_val_cont_bin0; - } else { - cont_bin = local_cnt[ltid]; - } ushort i; if (power_feature_workgroups != 0) { @@ -70,28 +62,20 @@ inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ for (i = 0; i < skip_id; ++i) { grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; - cont_bin += as_acc_int_type(*p); p += NUM_BINS; } // skip the counters we already have - p += 3 * NUM_BINS; + p += 2 * NUM_BINS; for (i = i + 1; i < num_sub_hist; ++i) { grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; - cont_bin += as_acc_int_type(*p); p += NUM_BINS; } } __syncthreads(); - output_buf[ltid * 2 + 0] = grad_bin; -#if CONST_HESSIAN == 0 output_buf[ltid * 2 + 1] = hess_bin; -#else - output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); -#endif - //output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); } #if USE_CONSTANT_BUF == 1 @@ -152,7 +136,9 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // counter histogram // total size: 256 * size_of(uint) = 1 KB + #if CONST_HESSIAN == 1 uint *cnt_hist = (uint *)(gh_hist + 2 * NUM_BINS); + #endif // odd threads (1, 3, ...) compute histograms for hessians first // even thread (0, 2, ...) compute histograms for gradients first @@ -259,8 +245,10 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // prefetch the next iteration variables feature_next = feature_data[ind_next >> feature_mask]; + #if CONST_HESSIAN == 1 // STAGE 3: accumulate counter atomicAdd(cnt_hist + feature, 1); + #endif // STAGE 4: update next stat grad = grad_next; @@ -296,7 +284,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #endif #if POWER_FEATURE_WORKGROUPS != 0 - acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 3 * NUM_BINS; + acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 2 * NUM_BINS; // write gradients and hessians acc_type *__restrict__ ptr_f = output; for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { @@ -305,13 +293,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type value = gh_hist[i]; ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; } - // write counts - acc_int_type *__restrict__ ptr_i = (acc_int_type *)(output + 2 * NUM_BINS); - for (ushort i = ltid; i < NUM_BINS; i += lsize) { - // FIXME: 2-way bank conflict - uint value = cnt_hist[i]; - ptr_i[i] = value; - } // FIXME: is this right __syncthreads(); __threadfence(); @@ -338,7 +319,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // The is done by using an global atomic counter. // On AMD GPUs ideally this should be done in GDS, // but currently there is no easy way to access it via OpenCL. 
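// Note on the CONST_HESSIAN split above: when the hessian is constant, the
// per-bin hessian sum is fully determined by the per-bin data count
// (sum_hess = count * const_hessian), so the dedicated count histogram and the
// integer count bit-cast into the hessian slot are only needed in that case;
// with a real hessian the second accumulator already carries the information.
// A hedged host-side sketch of how such a count slot could later be rescaled
// back into hessian sums (the double/int64 layout and this free function are
// illustrative, not the actual learner code):
#include <cstdint>
#include <cstring>
static void rescale_const_hessian_example(double* hist,  // [2 * num_bins] as (grad, hess-or-count) pairs
                                          int num_bins,
                                          double const_hessian) {
  for (int b = 0; b < num_bins; ++b) {
    std::int64_t cnt;
    std::memcpy(&cnt, &hist[2 * b + 1], sizeof(cnt));  // slot holds a bit-cast integer count
    hist[2 * b + 1] = static_cast<double>(cnt) * const_hessian;
  }
}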
- uint * counter_val = cnt_hist; + uint * counter_val = (uint *)(gh_hist + 2 * NUM_BINS); // backup the old value uint old_val = *counter_val; if (ltid == 0) { @@ -363,7 +344,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // locate our feature's block in output memory uint output_offset = (feature_id << power_feature_workgroups); acc_type const * __restrict__ feature_subhists = - (acc_type *)output_buf + output_offset * 3 * NUM_BINS; + (acc_type *)output_buf + output_offset * 2 * NUM_BINS; // skip reading the data already in local memory //uint skip_id = feature_id ^ output_offset; uint skip_id = group_id - output_offset; @@ -408,14 +389,6 @@ inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ // TODO: try to avoid bank conflict here acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; - uint* __restrict__ local_cnt = (uint *)(local_hist + 2 * NUM_BINS); - - uint cont_bin; - if (power_feature_workgroups != 0) { - cont_bin = ltid ? local_cnt[ltid] : old_val_cont_bin0; - } else { - cont_bin = local_cnt[ltid]; - } ushort i; if (power_feature_workgroups != 0) { @@ -424,28 +397,20 @@ inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ for (i = 0; i < skip_id; ++i) { grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; - cont_bin += as_acc_int_type(*p); p += NUM_BINS; } // skip the counters we already have - p += 3 * NUM_BINS; + p += 2 * NUM_BINS; for (i = i + 1; i < num_sub_hist; ++i) { grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; - cont_bin += as_acc_int_type(*p); p += NUM_BINS; } } __syncthreads(); - output_buf[ltid * 2 + 0] = grad_bin; -#if CONST_HESSIAN == 0 output_buf[ltid * 2 + 1] = hess_bin; -#else - output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); -#endif -// output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); } #if USE_CONSTANT_BUF == 1 @@ -506,7 +471,9 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // counter histogram // total size: 256 * size_of(uint) = 1 KB + #if CONST_HESSIAN == 1 uint *cnt_hist = (uint *)(gh_hist + 2 * NUM_BINS); + #endif // odd threads (1, 3, ...) compute histograms for hessians first // even thread (0, 2, ...) 
compute histograms for gradients first @@ -613,8 +580,10 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // prefetch the next iteration variables feature_next = feature_data[ind_next >> feature_mask]; + #if CONST_HESSIAN == 1 // STAGE 3: accumulate counter atomicAdd(cnt_hist + feature, 1); + #endif // STAGE 4: update next stat grad = grad_next; @@ -650,7 +619,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #endif #if POWER_FEATURE_WORKGROUPS != 0 - acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 3 * NUM_BINS; + acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 2 * NUM_BINS; // write gradients and hessians acc_type *__restrict__ ptr_f = output; for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { @@ -659,13 +628,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type value = gh_hist[i]; ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; } - // write counts - acc_int_type *__restrict__ ptr_i = (acc_int_type *)(output + 2 * NUM_BINS); - for (ushort i = ltid; i < NUM_BINS; i += lsize) { - // FIXME: 2-way bank conflict - uint value = cnt_hist[i]; - ptr_i[i] = value; - } // FIXME: is this right __syncthreads(); __threadfence(); @@ -692,7 +654,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // The is done by using an global atomic counter. // On AMD GPUs ideally this should be done in GDS, // but currently there is no easy way to access it via OpenCL. - uint * counter_val = cnt_hist; + uint * counter_val = (uint *)(gh_hist + 2 * NUM_BINS); // backup the old value uint old_val = *counter_val; if (ltid == 0) { @@ -717,7 +679,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // locate our feature's block in output memory uint output_offset = (feature_id << power_feature_workgroups); acc_type const * __restrict__ feature_subhists = - (acc_type *)output_buf + output_offset * 3 * NUM_BINS; + (acc_type *)output_buf + output_offset * 2 * NUM_BINS; // skip reading the data already in local memory //uint skip_id = feature_id ^ output_offset; uint skip_id = group_id - output_offset; @@ -762,14 +724,6 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ // TODO: try to avoid bank conflict here acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; - uint* __restrict__ local_cnt = (uint *)(local_hist + 2 * NUM_BINS); - - uint cont_bin; - if (power_feature_workgroups != 0) { - cont_bin = ltid ? 
local_cnt[ltid] : old_val_cont_bin0; - } else { - cont_bin = local_cnt[ltid]; - } ushort i; if (power_feature_workgroups != 0) { @@ -778,28 +732,20 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ for (i = 0; i < skip_id; ++i) { grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; - cont_bin += as_acc_int_type(*p); p += NUM_BINS; } // skip the counters we already have - p += 3 * NUM_BINS; + p += 2 * NUM_BINS; for (i = i + 1; i < num_sub_hist; ++i) { grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; - cont_bin += as_acc_int_type(*p); p += NUM_BINS; } } __syncthreads(); - output_buf[ltid * 2 + 0] = grad_bin; -#if CONST_HESSIAN == 0 output_buf[ltid * 2 + 1] = hess_bin; -#else - output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); -#endif -// output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); } #if USE_CONSTANT_BUF == 1 @@ -860,7 +806,9 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // counter histogram // total size: 256 * size_of(uint) = 1 KB + #if CONST_HESSIAN == 1 uint *cnt_hist = (uint *)(gh_hist + 2 * NUM_BINS); + #endif // odd threads (1, 3, ...) compute histograms for hessians first // even thread (0, 2, ...) compute histograms for gradients first @@ -967,8 +915,10 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // prefetch the next iteration variables feature_next = feature_data[ind_next >> feature_mask]; + #if CONST_HESSIAN == 1 // STAGE 3: accumulate counter atomicAdd(cnt_hist + feature, 1); + #endif // STAGE 4: update next stat grad = grad_next; @@ -1004,7 +954,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #endif #if POWER_FEATURE_WORKGROUPS != 0 - acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 3 * NUM_BINS; + acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 2 * NUM_BINS; // write gradients and hessians acc_type *__restrict__ ptr_f = output; for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { @@ -1013,13 +963,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type value = gh_hist[i]; ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; } - // write counts - acc_int_type *__restrict__ ptr_i = (acc_int_type *)(output + 2 * NUM_BINS); - for (ushort i = ltid; i < NUM_BINS; i += lsize) { - // FIXME: 2-way bank conflict - uint value = cnt_hist[i]; - ptr_i[i] = value; - } // FIXME: is this right __syncthreads(); __threadfence(); @@ -1046,7 +989,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // The is done by using an global atomic counter. // On AMD GPUs ideally this should be done in GDS, // but currently there is no easy way to access it via OpenCL. 
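// Note on buffer sizing: each workgroup's slot in output_buf is 2 * NUM_BINS
// accumulators in this revision (3 * NUM_BINS in the revisions that also spill
// per-bin counts), and the device_subhistograms_ allocation on the host has to
// match, which is where the hist_bin_entry_sz_ versus 3 * hist_bin_entry_sz_ / 2
// factor in cuda_tree_learner.cpp comes from (hist_bin_entry_sz_ appears to
// cover the gradient + hessian pair per bin). A sketch of that size computation
// (a free function for illustration, not the learner's member):
#include <cstddef>
static std::size_t subhistogram_bytes_example(std::size_t num_workgroups,
                                              std::size_t dword_features,
                                              std::size_t device_bin_size,
                                              std::size_t hist_bin_entry_sz,
                                              bool counts_spilled) {
  const std::size_t per_bin_bytes = counts_spilled ? (3 * hist_bin_entry_sz / 2)  // grad + hess + count
                                                   : hist_bin_entry_sz;           // grad + hess only
  return num_workgroups * dword_features * device_bin_size * per_bin_bytes;
}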
- uint * counter_val = cnt_hist; + uint * counter_val = (uint *)(gh_hist + 2 * NUM_BINS); // backup the old value uint old_val = *counter_val; if (ltid == 0) { @@ -1071,7 +1014,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // locate our feature's block in output memory uint output_offset = (feature_id << power_feature_workgroups); acc_type const * __restrict__ feature_subhists = - (acc_type *)output_buf + output_offset * 3 * NUM_BINS; + (acc_type *)output_buf + output_offset * 2 * NUM_BINS; // skip reading the data already in local memory //uint skip_id = feature_id ^ output_offset; uint skip_id = group_id - output_offset; From b70604d5395dfc3472b394caeec17d892aaaa536 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Fri, 8 May 2020 01:31:18 +0000 Subject: [PATCH 041/119] Initial CUDA work --- src/treelearner/cuda_tree_learner.cpp | 48 ++------------------------- 1 file changed, 3 insertions(+), 45 deletions(-) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 7e2d7418a2b..e12b15075d0 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -102,9 +102,8 @@ union Float_t { } }; -int CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int dp_flag, int const_flag) { +void CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int dp_flag, int const_flag) { int i; - int retval = 0; printf("Comparing Histograms, feature_id = %d, size = %d\n", feature_id, (int) size); if (dp_flag) { // double precision double af, bf; @@ -114,14 +113,12 @@ int CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int d bf = GET_GRAD(h2, i); if (((std::fabs(af - bf))/af) >= 1e-6) { printf("i = %5d, h1.grad %13.6lf, h2.grad %13.6lf\n", i, af, bf); - ++retval; } if (const_flag) { ai = GET_HESS(((long long int *) h1), i); bi = GET_HESS(((long long int *) h2), i); if (ai != bi) { printf("i = %5d, h1.hess %lld, h2.hess %lld\n", i, ai, bi); - ++retval; } } else { @@ -129,7 +126,6 @@ int CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int d bf = GET_HESS(h2, i); if (((std::fabs(af - bf))/af) >= 1e-6) { printf("i = %5d, h1.hess %13.6lf, h2.hess %13.6lf\n", i, af, bf); - ++retval; } } } @@ -142,14 +138,12 @@ int CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int d bf = GET_GRAD(h2, i); if (((std::fabs(af - bf))/af) >= 1e-5) { printf("i = %5d, h1.grad %13.6f, h2.grad %13.6f\n", i, af, bf); - ++retval; } if (const_flag) { ai = GET_HESS(h1, i); bi = GET_HESS(h2, i); if (ai != bi) { printf("i = %5d, h1.hess %d, h2.hess %d\n", i, ai, bi); - ++retval; } } else { @@ -157,13 +151,11 @@ int CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int d bf = GET_HESS(h2, i); if (((std::fabs(af - bf))/af) >= 1e-5) { printf("i = %5d, h1.hess %13.6f, h2.hess %13.6f\n", i, af, bf); - ++retval; } } } } printf("DONE Comparing Histograms...\n"); - return retval; } #endif @@ -962,9 +954,8 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ // Compare GPU histogram with CPU histogram, useful for debuggin GPU code problem // #define GPU_DEBUG_COMPARE -#ifdef GPU_DEBUG_COMPARE + #ifdef GPU_DEBUG_COMPARE printf("Start Comparing_Histogram between GPU and CPU, num_dense_feature_groups_ = %d\n",num_dense_feature_groups_); - bool compare = true; for (int i = 0; i < num_dense_feature_groups_; ++i) { if (!feature_masks_[i]) continue; @@ -981,31 +972,13 @@ void CUDATreeLearner::ConstructHistograms(const 
std::vector& is_feature_ continue; } if ( num_data == num_data_ ) { - if ( is_constant_hessian_ ) { - printf("ConstructHistogram(): num_data == num_data_ is_constant_hessian_\n"); - train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( - 0, - num_data, - gradients_, - current_histogram); - } else { printf("ConstructHistogram(): num_data == num_data_\n"); train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( 0, num_data, gradients_, hessians_, current_histogram); - } } else { - if ( is_constant_hessian_ ) { - printf("ConstructHistogram(): is_constant_hessian_\n"); - train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( - smaller_leaf_splits_->data_indices(), - 0, - num_data, - gradients_, - current_histogram); - } else { printf("ConstructHistogram(): 4, num_data = %d, num_data_ = %d\n", num_data, num_data_); train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( smaller_leaf_splits_->data_indices(), @@ -1013,27 +986,12 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ num_data, gradients_, hessians_, current_histogram); - } - } - int retval; - if ( (num_data != num_data_) && compare ) { - retval = CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index, config_->gpu_use_dp, is_constant_hessian_); - if (retval < 4) printf("CompareHistograms reports only %d errors\n", retval); - compare = false; - } - retval = CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index, config_->gpu_use_dp, is_constant_hessian_); - if (num_data == num_data_) { - if (retval > 1) printf("CompareHistograms reports %d errors\n", retval); - } - else { - if (retval < 3) printf("CompareHistograms reports only %d errors\n", retval); } + CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index, config_->gpu_use_dp, is_constant_hessian_); std::copy(gpu_histogram, gpu_histogram + size * 2, current_histogram); delete [] gpu_histogram; - //break; // LGBM_CUDA: see only first feature info } printf("End Comparing Histogram between GPU and CPU\n"); -// #endif #endif if (larger_leaf_histogram_array_ != nullptr && !use_subtract) { From d92739df3052b3df6bedf66940575e87afb739ac Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Mon, 11 May 2020 18:25:59 +0000 Subject: [PATCH 042/119] Initial CUDA work --- src/treelearner/cuda_tree_learner.cpp | 52 +++++++- .../kernels/histogram_16_64_256.cu | 117 +++++++++++++----- 2 files changed, 134 insertions(+), 35 deletions(-) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index e12b15075d0..53eec14fbc2 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -102,8 +102,9 @@ union Float_t { } }; -void CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int dp_flag, int const_flag) { +int CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int dp_flag, int const_flag) { int i; + int retval = 0; printf("Comparing Histograms, feature_id = %d, size = %d\n", feature_id, (int) size); if (dp_flag) { // double precision double af, bf; @@ -113,12 +114,14 @@ void CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int bf = GET_GRAD(h2, i); if (((std::fabs(af - bf))/af) >= 1e-6) { printf("i = %5d, h1.grad %13.6lf, h2.grad %13.6lf\n", i, af, bf); + ++retval; } if (const_flag) { ai = GET_HESS(((long long int *) h1), i); bi = GET_HESS(((long long int *) h2), i); if (ai 
!= bi) { printf("i = %5d, h1.hess %lld, h2.hess %lld\n", i, ai, bi); + ++retval; } } else { @@ -126,6 +129,7 @@ void CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int bf = GET_HESS(h2, i); if (((std::fabs(af - bf))/af) >= 1e-6) { printf("i = %5d, h1.hess %13.6lf, h2.hess %13.6lf\n", i, af, bf); + ++retval; } } } @@ -138,12 +142,14 @@ void CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int bf = GET_GRAD(h2, i); if (((std::fabs(af - bf))/af) >= 1e-5) { printf("i = %5d, h1.grad %13.6f, h2.grad %13.6f\n", i, af, bf); + ++retval; } if (const_flag) { ai = GET_HESS(h1, i); bi = GET_HESS(h2, i); if (ai != bi) { printf("i = %5d, h1.hess %d, h2.hess %d\n", i, ai, bi); + ++retval; } } else { @@ -151,11 +157,13 @@ void CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int bf = GET_HESS(h2, i); if (((std::fabs(af - bf))/af) >= 1e-5) { printf("i = %5d, h1.hess %13.6f, h2.hess %13.6f\n", i, af, bf); + ++retval; } } } } printf("DONE Comparing Histograms...\n"); + return retval; } #endif @@ -200,7 +208,7 @@ void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featu if (num_workgroups > preallocd_max_num_wg_[device_id]) { preallocd_max_num_wg_.at(device_id) = num_workgroups; CUDASUCCESS_OR_FATAL(cudaFree(device_subhistograms_[device_id])); - cudaMalloc(&(device_subhistograms_[device_id]), (size_t) num_workgroups * dword_features_ * device_bin_size_ * hist_bin_entry_sz_); + cudaMalloc(&(device_subhistograms_[device_id]), (size_t) num_workgroups * dword_features_ * device_bin_size_ * (3 * hist_bin_entry_sz_ / 2)); } //set thread_data SetThreadData(thread_data, device_id, histogram_size_, leaf_num_data, use_all_features, @@ -405,7 +413,7 @@ void CUDATreeLearner::AllocateGPUMemory() { if (!device_subhistograms_[device_id]) { // only initialize once here, as this will not need to change when ResetTrainingData() is called - CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_subhistograms_[device_id]), (size_t) preallocd_max_num_wg_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_)); + CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_subhistograms_[device_id]), (size_t) preallocd_max_num_wg_[device_id] * dword_features_ * device_bin_size_ * (3 * hist_bin_entry_sz_ / 2))); Log::Debug("created device_subhistograms_: %p", device_subhistograms_[device_id]); @@ -954,8 +962,9 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ // Compare GPU histogram with CPU histogram, useful for debuggin GPU code problem // #define GPU_DEBUG_COMPARE - #ifdef GPU_DEBUG_COMPARE +#ifdef GPU_DEBUG_COMPARE printf("Start Comparing_Histogram between GPU and CPU, num_dense_feature_groups_ = %d\n",num_dense_feature_groups_); + bool compare = true; for (int i = 0; i < num_dense_feature_groups_; ++i) { if (!feature_masks_[i]) continue; @@ -972,13 +981,31 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ continue; } if ( num_data == num_data_ ) { + if ( is_constant_hessian_ ) { + printf("ConstructHistogram(): num_data == num_data_ is_constant_hessian_\n"); + train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( + 0, + num_data, + gradients_, + current_histogram); + } else { printf("ConstructHistogram(): num_data == num_data_\n"); train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( 0, num_data, gradients_, hessians_, current_histogram); + } } else { + if ( is_constant_hessian_ ) { + printf("ConstructHistogram(): is_constant_hessian_\n"); + 
train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( + smaller_leaf_splits_->data_indices(), + 0, + num_data, + gradients_, + current_histogram); + } else { printf("ConstructHistogram(): 4, num_data = %d, num_data_ = %d\n", num_data, num_data_); train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( smaller_leaf_splits_->data_indices(), @@ -986,12 +1013,27 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ num_data, gradients_, hessians_, current_histogram); + } + } + int retval; + if ( (num_data != num_data_) && compare ) { + retval = CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index, config_->gpu_use_dp, is_constant_hessian_); + printf("CompareHistograms reports %d errors\n", retval); + compare = false; + } + retval = CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index, config_->gpu_use_dp, is_constant_hessian_); + if (num_data == num_data_) { + printf("CompareHistograms reports %d errors\n", retval); + } + else { + printf("CompareHistograms reports %d errors\n", retval); } - CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index, config_->gpu_use_dp, is_constant_hessian_); std::copy(gpu_histogram, gpu_histogram + size * 2, current_histogram); delete [] gpu_histogram; + //break; // LGBM_CUDA: see only first feature info } printf("End Comparing Histogram between GPU and CPU\n"); +// #endif #endif if (larger_leaf_histogram_array_ != nullptr && !use_subtract) { diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index e0ac3abfc2d..3c194a22ddf 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -54,6 +54,14 @@ inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ // TODO: try to avoid bank conflict here acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; + uint* __restrict__ local_cnt = (uint *)(local_hist + 2 * NUM_BINS); + + uint cont_bin; + if (power_feature_workgroups != 0) { + cont_bin = ltid ? local_cnt[ltid] : old_val_cont_bin0; + } else { + cont_bin = local_cnt[ltid]; + } ushort i; if (power_feature_workgroups != 0) { @@ -62,20 +70,28 @@ inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ for (i = 0; i < skip_id; ++i) { grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; + cont_bin += as_acc_int_type(*p); p += NUM_BINS; } // skip the counters we already have - p += 2 * NUM_BINS; + p += 3 * NUM_BINS; for (i = i + 1; i < num_sub_hist; ++i) { grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; + cont_bin += as_acc_int_type(*p); p += NUM_BINS; } } __syncthreads(); + output_buf[ltid * 2 + 0] = grad_bin; +#if CONST_HESSIAN == 0 output_buf[ltid * 2 + 1] = hess_bin; +#else + output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); +#endif + //output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); } #if USE_CONSTANT_BUF == 1 @@ -136,9 +152,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // counter histogram // total size: 256 * size_of(uint) = 1 KB - #if CONST_HESSIAN == 1 uint *cnt_hist = (uint *)(gh_hist + 2 * NUM_BINS); - #endif // odd threads (1, 3, ...) compute histograms for hessians first // even thread (0, 2, ...) 
compute histograms for gradients first @@ -195,9 +209,9 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, score_t grad, hess; score_t grad_next, hess_next; // LGBM_CUDA v5.1 - grad = ordered_gradients[ind]; + grad = ordered_gradients[subglobal_tid]; #if CONST_HESSIAN == 0 - hess = ordered_hessians[ind]; + hess = ordered_hessians[subglobal_tid]; #endif @@ -245,10 +259,8 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // prefetch the next iteration variables feature_next = feature_data[ind_next >> feature_mask]; - #if CONST_HESSIAN == 1 // STAGE 3: accumulate counter atomicAdd(cnt_hist + feature, 1); - #endif // STAGE 4: update next stat grad = grad_next; @@ -284,7 +296,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #endif #if POWER_FEATURE_WORKGROUPS != 0 - acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 2 * NUM_BINS; + acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 3 * NUM_BINS; // write gradients and hessians acc_type *__restrict__ ptr_f = output; for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { @@ -293,6 +305,13 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type value = gh_hist[i]; ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; } + // write counts + acc_int_type *__restrict__ ptr_i = (acc_int_type *)(output + 2 * NUM_BINS); + for (ushort i = ltid; i < NUM_BINS; i += lsize) { + // FIXME: 2-way bank conflict + uint value = cnt_hist[i]; + ptr_i[i] = value; + } // FIXME: is this right __syncthreads(); __threadfence(); @@ -319,7 +338,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // The is done by using an global atomic counter. // On AMD GPUs ideally this should be done in GDS, // but currently there is no easy way to access it via OpenCL. - uint * counter_val = (uint *)(gh_hist + 2 * NUM_BINS); + uint * counter_val = cnt_hist; // backup the old value uint old_val = *counter_val; if (ltid == 0) { @@ -344,7 +363,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // locate our feature's block in output memory uint output_offset = (feature_id << power_feature_workgroups); acc_type const * __restrict__ feature_subhists = - (acc_type *)output_buf + output_offset * 2 * NUM_BINS; + (acc_type *)output_buf + output_offset * 3 * NUM_BINS; // skip reading the data already in local memory //uint skip_id = feature_id ^ output_offset; uint skip_id = group_id - output_offset; @@ -389,6 +408,14 @@ inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ // TODO: try to avoid bank conflict here acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; + uint* __restrict__ local_cnt = (uint *)(local_hist + 2 * NUM_BINS); + + uint cont_bin; + if (power_feature_workgroups != 0) { + cont_bin = ltid ? 
local_cnt[ltid] : old_val_cont_bin0; + } else { + cont_bin = local_cnt[ltid]; + } ushort i; if (power_feature_workgroups != 0) { @@ -397,20 +424,28 @@ inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ for (i = 0; i < skip_id; ++i) { grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; + cont_bin += as_acc_int_type(*p); p += NUM_BINS; } // skip the counters we already have - p += 2 * NUM_BINS; + p += 3 * NUM_BINS; for (i = i + 1; i < num_sub_hist; ++i) { grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; + cont_bin += as_acc_int_type(*p); p += NUM_BINS; } } __syncthreads(); + output_buf[ltid * 2 + 0] = grad_bin; +#if CONST_HESSIAN == 0 output_buf[ltid * 2 + 1] = hess_bin; +#else + output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); +#endif +// output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); } #if USE_CONSTANT_BUF == 1 @@ -471,9 +506,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // counter histogram // total size: 256 * size_of(uint) = 1 KB - #if CONST_HESSIAN == 1 uint *cnt_hist = (uint *)(gh_hist + 2 * NUM_BINS); - #endif // odd threads (1, 3, ...) compute histograms for hessians first // even thread (0, 2, ...) compute histograms for gradients first @@ -530,9 +563,9 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, score_t grad, hess; score_t grad_next, hess_next; // LGBM_CUDA v5.1 - grad = ordered_gradients[ind]; + grad = ordered_gradients[subglobal_tid]; #if CONST_HESSIAN == 0 - hess = ordered_hessians[ind]; + hess = ordered_hessians[subglobal_tid]; #endif @@ -580,10 +613,8 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // prefetch the next iteration variables feature_next = feature_data[ind_next >> feature_mask]; - #if CONST_HESSIAN == 1 // STAGE 3: accumulate counter atomicAdd(cnt_hist + feature, 1); - #endif // STAGE 4: update next stat grad = grad_next; @@ -619,7 +650,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #endif #if POWER_FEATURE_WORKGROUPS != 0 - acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 2 * NUM_BINS; + acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 3 * NUM_BINS; // write gradients and hessians acc_type *__restrict__ ptr_f = output; for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { @@ -628,6 +659,13 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type value = gh_hist[i]; ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; } + // write counts + acc_int_type *__restrict__ ptr_i = (acc_int_type *)(output + 2 * NUM_BINS); + for (ushort i = ltid; i < NUM_BINS; i += lsize) { + // FIXME: 2-way bank conflict + uint value = cnt_hist[i]; + ptr_i[i] = value; + } // FIXME: is this right __syncthreads(); __threadfence(); @@ -654,7 +692,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // The is done by using an global atomic counter. // On AMD GPUs ideally this should be done in GDS, // but currently there is no easy way to access it via OpenCL. 
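// Note on old_val / old_val_cont_bin0 in these kernels: the first slot of the
// shared counter histogram (cnt_hist[0], reached through counter_val) is reused
// to hold the arrival ticket returned by atomicAdd, so its original bin-0 count
// is saved into old_val beforehand and handed to the reduction, where
// "cont_bin = ltid ? local_cnt[ltid] : old_val_cont_bin0" restores it for
// thread 0. In the single-workgroup case (POWER_FEATURE_WORKGROUPS == 0) no
// ticket is taken, hence the dummy old_val declared in that branch.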
- uint * counter_val = (uint *)(gh_hist + 2 * NUM_BINS); + uint * counter_val = cnt_hist; // backup the old value uint old_val = *counter_val; if (ltid == 0) { @@ -679,7 +717,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // locate our feature's block in output memory uint output_offset = (feature_id << power_feature_workgroups); acc_type const * __restrict__ feature_subhists = - (acc_type *)output_buf + output_offset * 2 * NUM_BINS; + (acc_type *)output_buf + output_offset * 3 * NUM_BINS; // skip reading the data already in local memory //uint skip_id = feature_id ^ output_offset; uint skip_id = group_id - output_offset; @@ -724,6 +762,14 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ // TODO: try to avoid bank conflict here acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; + uint* __restrict__ local_cnt = (uint *)(local_hist + 2 * NUM_BINS); + + uint cont_bin; + if (power_feature_workgroups != 0) { + cont_bin = ltid ? local_cnt[ltid] : old_val_cont_bin0; + } else { + cont_bin = local_cnt[ltid]; + } ushort i; if (power_feature_workgroups != 0) { @@ -732,20 +778,28 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ for (i = 0; i < skip_id; ++i) { grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; + cont_bin += as_acc_int_type(*p); p += NUM_BINS; } // skip the counters we already have - p += 2 * NUM_BINS; + p += 3 * NUM_BINS; for (i = i + 1; i < num_sub_hist; ++i) { grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; + cont_bin += as_acc_int_type(*p); p += NUM_BINS; } } __syncthreads(); + output_buf[ltid * 2 + 0] = grad_bin; +#if CONST_HESSIAN == 0 output_buf[ltid * 2 + 1] = hess_bin; +#else + output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); +#endif +// output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); } #if USE_CONSTANT_BUF == 1 @@ -806,9 +860,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // counter histogram // total size: 256 * size_of(uint) = 1 KB - #if CONST_HESSIAN == 1 uint *cnt_hist = (uint *)(gh_hist + 2 * NUM_BINS); - #endif // odd threads (1, 3, ...) compute histograms for hessians first // even thread (0, 2, ...) 
compute histograms for gradients first @@ -865,9 +917,9 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, score_t grad, hess; score_t grad_next, hess_next; // LGBM_CUDA v5.1 - grad = ordered_gradients[ind]; + grad = ordered_gradients[subglobal_tid]; #if CONST_HESSIAN == 0 - hess = ordered_hessians[ind]; + hess = ordered_hessians[subglobal_tid]; #endif @@ -915,10 +967,8 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // prefetch the next iteration variables feature_next = feature_data[ind_next >> feature_mask]; - #if CONST_HESSIAN == 1 // STAGE 3: accumulate counter atomicAdd(cnt_hist + feature, 1); - #endif // STAGE 4: update next stat grad = grad_next; @@ -954,7 +1004,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #endif #if POWER_FEATURE_WORKGROUPS != 0 - acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 2 * NUM_BINS; + acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 3 * NUM_BINS; // write gradients and hessians acc_type *__restrict__ ptr_f = output; for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { @@ -963,6 +1013,13 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type value = gh_hist[i]; ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; } + // write counts + acc_int_type *__restrict__ ptr_i = (acc_int_type *)(output + 2 * NUM_BINS); + for (ushort i = ltid; i < NUM_BINS; i += lsize) { + // FIXME: 2-way bank conflict + uint value = cnt_hist[i]; + ptr_i[i] = value; + } // FIXME: is this right __syncthreads(); __threadfence(); @@ -989,7 +1046,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // The is done by using an global atomic counter. // On AMD GPUs ideally this should be done in GDS, // but currently there is no easy way to access it via OpenCL. 
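// Note on output_offset / skip_id in these kernels: workgroups are laid out
// feature-major, so a feature's 2^power_feature_workgroups sub-histograms
// occupy consecutive group ids; output_offset recovers the feature's first
// workgroup and skip_id the position of this workgroup within the feature (its
// own copy is skipped because it is still held in shared memory). A small
// sketch of the index arithmetic (a free function for illustration; the kernels
// do the equivalent inline):
static inline void decode_group_example(unsigned int group_id,
                                        unsigned int power_feature_workgroups,
                                        unsigned int* feature_id,
                                        unsigned int* sub_id) {
  *feature_id = group_id >> power_feature_workgroups;                   // feature served by this workgroup
  *sub_id = group_id - (*feature_id << power_feature_workgroups);       // == skip_id in the kernel
}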
- uint * counter_val = (uint *)(gh_hist + 2 * NUM_BINS); + uint * counter_val = cnt_hist; // backup the old value uint old_val = *counter_val; if (ltid == 0) { @@ -1014,7 +1071,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // locate our feature's block in output memory uint output_offset = (feature_id << power_feature_workgroups); acc_type const * __restrict__ feature_subhists = - (acc_type *)output_buf + output_offset * 2 * NUM_BINS; + (acc_type *)output_buf + output_offset * 3 * NUM_BINS; // skip reading the data already in local memory //uint skip_id = feature_id ^ output_offset; uint skip_id = group_id - output_offset; From 429e752269c98d283a51f95853c476480124c1c1 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Tue, 12 May 2020 02:48:55 +0000 Subject: [PATCH 043/119] Initial CUDA work --- src/treelearner/cuda_tree_learner.cpp | 6 ++-- .../kernels/histogram_16_64_256.cu | 33 +++++++++---------- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 53eec14fbc2..d87e7addef1 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -112,7 +112,7 @@ int CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int d for (i = 0; i < (int) size; ++i) { af = GET_GRAD(h1, i); bf = GET_GRAD(h2, i); - if (((std::fabs(af - bf))/af) >= 1e-6) { + if ((((std::fabs(af - bf))/af) >= 1e-6) && ((std::fabs(af - bf)) >= 1e-6)) { printf("i = %5d, h1.grad %13.6lf, h2.grad %13.6lf\n", i, af, bf); ++retval; } @@ -140,7 +140,7 @@ int CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int d for (i = 0; i < (int) size; ++i) { af = GET_GRAD(h1, i); bf = GET_GRAD(h2, i); - if (((std::fabs(af - bf))/af) >= 1e-5) { + if ((((std::fabs(af - bf))/af) >= 1e-6) && ((std::fabs(af - bf)) >= 1e-6)) { printf("i = %5d, h1.grad %13.6f, h2.grad %13.6f\n", i, af, bf); ++retval; } @@ -1033,6 +1033,8 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ //break; // LGBM_CUDA: see only first feature info } printf("End Comparing Histogram between GPU and CPU\n"); + fflush(stderr); + fflush(stdout); // #endif #endif diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index 3c194a22ddf..a85918cc3c6 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -91,7 +91,6 @@ inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ #else output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); #endif - //output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); } #if USE_CONSTANT_BUF == 1 @@ -219,17 +218,18 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, for (uint i = subglobal_tid; i < num_data; i += subglobal_size) { // prefetch the next iteration variables // we don't need bondary check because we have made the buffer large + int i_next = i + subglobal_size; #ifdef IGNORE_INDICES // we need to check to bounds here - ind_next = i + subglobal_size < num_data ? i + subglobal_size : i; + ind_next = i_next < num_data ? 
i_next : i; #else - ind_next = data_indices[i + subglobal_size]; + ind_next = data_indices[i_next]; #endif // imbGBT v5.1 - grad_next = ordered_gradients[ind_next]; + grad_next = ordered_gradients[i_next]; #if CONST_HESSIAN == 0 - hess_next = ordered_hessians[ind_next]; + hess_next = ordered_hessians[i_next]; #endif // STAGE 2: accumulate gradient and hessian @@ -441,11 +441,10 @@ inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ output_buf[ltid * 2 + 0] = grad_bin; #if CONST_HESSIAN == 0 - output_buf[ltid * 2 + 1] = hess_bin; + output_buf[ltid * 2 + 1] = hess_bin; #else output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); #endif -// output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); } #if USE_CONSTANT_BUF == 1 @@ -573,17 +572,18 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, for (uint i = subglobal_tid; i < num_data; i += subglobal_size) { // prefetch the next iteration variables // we don't need bondary check because we have made the buffer large + int i_next = i + subglobal_size; #ifdef IGNORE_INDICES // we need to check to bounds here - ind_next = i + subglobal_size < num_data ? i + subglobal_size : i; + ind_next = i_next < num_data ? i_next : i; #else - ind_next = data_indices[i + subglobal_size]; + ind_next = data_indices[i_next]; #endif // imbGBT v5.1 - grad_next = ordered_gradients[ind_next]; + grad_next = ordered_gradients[i_next]; #if CONST_HESSIAN == 0 - hess_next = ordered_hessians[ind_next]; + hess_next = ordered_hessians[i_next]; #endif // STAGE 2: accumulate gradient and hessian @@ -792,14 +792,12 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ } __syncthreads(); - output_buf[ltid * 2 + 0] = grad_bin; #if CONST_HESSIAN == 0 output_buf[ltid * 2 + 1] = hess_bin; #else output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); #endif -// output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); } #if USE_CONSTANT_BUF == 1 @@ -927,17 +925,18 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, for (uint i = subglobal_tid; i < num_data; i += subglobal_size) { // prefetch the next iteration variables // we don't need bondary check because we have made the buffer large + int i_next = i + subglobal_size; #ifdef IGNORE_INDICES // we need to check to bounds here - ind_next = i + subglobal_size < num_data ? i + subglobal_size : i; + ind_next = i_next < num_data ? 
i_next : i; #else - ind_next = data_indices[i + subglobal_size]; + ind_next = data_indices[i_next]; #endif // imbGBT v5.1 - grad_next = ordered_gradients[ind_next]; + grad_next = ordered_gradients[i_next]; #if CONST_HESSIAN == 0 - hess_next = ordered_hessians[ind_next]; + hess_next = ordered_hessians[i_next]; #endif // STAGE 2: accumulate gradient and hessian From c7c22a57dc0e55f2697870766716c50409ff3736 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Fri, 15 May 2020 17:39:29 +0000 Subject: [PATCH 044/119] Initial CUDA work --- src/treelearner/cuda_tree_learner.cpp | 4 ++-- src/treelearner/gpu_tree_learner.cpp | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index d87e7addef1..3ea95268032 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -87,7 +87,7 @@ void PrintHistograms(hist_t* h, size_t size) { double total_hess = 0; for (size_t i = 0; i < size; ++i) { printf("%03lu=%9.3g,%9.3g\t", i, GET_GRAD(h, i), GET_HESS(h, i)); - if ((i & 2) == 2) + if ((i & 3) == 3) printf("\n"); total_hess += GET_HESS(h, i); } @@ -1098,7 +1098,7 @@ void CUDATreeLearner::FindBestSplits() { if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->leaf_index() < 0) { continue; } printf("CUDATreeLearner::FindBestSplits() Feature %d bin_size=%zd larger leaf:\n", feature_index, bin_size); - PrintHistograms(larger_leaf_histogram_array_[feature_index].RawData() - kHistOffset1, bin_size); + PrintHistograms(larger_leaf_histogram_array_[feature_index].RawData() - kHistOffset, bin_size); } #endif } diff --git a/src/treelearner/gpu_tree_learner.cpp b/src/treelearner/gpu_tree_learner.cpp index 7f336b075d4..fad02e1c044 100644 --- a/src/treelearner/gpu_tree_learner.cpp +++ b/src/treelearner/gpu_tree_learner.cpp @@ -52,7 +52,7 @@ void PrintHistograms(hist_t* h, size_t size) { double total_hess = 0; for (size_t i = 0; i < size; ++i) { printf("%03lu=%9.3g,%9.3g\t", i, GET_GRAD(h, i), GET_HESS(h, i)); - if ((i & 2) == 2) + if ((i & 3) == 3) printf("\n"); total_hess += GET_HESS(h, i); } @@ -1069,10 +1069,10 @@ void GPUTreeLearner::FindBestSplits(const Tree* tree) { } size_t bin_size = train_data_->FeatureNumBin(feature_index) + 1; printf("Feature %d smaller leaf:\n", feature_index); - PrintHistograms(smaller_leaf_histogram_array_[feature_index].RawData() - 1, bin_size); + PrintHistograms(smaller_leaf_histogram_array_[feature_index].RawData() - kHistOffset, bin_size); if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->LeafIndex() < 0) { continue; } printf("Feature %d larger leaf:\n", feature_index); - PrintHistograms(larger_leaf_histogram_array_[feature_index].RawData() - 1, bin_size); + PrintHistograms(larger_leaf_histogram_array_[feature_index].RawData() - kHistOffset, bin_size); } #endif } From aad98f0ed8ee94e93e1525edbde811674bdd42b9 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Sun, 24 May 2020 12:23:10 +0000 Subject: [PATCH 045/119] Initial CUDA work --- build_LGBM.232.sh | 1 + include/LightGBM/bin.h | 6 + include/LightGBM/feature_group.h | 1 + src/boosting/gbdt.cpp | 24 +++- src/io/dataset.cpp | 32 ++++- src/io/dense_bin.hpp | 38 +++++ src/io/sparse_bin.hpp | 31 +++++ src/treelearner/cuda_tree_learner.cpp | 10 +- .../kernels/histogram_16_64_256.cu | 131 ++++++++++++++++-- src/treelearner/serial_tree_learner.cpp | 42 ++++++ 10 files changed, 297 insertions(+), 19 deletions(-) diff --git a/build_LGBM.232.sh b/build_LGBM.232.sh index 5e500327108..f785d6556e6 
100755 --- a/build_LGBM.232.sh +++ b/build_LGBM.232.sh @@ -3,4 +3,5 @@ rm -rf build mkdir build cd build cmake -DUSE_CUDA=1 .. +#cmake .. make -j40 diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index e541e7039e9..c09cde3c809 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -308,6 +308,12 @@ class Bin { * \param ordered_hessians Pointer to hessians, the data_indices[i]-th data's hessian is ordered_hessians[i] * \param out Output Result */ + + virtual void ConstructHistogramDebug( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, + hist_t* out) const = 0; + virtual void ConstructHistogram( const data_size_t* data_indices, data_size_t start, data_size_t end, const score_t* ordered_gradients, const score_t* ordered_hessians, diff --git a/include/LightGBM/feature_group.h b/include/LightGBM/feature_group.h index d949beec20e..d5eac42db48 100644 --- a/include/LightGBM/feature_group.h +++ b/include/LightGBM/feature_group.h @@ -176,6 +176,7 @@ class FeatureGroup { inline void CopySubrow(const FeatureGroup* full_feature, const data_size_t* used_indices, data_size_t num_used_indices) { if (!is_multi_val_) { +//fprintf(stderr, "CopySubrow CP1A\n"); fflush(stderr); bin_data_->CopySubrow(full_feature->bin_data_.get(), used_indices, num_used_indices); } else { for (int i = 0; i < num_feature_; ++i) { diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 55e11312235..f02c5d940f1 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -243,8 +243,10 @@ data_size_t GBDT::BalancedBaggingHelper(data_size_t start, data_size_t cnt, void GBDT::Bagging(int iter) { Common::FunctionTimer fun_timer("GBDT::Bagging", global_timer); // if need bagging +fprintf(stderr, "inside GBDT::Bagging!\n"); fflush(stderr); if ((bag_data_cnt_ < num_data_ && iter % config_->bagging_freq == 0) || need_re_bagging_) { +//fprintf(stderr, "inside GBDT::Bagging, past first hurdle\n"); fflush(stderr); need_re_bagging_ = false; auto left_cnt = bagging_runner_.Run( num_data_, @@ -263,7 +265,9 @@ void GBDT::Bagging(int iter) { bag_data_cnt_ = left_cnt; Log::Debug("Re-bagging, using %d data to train", bag_data_cnt_); // set bagging data to tree learner +//fprintf(stderr, "inside GBDT::Bagging, past second hurdle\n"); fflush(stderr); if (!is_use_subset_) { +//fprintf(stderr, "inside GBDT::Bagging, calling SetBaggingData\n"); fflush(stderr); tree_learner_->SetBaggingData(nullptr, bag_data_indices_.data(), bag_data_cnt_); } else { // LGBM_CUDA // NEW get subset @@ -275,11 +279,21 @@ void GBDT::Bagging(int iter) { tmp_hessians_.resize(total_size); } +//fprintf(stderr, "CopySubrow CP2, bag_data_cnt_ = %d\n", bag_data_cnt_); fflush(stderr); +//char *temp_bag = (char *) bag_data_indices_.data(); +//for (int i=0; iCopySubrow(train_data_, bag_data_indices_.data(), bag_data_cnt_, false); +//fprintf(stderr, "CopySubrow CP2, calling tree_learner_->ResetTrainingData\n"); fflush(stderr); tree_learner_->ResetTrainingData(tmp_subset_.get(), is_constant_hessian_); +//fprintf(stderr, "CopySubrow CP2, back from tree_learner_->ResetTrainingData\n"); fflush(stderr); } } +fprintf(stderr, "returning from GBDT::Bagging!\n"); fflush(stderr); } void GBDT::Train(int snapshot_freq, const std::string& model_output_path) { @@ -382,11 +396,14 @@ double GBDT::BoostFromAverage(int class_id, bool update_scorer) { // LGBM_CUDA bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { +//fprintf(stderr, "inside TrainOneIterCUDA CP103\n"); 
fflush(stderr); + // LGBM_CUDA invoke baggging during the first iteration if ((config_->device_type == std::string("cuda")) && (iter_ == 0)) { // auto start_time = std::chrono::steady_clock::now(); +//fprintf(stderr, "calling Bagging CP104\n"); fflush(stderr); Bagging(0); } @@ -407,8 +424,9 @@ bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { hessians = hessians_.data(); } +//fprintf(stderr, "inside TrainOneIterCUDA CP105, bagging commented out\n"); fflush(stderr); // LGBM_CUDA bagging logic - // Bagging(iter_); + // Bagging(iter_); // GCF trial and error bool should_continue = false; for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { @@ -447,8 +465,10 @@ bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { // LGBM_CUDA new_tree.reset(tree_learner_->Train(tmp_grad, tmp_hess, is_constant_hessian_, forced_splits_json_)); } +//fprintf(stderr, "inside TrainOneIterCUDA, num_leaves = %d\n", new_tree->num_leaves()); fflush(stderr); if (new_tree->num_leaves() > 1) { +//fprintf(stderr, "inside TrainOneIterCUDA CP106, this clause doesn't do bagging\n"); fflush(stderr); should_continue = true; auto score_ptr = train_score_updater_->score() + offset; auto residual_getter = [score_ptr](const label_t* label, int i) {return static_cast(label[i]) - score_ptr[i]; }; @@ -481,12 +501,14 @@ bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { } // LGBM_CUDA: moved for overlapping data copy w/ other operations +//fprintf(stderr, "inside TrainOneIterCUDA CP107\n"); fflush(stderr); int iter_next = iter_ + 1; if (iter_next < config_->num_iterations) { // auto start_time = std::chrono::steady_clock::now(); // bagging logic +//fprintf(stderr, "inside TrainOneIterCUDA CP108\n"); fflush(stderr); Bagging(iter_next); } diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index a020f425f3a..b796e1bc2cb 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -801,8 +801,10 @@ void Dataset::CopySubrow(const Dataset* fullset, const data_size_t* used_indices, data_size_t num_used_indices, bool need_meta_data) { CHECK_EQ(num_used_indices, num_data_); +fprintf(stderr, "CopySubrow CP3, used_indices[5503] = %4d\n", (int) used_indices[5503]); fflush(stderr); OMP_INIT_EX(); #pragma omp parallel for schedule(static) + for (int group = 0; group < num_groups_; ++group) { OMP_LOOP_EX_BEGIN(); feature_groups_[group]->CopySubrow(fullset->feature_groups_[group].get(), @@ -1310,10 +1312,18 @@ void Dataset::ConstructHistogramsInner( data_size_t num_data, const score_t* gradients, const score_t* hessians, score_t* ordered_gradients, score_t* ordered_hessians, TrainingShareStates* share_state, hist_t* hist_data) const { + +fprintf(stderr, "CPU "); +if (!USE_INDICES) fprintf(stderr, "IGNORE_INDICES "); +if (!USE_HESSIAN) fprintf(stderr, "CONST_HESSIAN "); +fprintf(stderr, "\n"); fflush(stderr); +//fprintf(stderr, "gradients[2161] = %lf\n", gradients[2161]); fflush(stderr); + if (!share_state->is_colwise) { return ConstructHistogramsMultiVal( data_indices, num_data, gradients, hessians, share_state, hist_data); } + std::vector used_dense_group; int multi_val_groud_id = -1; used_dense_group.reserve(num_groups_); @@ -1358,12 +1368,20 @@ void Dataset::ConstructHistogramsInner( } } OMP_INIT_EX(); +if (USE_INDICES) { + //fprintf(stderr, " data_indices = %3d %3d %3d %3d %3d %3d %3d %3d\n", data_indices[0], data_indices[1], data_indices[2], data_indices[3], data_indices[4], data_indices[5], data_indices[6], data_indices[7]); 
fflush(stderr); + //fprintf(stderr, " gradients = %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf\n", ptr_ordered_grad[0], ptr_ordered_grad[1], ptr_ordered_grad[2], ptr_ordered_grad[3], ptr_ordered_grad[4], ptr_ordered_grad[5], ptr_ordered_grad[6], ptr_ordered_grad[7]); fflush(stderr); + //fprintf(stderr, " hessians = %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf\n", ptr_ordered_hess[0], ptr_ordered_hess[1], ptr_ordered_hess[2], ptr_ordered_hess[3], ptr_ordered_hess[4], ptr_ordered_hess[5], ptr_ordered_hess[6], ptr_ordered_hess[7]); fflush(stderr); +//fprintf(stderr, " offset into return array for gi = 0: %d\n", (int) group_bin_boundaries_[used_dense_group[0]]); fflush(stderr); +} + #pragma omp parallel for schedule(static) num_threads(share_state->num_threads) for (int gi = 0; gi < num_used_dense_group; ++gi) { OMP_LOOP_EX_BEGIN(); int group = used_dense_group[gi]; auto data_ptr = hist_data + group_bin_boundaries_[group] * 2; const int num_bin = feature_groups_[group]->num_total_bin_; +//fprintf(stderr, "gi = %2d, group_bin_boundaries_[%2d] = %4d, num_bin = %d\n", gi, (int) group, (int) group_bin_boundaries_[group], (int) num_bin); std::memset(reinterpret_cast(data_ptr), 0, num_bin * kHistEntrySize); if (USE_HESSIAN) { @@ -1372,8 +1390,16 @@ void Dataset::ConstructHistogramsInner( data_indices, 0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr); } else { - feature_groups_[group]->bin_data_->ConstructHistogram( - 0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr); + if (gi == 0) { +//fprintf(stderr, " calling core ConstructHistogramDebug\n"); fflush(stderr); + feature_groups_[group]->bin_data_->ConstructHistogramDebug( + 0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr); +//fprintf(stderr, " back from ConstructHistogramDebug, hist_data = %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf\n", hist_data[0], hist_data[1], hist_data[2], hist_data[3], hist_data[4], hist_data[5]); fflush(stderr); + } + else { + feature_groups_[group]->bin_data_->ConstructHistogram( + 0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr); + } } } else { if (USE_INDICES) { @@ -1392,6 +1418,7 @@ void Dataset::ConstructHistogramsInner( } OMP_THROW_EX(); } +//fprintf(stderr, " leaving 'CPU kernel' hist_data = %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf\n", hist_data[0], hist_data[1], hist_data[2], hist_data[3], hist_data[4], hist_data[5]); fflush(stderr); global_timer.Stop("Dataset::dense_bin_histogram"); if (multi_val_groud_id >= 0) { if (num_used_dense_group > 0) { @@ -1439,6 +1466,7 @@ void Dataset::FixHistogram(int feature_idx, double sum_gradient, const BinMapper* bin_mapper = feature_groups_[group]->bin_mappers_[sub_feature].get(); const int most_freq_bin = bin_mapper->GetMostFreqBin(); +//fprintf(stderr, "in Dataset::FixHistogram, feature_idx = %2d, group = %2d, sub_feature = %d, most_freq_bin = %3d\n", feature_idx, group, sub_feature, most_freq_bin); fflush(stderr); if (most_freq_bin > 0) { const int num_bin = bin_mapper->num_bin(); GET_GRAD(data, most_freq_bin) = sum_gradient; diff --git a/src/io/dense_bin.hpp b/src/io/dense_bin.hpp index 99feadf9f7f..803e85a6dab 100644 --- a/src/io/dense_bin.hpp +++ b/src/io/dense_bin.hpp @@ -100,6 +100,32 @@ class DenseBin : public Bin { BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override; + template + void ConstructHistogramInnerDebug(data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* ordered_hessians, + hist_t* out) const { + data_size_t i = start; + 
hist_t* grad = out; + hist_t* hess = out + 1; + hist_cnt_t* cnt = reinterpret_cast(hess); +//fprintf(stderr, " inside ConstructHistogramInnerDebug, i = %d\n", i); fflush(stderr); +//fprintf(stderr, " DEBUG: data(5503) = %d\n", data(5503)); + for (; i < end; ++i) { + const auto idx = i; + const auto ti = static_cast(data(idx)) << 1; +//if (ti == 2) fprintf(stderr, " data(%4d) = %4d, adding %7.4lf\n", idx, data(idx), ordered_gradients[i]); fflush(stderr); + if (USE_HESSIAN) { + grad[ti] += ordered_gradients[i]; + hess[ti] += ordered_hessians[i]; + } else { + grad[ti] += ordered_gradients[i]; + ++cnt[ti]; + } + } +//fprintf(stderr, " leaving ConstructHistogramInnerDebug, out[2/3] = %7.4lf %7.4lf\n", out[2], out[3]); + } + template void ConstructHistogramInner(const data_size_t* data_indices, data_size_t start, data_size_t end, @@ -145,6 +171,16 @@ class DenseBin : public Bin { } } + void ConstructHistogramDebug(data_size_t start, + data_size_t end, const score_t* ordered_gradients, + const score_t* ordered_hessians, + hist_t* out) const { +//fprintf(stderr, " calling ConstructHistogramInnerDebug\n"); fflush(stderr); + ConstructHistogramInnerDebug( + start, end, ordered_gradients, ordered_hessians, out); +//fprintf(stderr, " back from ConstructHistogramInnerDebug\n"); fflush(stderr); + } + void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end, const score_t* ordered_gradients, const score_t* ordered_hessians, @@ -388,6 +424,7 @@ class DenseBin : public Bin { const void* memory, const std::vector& local_used_indices) override { const VAL_T* mem_data = reinterpret_cast(memory); +//fprintf(stderr, "inside LoadFromMemory (in src/io/dense_bin.hpp), updating the FEATURE DATA, num_data_ = %d\n", (int) num_data_); if (!local_used_indices.empty()) { if (IS_4BIT) { const data_size_t rest = num_data_ & 1; @@ -431,6 +468,7 @@ class DenseBin : public Bin { void CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override { auto other_bin = dynamic_cast*>(full_bin); +//fprintf(stderr, "inside CopySubrow (in src/io/dense_bin.hpp), updating the FEATURE DATA, num_used_indices = %d\n", (int) num_used_indices); if (IS_4BIT) { const data_size_t rest = num_used_indices & 1; for (int i = 0; i < num_used_indices - rest; i += 2) { diff --git a/src/io/sparse_bin.hpp b/src/io/sparse_bin.hpp index c56cd6da99d..74cdb08c82b 100644 --- a/src/io/sparse_bin.hpp +++ b/src/io/sparse_bin.hpp @@ -98,6 +98,37 @@ class SparseBin : public Bin { hist[ti] += g; \ hist[ti + 1] += h; + void ConstructHistogramDebug(data_size_t start, + data_size_t end, const score_t* ordered_gradients, + const score_t* ordered_hessians, + hist_t* out) const { + data_size_t i_delta, cur_pos; + InitIndex(start, &i_delta, &cur_pos); + data_size_t i = start; + for (;;) { + if (cur_pos < i) { + cur_pos += deltas_[++i_delta]; + if (i_delta >= num_vals_) { + break; + } + } else if (cur_pos > i) { + if (++i >= end) { + break; + } + } else { + const VAL_T bin = vals_[i_delta]; + ACC_GH(out, bin, ordered_gradients[i], ordered_hessians[i]); + if (++i >= end) { + break; + } + cur_pos += deltas_[++i_delta]; + if (i_delta >= num_vals_) { + break; + } + } + } + } + void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end, const score_t* ordered_gradients, const score_t* ordered_hessians, diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 3ea95268032..d59c60c3957 100644 --- 
a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -919,12 +919,14 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - kHistOffset; // Check workgroups per feature4 tuple.. - int exp_workgroups_per_feature = GetNumWorkgroupsPerFeature(smaller_leaf_splits_->num_data_in_leaf()); +// GCF Let's try this!!! +// int exp_workgroups_per_feature = GetNumWorkgroupsPerFeature(smaller_leaf_splits_->num_data_in_leaf()); // if the workgroup per feature is 1 (2^0), return as the work is too small for a GPU - if (exp_workgroups_per_feature == 0){ - return SerialTreeLearner::ConstructHistograms(is_feature_used, use_subtract); - } +// GCF Let's try this!!! +// if (exp_workgroups_per_feature == 0){ +// return SerialTreeLearner::ConstructHistograms(is_feature_used, use_subtract); +// } // ConstructGPUHistogramsAsync will return true if there are availabe feature gourps dispatched to GPU bool is_gpu_used = ConstructGPUHistogramsAsync(is_feature_used, diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index a85918cc3c6..42d7c0d4d01 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -208,9 +208,9 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, score_t grad, hess; score_t grad_next, hess_next; // LGBM_CUDA v5.1 - grad = ordered_gradients[subglobal_tid]; + grad = ordered_gradients[ind]; #if CONST_HESSIAN == 0 - hess = ordered_hessians[subglobal_tid]; + hess = ordered_hessians[ind]; #endif @@ -227,9 +227,9 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #endif // imbGBT v5.1 - grad_next = ordered_gradients[i_next]; + grad_next = ordered_gradients[ind_next]; #if CONST_HESSIAN == 0 - hess_next = ordered_hessians[i_next]; + hess_next = ordered_hessians[ind_next]; #endif // STAGE 2: accumulate gradient and hessian @@ -562,9 +562,9 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, score_t grad, hess; score_t grad_next, hess_next; // LGBM_CUDA v5.1 - grad = ordered_gradients[subglobal_tid]; + grad = ordered_gradients[ind]; #if CONST_HESSIAN == 0 - hess = ordered_hessians[subglobal_tid]; + hess = ordered_hessians[ind]; #endif @@ -581,9 +581,9 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #endif // imbGBT v5.1 - grad_next = ordered_gradients[i_next]; + grad_next = ordered_gradients[ind_next]; #if CONST_HESSIAN == 0 - hess_next = ordered_hessians[i_next]; + hess_next = ordered_hessians[ind_next]; #endif // STAGE 2: accumulate gradient and hessian @@ -759,6 +759,9 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ acc_type* __restrict__ local_hist, const size_t power_feature_workgroups) { const ushort ltid = threadIdx.x; +//#ifdef IGNORE_INDICES +// const uint gtid = blockIdx.x * blockDim.x + threadIdx.x; +//#endif // TODO: try to avoid bank conflict here acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; @@ -772,6 +775,10 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ } ushort i; +//#ifdef IGNORE_INDICES +//if (gtid == 1) printf(" skip_id = %d, grad_bin = %7.4lf\n", skip_id, grad_bin); +//#endif + if (power_feature_workgroups != 0) { // add all sub-histograms for feature const acc_type* __restrict__ p = feature_sub_hist + ltid; @@ -785,6 +792,9 @@ inline void __device__ 
within_kernel_reduction256x4(const acc_type* __restrict__ p += 3 * NUM_BINS; for (i = i + 1; i < num_sub_hist; ++i) { +//#ifdef IGNORE_INDICES +//if (gtid == 1) printf(" adding %7.4lf\n", *p); +//#endif grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; cont_bin += as_acc_int_type(*p); p += NUM_BINS; @@ -798,6 +808,13 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ #else output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); #endif + +//#ifdef IGNORE_INDICES +//__syncthreads(); +//if (gtid == 1) printf("KERNEL returning %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf\n", output_buf[0], output_buf[1], output_buf[2], output_buf[3], output_buf[4], output_buf[5]); +//__syncthreads(); +//#endif + } #if USE_CONSTANT_BUF == 1 @@ -842,6 +859,42 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, const ushort lsize = NUM_BINS; // get_local_size(0); const ushort group_id = blockIdx.x; +if (gtid == 5503) { +#if USE_CONSTANT_BUF == 1 +#ifdef IGNORE_INDICES +#if CONST_HESSIAN == 0 +printf("KERNEL USE_CONSTANT_BUF IGNORE_INDICES\n"); +#else +printf("KERNEL USE_CONSTANT_BUF IGNORE_INDICES CONST_HESSIAN\n"); +#endif +#else +#if CONST_HESSIAN == 0 +printf("KERNEL USE_CONSTANT_BUF \n"); +#else +printf("KERNEL USE_CONSTANT_BUF CONST_HESSIAN\n"); +#endif +#endif +#else +#ifdef IGNORE_INDICES +#if CONST_HESSIAN == 0 +printf("KERNEL IGNORE_INDICES (exp = %d)\n", (int) power_feature_workgroups); +#else +printf("KERNEL IGNORE_INDICES CONST_HESSIAN\n"); +#endif +#else +#if CONST_HESSIAN == 0 +printf("KERNEL (exp = %d)\n", (int) power_feature_workgroups); +//for (int i=0; i<5000; ++i) if (feature_data_base[i] == 1) printf("found '1' in feature_data_base array, at index %d\n", i); +//printf(" data_indices = %3d %3d %3d %3d %3d %3d %3d %3d\n", (int) data_indices[0], (int) data_indices[1], (int) data_indices[2], (int) data_indices[3], (int) data_indices[4], (int) data_indices[5], (int) data_indices[6], (int) data_indices[7]); +//printf(" gradients = %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf\n", ordered_gradients[data_indices[0]], ordered_gradients[data_indices[1]], ordered_gradients[data_indices[2]], ordered_gradients[data_indices[3]], ordered_gradients[data_indices[4]], ordered_gradients[data_indices[5]], ordered_gradients[data_indices[6]], ordered_gradients[data_indices[7]]); +//printf(" hessians = %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf\n", ordered_hessians[data_indices[0]], ordered_hessians[data_indices[1]], ordered_hessians[data_indices[2]], ordered_hessians[data_indices[3]], ordered_hessians[data_indices[4]], ordered_hessians[data_indices[5]], ordered_hessians[data_indices[6]], ordered_hessians[data_indices[7]]); +#else +printf("KERNEL CONST_HESSIAN\n"); +#endif +#endif +#endif +} + // local memory per workgroup is 3 KB // clear local memory uint *ptr = (uint *) shared_array; @@ -870,6 +923,9 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // each 2^POWER_FEATURE_WORKGROUPS workgroups process on one feature (compile-time constant) // feature_size is the number of examples per feature const uchar *feature_data = feature_data_base + feature_id * feature_size; +//#ifdef IGNORE_INDICES +//if (gtid == 5503) printf("feature_id = %d, feature_size = %d\n", feature_id, feature_size); +//#endif // size of threads that process this feature4 const uint subglobal_size = lsize * (1 << power_feature_workgroups); @@ -877,11 +933,16 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // equavalent thread ID in 
this subgroup for this feature4 const uint subglobal_tid = gtid - feature_id * subglobal_size; +//#ifdef IGNORE_INDICES +//if (gtid == 5503) for (int i=0; i<5600; ++i) if(feature_data_base[i] == 1) printf("found 1 at %d in feature_data_BASE\n", i); +//if (gtid == 5503) for (int i=0; i<5600; ++i) if(feature_data[i] == 1) printf("found 1 at %d in feature_data\n", i); +//#endif data_size_t ind; data_size_t ind_next; #ifdef IGNORE_INDICES ind = subglobal_tid; +//if (gtid == 5503) printf("gtid = %d (0x%08x), subglobal_tid = %d (0x%08x), ind = %d (0x%08x)\n", gtid, gtid, subglobal_tid, subglobal_tid, ind, ind); #else ind = data_indices[subglobal_tid]; #endif @@ -904,6 +965,10 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, ushort bin; feature = feature_data[ind >> feature_mask]; +//#ifdef IGNORE_INDICES +//if (gtid == 5503) printf("gtid = %d (0x%08x), ind = %d (0x%08x), feature_mask = %d, feature = %d\n", gtid, gtid, ind, ind, feature_mask, feature_data[ind >> feature_mask]); +//if (gtid == 5503) printf("gtid = %d (0x%08x), ind = %d (0x%08x), feature_mask = %d, BASE feature = %d\n", gtid, gtid, ind, ind, feature_mask, feature_data_base[ind >> feature_mask]); +//#endif if (feature_mask) { feature = (feature >> ((ind & 1) << 2)) & 0xf; } @@ -915,9 +980,12 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, score_t grad, hess; score_t grad_next, hess_next; // LGBM_CUDA v5.1 - grad = ordered_gradients[subglobal_tid]; +//#ifdef IGNORE_INDICES +//if (gtid == 5503) printf("gtid = %d (0x%08x), gradient by 'i': %lf, gradient by 'subglobal_tid': %lf\n", gtid, gtid, ordered_gradients[gtid], ordered_gradients[subglobal_tid]); +//#endif + grad = ordered_gradients[ind]; #if CONST_HESSIAN == 0 - hess = ordered_hessians[subglobal_tid]; + hess = ordered_hessians[ind]; #endif @@ -929,20 +997,27 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #ifdef IGNORE_INDICES // we need to check to bounds here ind_next = i_next < num_data ? i_next : i; +//if (gtid == 5503) printf("gtid = %d (0x%08x), i = %d (0x%08x), i_next = %d (0x%08x), ind_next = %d (0x%08x)\n", gtid, gtid, i, i, i_next, i_next, ind_next, ind_next); #else ind_next = data_indices[i_next]; #endif // imbGBT v5.1 - grad_next = ordered_gradients[i_next]; + grad_next = ordered_gradients[ind_next]; #if CONST_HESSIAN == 0 - hess_next = ordered_hessians[i_next]; + hess_next = ordered_hessians[ind_next]; #endif +//#ifdef IGNORE_INDICES +//if (gtid == 5503) printf("gtid = %d (0x%08x), i = %d (0x%08x), i_next = %d (0x%08x), grad_next = %lf\n", gtid, gtid, i, i, i_next, i_next, grad_next); +//#endif // STAGE 2: accumulate gradient and hessian if (bin != feature) { addr_bin = gh_hist + bin * 2 + is_hessian_first; #if CONST_HESSIAN == 0 +//#ifdef IGNORE_INDICES +//if (gtid == 5503 && bin == 1) printf("gtid = %d (0x%08x), prepping to accumulate: is_hessian_first = %d\n", gtid, gtid, is_hessian_first); +//#endif acc_type acc_bin = is_hessian_first? 
hess_bin : grad_bin; atomic_local_add_f(addr_bin, acc_bin); @@ -955,39 +1030,71 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #endif bin = feature; +//#ifdef IGNORE_INDICES +//if (gtid == 5503 && bin == 1) printf("gtid = %d (0x%08x), setting bin = feature 1, grad = %lf, grad_bin = %lf\n", gtid, gtid, grad, grad_bin); +//#endif grad_bin = grad; hess_bin = hess; } else { +//#ifdef IGNORE_INDICES +//if (gtid == 5503 && bin == 1) printf("gtid = %d (0x%08x), grad = %lf, grad_bin = %lf\n", gtid, gtid, grad, grad_bin); +//#endif grad_bin += grad; hess_bin += hess; } // prefetch the next iteration variables feature_next = feature_data[ind_next >> feature_mask]; +//#ifdef IGNORE_INDICES +//if (gtid == 5503) printf("gtid = %d (0x%08x), ind_next = %d (0x%08x), feature_mask = %d, feature = %d\n", gtid, gtid, ind_next, ind_next, feature_mask, feature_data[ind_next >> feature_mask]); +//if (gtid == 5503) printf("gtid = %d (0x%08x), ind_next = %d (0x%08x), feature_mask = %d, BASE feature = %d\n", gtid, gtid, ind_next, ind_next, feature_mask, feature_data_base[ind_next >> feature_mask]); +//#endif // STAGE 3: accumulate counter atomicAdd(cnt_hist + feature, 1); +//#ifdef IGNORE_INDICES +//if (gtid == 5503 && feature == 1) printf("gtid = %d (0x%08x), adding feature 1 to cnt_hist!\n", gtid, gtid); +//#endif // STAGE 4: update next stat grad = grad_next; +//#ifdef IGNORE_INDICES +//if (gtid == 5503 && feature == 1) printf("gtid = %d (0x%08x), moved grad_next to grad = %lf\n", gtid, gtid, grad); +//#endif hess = hess_next; // LGBM_CUDA: v4.2 if (!feature_mask) { +//#ifdef IGNORE_INDICES +//if (gtid == 5503 && feature_next == 1) printf("gtid = %d (0x%08x), moving feature_next 1 into feature 1!\n", gtid, gtid); +//#endif feature = feature_next; } else { feature = (feature_next >> ((ind_next & 1) << 2)) & 0xf; } +//#ifdef IGNORE_INDICES +//if (gtid == 5503) printf("gtid = %d (0x%08x), at end of loop, i = %d, num_data = %d, subglobal_size = %d, feature = %d\n", gtid, gtid, i, num_data, subglobal_size, feature); +//#endif } +//#ifdef IGNORE_INDICES +//if (gtid == 5503 && bin == 1) printf("gtid = %d (0x%08x), grad = %lf\n", gtid, gtid, grad); +//#endif addr_bin = gh_hist + bin * 2 + is_hessian_first; #if CONST_HESSIAN == 0 +//#ifdef IGNORE_INDICES +//if (gtid == 5503 && bin == 1) printf("gtid = %d (0x%08x), prepping to accumulate: is_hessian_first = %d\n", gtid, gtid, is_hessian_first); +//#endif acc_type acc_bin = is_hessian_first? hess_bin : grad_bin; atomic_local_add_f(addr_bin, acc_bin); addr_bin = addr_bin + 1 - 2 * is_hessian_first; acc_bin = is_hessian_first? 
grad_bin : hess_bin; + +//#ifdef IGNORE_INDICES +//if (gtid == 5503 && bin == 1) printf("gtid = %d (0x%08x), adding %lf to offset %d\n", gtid, gtid, acc_bin, (int) (addr_bin - gh_hist)); +//#endif atomic_local_add_f(addr_bin, acc_bin); #elif CONST_HESSIAN == 1 diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index e5d2a64ceaf..ae7bf52ce30 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -93,6 +93,7 @@ void SerialTreeLearner::GetShareStates(const Dataset* dataset, void SerialTreeLearner::ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) { +//fprintf(stderr, "inside SerialTreeLearner::ResetTrainingDataInner\n"); fflush(stderr); train_data_ = train_data; num_data_ = train_data_->num_data(); CHECK_EQ(num_features_, train_data_->num_features()); @@ -152,6 +153,9 @@ void SerialTreeLearner::ResetConfig(const Config* config) { Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians, bool is_constant_hessian, Json& forced_split_json) { Common::FunctionTimer fun_timer("SerialTreeLearner::Train", global_timer); + +fprintf(stderr, "in SerialTreeLearner::Train\n"); fflush(stderr); +fprintf(stderr, "first few gradients: %lf %lf %lf %lf\n", (double) gradients[0], (double) gradients[1], (double) gradients[2], (double) gradients[3]); gradients_ = gradients; hessians_ = hessians; is_constant_hessian_ = is_constant_hessian; @@ -181,10 +185,12 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians int init_splits = 0; bool aborted_last_force_split = false; if (!forced_split_json.is_null()) { +//fprintf(stderr, "we're calling ForceSplits\n"); fflush(stderr); init_splits = ForceSplits(tree.get(), forced_split_json, &left_leaf, &right_leaf, &cur_depth, &aborted_last_force_split); } +//fprintf(stderr, "loop start value = %d, loop end value = %d\n", init_splits, config_->num_leaves - 1); fflush(stderr); for (int split = init_splits; split < config_->num_leaves - 1; ++split) { // some initial works before finding best split if (BeforeFindBestSplit(tree_prt, left_leaf, right_leaf)) { @@ -201,10 +207,27 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians break; } // split tree with best leaf + +//fprintf(stderr, "%3d ", best_split_per_leaf_[0].feature); +//fprintf(stderr, "%3d ", best_split_per_leaf_[0].threshold); +//fprintf(stderr, "%3d ", best_split_per_leaf_[0].left_count); +//fprintf(stderr, "%3d ", best_split_per_leaf_[0].right_count); +//fprintf(stderr, "%3d ", best_split_per_leaf_[0].num_cat_threshold); +//fprintf(stderr, "%8.5lf ", best_split_per_leaf_[0].left_output); +//fprintf(stderr, "%8.5lf ", best_split_per_leaf_[0].right_output); +//fprintf(stderr, "%8.5lf ", best_split_per_leaf_[0].gain); +//fprintf(stderr, "%8.5lf ", best_split_per_leaf_[0].left_sum_gradient); +//fprintf(stderr, "%8.5lf ", best_split_per_leaf_[0].left_sum_hessian); +//fprintf(stderr, "%8.5lf ", best_split_per_leaf_[0].right_sum_gradient); +//fprintf(stderr, "%8.5lf ", best_split_per_leaf_[0].right_sum_hessian); +//fprintf(stderr, "\n"); + +//fprintf(stderr, "Calling Split, best_leaf = %d\n", best_leaf); Split(tree_prt, best_leaf, &left_leaf, &right_leaf); cur_depth = std::max(cur_depth, tree->leaf_depth(left_leaf)); } Log::Debug("Trained a tree with leaves = %d and max_depth = %d", tree->num_leaves(), cur_depth); +fprintf(stderr, "Leaving SerialTreeLearner::Train\n"); fflush(stderr); return 
tree.release(); } @@ -322,6 +345,7 @@ bool SerialTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int void SerialTreeLearner::FindBestSplits(const Tree* tree) { std::vector is_feature_used(num_features_, 0); +//fprintf(stderr, "in FindBestSplits, num_features_ = %d\n", num_features_); fflush(stderr); #pragma omp parallel for schedule(static, 256) if (num_features_ >= 512) for (int feature_index = 0; feature_index < num_features_; ++feature_index) { if (!col_sampler_.is_feature_used_bytree()[feature_index]) continue; @@ -333,16 +357,23 @@ void SerialTreeLearner::FindBestSplits(const Tree* tree) { is_feature_used[feature_index] = 1; } bool use_subtract = parent_leaf_histogram_array_ != nullptr; +//for (int i=0; iConstructHistograms(smaller)\n"); fflush(stderr); train_data_->ConstructHistograms( is_feature_used, smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(), gradients_, hessians_, ordered_gradients_.data(), ordered_hessians_.data(), share_state_.get(), ptr_smaller_leaf_hist_data); +//fprintf(stderr, "back from train_data_->ConstructHistograms(smaller)\n"); fflush(stderr); if (larger_leaf_histogram_array_ != nullptr && !use_subtract) { // construct larger leaf hist_t* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - kHistOffset; +//fprintf(stderr, "calling train_data_->ConstructHistograms(larger)\n"); fflush(stderr); train_data_->ConstructHistograms( is_feature_used, larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(), gradients_, hessians_, ordered_gradients_.data(), ordered_hessians_.data(), share_state_.get(), ptr_larger_leaf_hist_data); +//fprintf(stderr, "back from train_data_->ConstructHistograms(larger)\n"); fflush(stderr); } } @@ -375,6 +410,9 @@ void SerialTreeLearner::FindBestSplitsFromHistograms( const std::vector& is_feature_used, bool use_subtract, const Tree* tree) { Common::FunctionTimer fun_timer( "SerialTreeLearner::FindBestSplitsFromHistograms", global_timer); +fflush(stdout); +fflush(stderr); +//fprintf(stderr, "inside FindBestSplitsFromHistograms, num_threads = %d\n", (int) share_state_->num_threads); fflush(stderr); std::vector smaller_best(share_state_->num_threads); std::vector larger_best(share_state_->num_threads); std::vector smaller_node_used_features = col_sampler_.GetByNode(tree, smaller_leaf_splits_->leaf_index()); @@ -438,6 +476,7 @@ void SerialTreeLearner::FindBestSplitsFromHistograms( auto larger_best_idx = ArrayArgs::ArgMax(larger_best); best_split_per_leaf_[leaf] = larger_best[larger_best_idx]; } +//fprintf(stderr, "leaving FindBestSplitsFromHistograms\n"); fflush(stderr); } int32_t SerialTreeLearner::ForceSplits(Tree* tree, Json& forced_split_json, int* left_leaf, @@ -559,7 +598,9 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, Json& forced_split_json, int* void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf, int* right_leaf, bool update_cnt) { Common::FunctionTimer fun_timer("SerialTreeLearner::SplitInner", global_timer); + SplitInfo& best_split_info = best_split_per_leaf_[best_leaf]; + const int inner_feature_index = train_data_->InnerFeatureIndex(best_split_info.feature); if (cegb_ != nullptr) { @@ -641,6 +682,7 @@ void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf, CHECK(*right_leaf == next_leaf_id); #endif +fprintf(stderr, "arrived at the assert, leaves = %d %d, sum = %d\n", best_split_info.left_count, best_split_info.right_count, best_split_info.left_count + best_split_info.right_count); 
fflush(stderr); // init the leaves that used on next iteration if (best_split_info.left_count < best_split_info.right_count) { CHECK_GT(best_split_info.left_count, 0); From 1aabb5c989d03544cdd6f3dc3f93dcd449efa4eb Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Sun, 24 May 2020 14:30:53 +0000 Subject: [PATCH 046/119] Initial CUDA work --- src/boosting/gbdt.cpp | 24 ++++++++++++++---------- src/boosting/rf.hpp | 2 ++ src/io/dense_bin.hpp | 4 ++-- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index f02c5d940f1..821b1b46411 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -243,10 +243,10 @@ data_size_t GBDT::BalancedBaggingHelper(data_size_t start, data_size_t cnt, void GBDT::Bagging(int iter) { Common::FunctionTimer fun_timer("GBDT::Bagging", global_timer); // if need bagging -fprintf(stderr, "inside GBDT::Bagging!\n"); fflush(stderr); +fprintf(stderr, "inside GBDT::Bagging, iter = %d\n", iter); fflush(stderr); if ((bag_data_cnt_ < num_data_ && iter % config_->bagging_freq == 0) || need_re_bagging_) { -//fprintf(stderr, "inside GBDT::Bagging, past first hurdle\n"); fflush(stderr); +fprintf(stderr, "inside GBDT::Bagging, past first hurdle\n"); fflush(stderr); need_re_bagging_ = false; auto left_cnt = bagging_runner_.Run( num_data_, @@ -265,9 +265,9 @@ fprintf(stderr, "inside GBDT::Bagging!\n"); fflush(stderr); bag_data_cnt_ = left_cnt; Log::Debug("Re-bagging, using %d data to train", bag_data_cnt_); // set bagging data to tree learner -//fprintf(stderr, "inside GBDT::Bagging, past second hurdle\n"); fflush(stderr); +fprintf(stderr, "inside GBDT::Bagging, past second hurdle\n"); fflush(stderr); if (!is_use_subset_) { -//fprintf(stderr, "inside GBDT::Bagging, calling SetBaggingData\n"); fflush(stderr); +fprintf(stderr, "inside GBDT::Bagging, calling SetBaggingData\n"); fflush(stderr); tree_learner_->SetBaggingData(nullptr, bag_data_indices_.data(), bag_data_cnt_); } else { // LGBM_CUDA // NEW get subset @@ -279,7 +279,7 @@ fprintf(stderr, "inside GBDT::Bagging!\n"); fflush(stderr); tmp_hessians_.resize(total_size); } -//fprintf(stderr, "CopySubrow CP2, bag_data_cnt_ = %d\n", bag_data_cnt_); fflush(stderr); +fprintf(stderr, "CopySubrow CP2, bag_data_cnt_ = %d\n", bag_data_cnt_); fflush(stderr); //char *temp_bag = (char *) bag_data_indices_.data(); //for (int i=0; idevice_type == std::string("cuda")) && (iter_ == 0)) { + if (config_->device_type == std::string("cuda")) { // auto start_time = std::chrono::steady_clock::now(); -//fprintf(stderr, "calling Bagging CP104\n"); fflush(stderr); - Bagging(0); +fprintf(stderr, "calling Bagging CP104\n"); fflush(stderr); + Bagging(iter_); +fprintf(stderr, "back from Bagging CP104\n"); fflush(stderr); } std::vector init_scores(num_tree_per_iteration_, 0.0); @@ -508,8 +509,9 @@ bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { // auto start_time = std::chrono::steady_clock::now(); // bagging logic -//fprintf(stderr, "inside TrainOneIterCUDA CP108\n"); fflush(stderr); +fprintf(stderr, "calling Bagging CP105\n"); fflush(stderr); Bagging(iter_next); +fprintf(stderr, "back from Bagging CP105\n"); fflush(stderr); } } @@ -549,7 +551,9 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { hessians = hessians_.data(); } // bagging logic +fprintf(stderr, "calling Bagging CP106\n"); fflush(stderr); Bagging(iter_); +fprintf(stderr, "back from Bagging CP106\n"); fflush(stderr); bool should_continue = false; for (int cur_tree_id = 0; 
cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { diff --git a/src/boosting/rf.hpp b/src/boosting/rf.hpp index e64bf6cb4d8..6912e0757d6 100644 --- a/src/boosting/rf.hpp +++ b/src/boosting/rf.hpp @@ -102,7 +102,9 @@ class RF : public GBDT { bool TrainOneIter(const score_t* gradients, const score_t* hessians) override { // bagging logic +fprintf(stderr, "calling Bagging in TrainOneIter\n"); fflush(stderr); Bagging(iter_); +fprintf(stderr, "back from Bagging in TrainOneIter\n"); fflush(stderr); CHECK_EQ(gradients, nullptr); CHECK_EQ(hessians, nullptr); diff --git a/src/io/dense_bin.hpp b/src/io/dense_bin.hpp index 803e85a6dab..c1e30c44bc4 100644 --- a/src/io/dense_bin.hpp +++ b/src/io/dense_bin.hpp @@ -424,7 +424,7 @@ class DenseBin : public Bin { const void* memory, const std::vector& local_used_indices) override { const VAL_T* mem_data = reinterpret_cast(memory); -//fprintf(stderr, "inside LoadFromMemory (in src/io/dense_bin.hpp), updating the FEATURE DATA, num_data_ = %d\n", (int) num_data_); +fprintf(stderr, "inside LoadFromMemory (in src/io/dense_bin.hpp), updating the FEATURE DATA, num_data_ = %d\n", (int) num_data_); if (!local_used_indices.empty()) { if (IS_4BIT) { const data_size_t rest = num_data_ & 1; @@ -468,7 +468,7 @@ class DenseBin : public Bin { void CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override { auto other_bin = dynamic_cast*>(full_bin); -//fprintf(stderr, "inside CopySubrow (in src/io/dense_bin.hpp), updating the FEATURE DATA, num_used_indices = %d\n", (int) num_used_indices); +fprintf(stderr, "inside CopySubrow (in src/io/dense_bin.hpp), updating the FEATURE DATA, num_used_indices = %d\n", (int) num_used_indices); if (IS_4BIT) { const data_size_t rest = num_used_indices & 1; for (int i = 0; i < num_used_indices - rest; i += 2) { From f75696ee3205accc41974caad547f66d64448d38 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Thu, 28 May 2020 02:04:58 +0000 Subject: [PATCH 047/119] Initial CUDA work --- src/treelearner/kernels/histogram_16_64_256.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index 42d7c0d4d01..4007a26ba47 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -21,7 +21,7 @@ if (b == gtid && t == ltid) { \ } // atomic add for float number in local memory -inline __device__ void atomic_local_add_f(acc_type *addr, const float val) +inline __device__ void atomic_local_add_f(acc_type *addr, const acc_type val) { atomicAdd(addr, static_cast(val)); } From 0f6df0b7b992d825ccaad5e0d2bd4f6a2cd6b2ba Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Wed, 3 Jun 2020 03:41:05 +0000 Subject: [PATCH 048/119] Initial CUDA work --- include/LightGBM/feature_group.h | 1 - src/boosting/gbdt.cpp | 28 +----- src/io/dataset.cpp | 24 +---- src/io/dense_bin.hpp | 8 -- src/treelearner/cuda_tree_learner.cpp | 45 +++------ src/treelearner/cuda_tree_learner.h | 2 +- .../kernels/histogram_16_64_256.cu | 93 ++----------------- src/treelearner/serial_tree_learner.cpp | 47 ++-------- tests/python_package_test/test_consistency.py | 3 +- 9 files changed, 38 insertions(+), 213 deletions(-) diff --git a/include/LightGBM/feature_group.h b/include/LightGBM/feature_group.h index d5eac42db48..d949beec20e 100644 --- a/include/LightGBM/feature_group.h +++ b/include/LightGBM/feature_group.h @@ -176,7 +176,6 @@ class FeatureGroup { inline void CopySubrow(const 
FeatureGroup* full_feature, const data_size_t* used_indices, data_size_t num_used_indices) { if (!is_multi_val_) { -//fprintf(stderr, "CopySubrow CP1A\n"); fflush(stderr); bin_data_->CopySubrow(full_feature->bin_data_.get(), used_indices, num_used_indices); } else { for (int i = 0; i < num_feature_; ++i) { diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 821b1b46411..3f521ebf333 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -243,10 +243,8 @@ data_size_t GBDT::BalancedBaggingHelper(data_size_t start, data_size_t cnt, void GBDT::Bagging(int iter) { Common::FunctionTimer fun_timer("GBDT::Bagging", global_timer); // if need bagging -fprintf(stderr, "inside GBDT::Bagging, iter = %d\n", iter); fflush(stderr); if ((bag_data_cnt_ < num_data_ && iter % config_->bagging_freq == 0) || need_re_bagging_) { -fprintf(stderr, "inside GBDT::Bagging, past first hurdle\n"); fflush(stderr); need_re_bagging_ = false; auto left_cnt = bagging_runner_.Run( num_data_, @@ -265,9 +263,7 @@ fprintf(stderr, "inside GBDT::Bagging, past first hurdle\n"); fflush(stderr); bag_data_cnt_ = left_cnt; Log::Debug("Re-bagging, using %d data to train", bag_data_cnt_); // set bagging data to tree learner -fprintf(stderr, "inside GBDT::Bagging, past second hurdle\n"); fflush(stderr); if (!is_use_subset_) { -fprintf(stderr, "inside GBDT::Bagging, calling SetBaggingData\n"); fflush(stderr); tree_learner_->SetBaggingData(nullptr, bag_data_indices_.data(), bag_data_cnt_); } else { // LGBM_CUDA // NEW get subset @@ -279,21 +275,11 @@ fprintf(stderr, "inside GBDT::Bagging, calling SetBaggingData\n"); fflush(stderr tmp_hessians_.resize(total_size); } -fprintf(stderr, "CopySubrow CP2, bag_data_cnt_ = %d\n", bag_data_cnt_); fflush(stderr); -//char *temp_bag = (char *) bag_data_indices_.data(); -//for (int i=0; iCopySubrow(train_data_, bag_data_indices_.data(), bag_data_cnt_, false); -//fprintf(stderr, "CopySubrow CP2, calling tree_learner_->ResetTrainingData\n"); fflush(stderr); tree_learner_->ResetTrainingData(tmp_subset_.get(), is_constant_hessian_); -//fprintf(stderr, "CopySubrow CP2, back from tree_learner_->ResetTrainingData\n"); fflush(stderr); } } -fprintf(stderr, "returning from GBDT::Bagging!\n"); fflush(stderr); } void GBDT::Train(int snapshot_freq, const std::string& model_output_path) { @@ -396,16 +382,12 @@ double GBDT::BoostFromAverage(int class_id, bool update_scorer) { // LGBM_CUDA bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { -fprintf(stderr, "inside TrainOneIterCUDA, iter_ = %d\n", iter_); fflush(stderr); - // LGBM_CUDA invoke baggging during the first iteration if (config_->device_type == std::string("cuda")) { // auto start_time = std::chrono::steady_clock::now(); -fprintf(stderr, "calling Bagging CP104\n"); fflush(stderr); Bagging(iter_); -fprintf(stderr, "back from Bagging CP104\n"); fflush(stderr); } std::vector init_scores(num_tree_per_iteration_, 0.0); @@ -425,9 +407,8 @@ fprintf(stderr, "back from Bagging CP104\n"); fflush(stderr); hessians = hessians_.data(); } -//fprintf(stderr, "inside TrainOneIterCUDA CP105, bagging commented out\n"); fflush(stderr); // LGBM_CUDA bagging logic - // Bagging(iter_); // GCF trial and error + // Bagging(iter_); bool should_continue = false; for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { @@ -466,10 +447,8 @@ fprintf(stderr, "back from Bagging CP104\n"); fflush(stderr); // LGBM_CUDA new_tree.reset(tree_learner_->Train(tmp_grad, tmp_hess, is_constant_hessian_, 
forced_splits_json_)); } -//fprintf(stderr, "inside TrainOneIterCUDA, num_leaves = %d\n", new_tree->num_leaves()); fflush(stderr); if (new_tree->num_leaves() > 1) { -//fprintf(stderr, "inside TrainOneIterCUDA CP106, this clause doesn't do bagging\n"); fflush(stderr); should_continue = true; auto score_ptr = train_score_updater_->score() + offset; auto residual_getter = [score_ptr](const label_t* label, int i) {return static_cast(label[i]) - score_ptr[i]; }; @@ -502,16 +481,13 @@ fprintf(stderr, "back from Bagging CP104\n"); fflush(stderr); } // LGBM_CUDA: moved for overlapping data copy w/ other operations -//fprintf(stderr, "inside TrainOneIterCUDA CP107\n"); fflush(stderr); int iter_next = iter_ + 1; if (iter_next < config_->num_iterations) { // auto start_time = std::chrono::steady_clock::now(); // bagging logic -fprintf(stderr, "calling Bagging CP105\n"); fflush(stderr); Bagging(iter_next); -fprintf(stderr, "back from Bagging CP105\n"); fflush(stderr); } } @@ -551,9 +527,7 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { hessians = hessians_.data(); } // bagging logic -fprintf(stderr, "calling Bagging CP106\n"); fflush(stderr); Bagging(iter_); -fprintf(stderr, "back from Bagging CP106\n"); fflush(stderr); bool should_continue = false; for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index b796e1bc2cb..fc71aeb43cd 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -801,7 +801,6 @@ void Dataset::CopySubrow(const Dataset* fullset, const data_size_t* used_indices, data_size_t num_used_indices, bool need_meta_data) { CHECK_EQ(num_used_indices, num_data_); -fprintf(stderr, "CopySubrow CP3, used_indices[5503] = %4d\n", (int) used_indices[5503]); fflush(stderr); OMP_INIT_EX(); #pragma omp parallel for schedule(static) @@ -1282,7 +1281,6 @@ void Dataset::ConstructHistogramsMultiVal(const data_size_t* data_indices, } OMP_THROW_EX(); global_timer.Stop("Dataset::sparse_bin_histogram"); - global_timer.Start("Dataset::sparse_bin_histogram_merge"); int n_bin_block = 1; int bin_block_size = num_bin; @@ -1313,16 +1311,12 @@ void Dataset::ConstructHistogramsInner( score_t* ordered_gradients, score_t* ordered_hessians, TrainingShareStates* share_state, hist_t* hist_data) const { -fprintf(stderr, "CPU "); -if (!USE_INDICES) fprintf(stderr, "IGNORE_INDICES "); -if (!USE_HESSIAN) fprintf(stderr, "CONST_HESSIAN "); -fprintf(stderr, "\n"); fflush(stderr); -//fprintf(stderr, "gradients[2161] = %lf\n", gradients[2161]); fflush(stderr); - if (!share_state->is_colwise) { +fprintf(stderr, "CPU ('multival') hist_data = %p\n", hist_data); fflush(stderr); return ConstructHistogramsMultiVal( data_indices, num_data, gradients, hessians, share_state, hist_data); } +fprintf(stderr, "CPU (not 'multival')\n"); fflush(stderr); std::vector used_dense_group; int multi_val_groud_id = -1; @@ -1345,6 +1339,7 @@ fprintf(stderr, "\n"); fflush(stderr); } } } + int num_used_dense_group = static_cast(used_dense_group.size()); global_timer.Start("Dataset::dense_bin_histogram"); auto ptr_ordered_grad = gradients; @@ -1367,21 +1362,14 @@ fprintf(stderr, "\n"); fflush(stderr); ptr_ordered_grad = ordered_gradients; } } - OMP_INIT_EX(); -if (USE_INDICES) { - //fprintf(stderr, " data_indices = %3d %3d %3d %3d %3d %3d %3d %3d\n", data_indices[0], data_indices[1], data_indices[2], data_indices[3], data_indices[4], data_indices[5], data_indices[6], data_indices[7]); fflush(stderr); - //fprintf(stderr, " gradients = 
%7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf\n", ptr_ordered_grad[0], ptr_ordered_grad[1], ptr_ordered_grad[2], ptr_ordered_grad[3], ptr_ordered_grad[4], ptr_ordered_grad[5], ptr_ordered_grad[6], ptr_ordered_grad[7]); fflush(stderr); - //fprintf(stderr, " hessians = %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf\n", ptr_ordered_hess[0], ptr_ordered_hess[1], ptr_ordered_hess[2], ptr_ordered_hess[3], ptr_ordered_hess[4], ptr_ordered_hess[5], ptr_ordered_hess[6], ptr_ordered_hess[7]); fflush(stderr); -//fprintf(stderr, " offset into return array for gi = 0: %d\n", (int) group_bin_boundaries_[used_dense_group[0]]); fflush(stderr); -} + OMP_INIT_EX(); #pragma omp parallel for schedule(static) num_threads(share_state->num_threads) for (int gi = 0; gi < num_used_dense_group; ++gi) { OMP_LOOP_EX_BEGIN(); int group = used_dense_group[gi]; auto data_ptr = hist_data + group_bin_boundaries_[group] * 2; const int num_bin = feature_groups_[group]->num_total_bin_; -//fprintf(stderr, "gi = %2d, group_bin_boundaries_[%2d] = %4d, num_bin = %d\n", gi, (int) group, (int) group_bin_boundaries_[group], (int) num_bin); std::memset(reinterpret_cast(data_ptr), 0, num_bin * kHistEntrySize); if (USE_HESSIAN) { @@ -1391,10 +1379,8 @@ if (USE_INDICES) { data_ptr); } else { if (gi == 0) { -//fprintf(stderr, " calling core ConstructHistogramDebug\n"); fflush(stderr); feature_groups_[group]->bin_data_->ConstructHistogramDebug( 0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr); -//fprintf(stderr, " back from ConstructHistogramDebug, hist_data = %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf\n", hist_data[0], hist_data[1], hist_data[2], hist_data[3], hist_data[4], hist_data[5]); fflush(stderr); } else { feature_groups_[group]->bin_data_->ConstructHistogram( @@ -1418,7 +1404,6 @@ if (USE_INDICES) { } OMP_THROW_EX(); } -//fprintf(stderr, " leaving 'CPU kernel' hist_data = %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf\n", hist_data[0], hist_data[1], hist_data[2], hist_data[3], hist_data[4], hist_data[5]); fflush(stderr); global_timer.Stop("Dataset::dense_bin_histogram"); if (multi_val_groud_id >= 0) { if (num_used_dense_group > 0) { @@ -1466,7 +1451,6 @@ void Dataset::FixHistogram(int feature_idx, double sum_gradient, const BinMapper* bin_mapper = feature_groups_[group]->bin_mappers_[sub_feature].get(); const int most_freq_bin = bin_mapper->GetMostFreqBin(); -//fprintf(stderr, "in Dataset::FixHistogram, feature_idx = %2d, group = %2d, sub_feature = %d, most_freq_bin = %3d\n", feature_idx, group, sub_feature, most_freq_bin); fflush(stderr); if (most_freq_bin > 0) { const int num_bin = bin_mapper->num_bin(); GET_GRAD(data, most_freq_bin) = sum_gradient; diff --git a/src/io/dense_bin.hpp b/src/io/dense_bin.hpp index c1e30c44bc4..fc0fe8fbd57 100644 --- a/src/io/dense_bin.hpp +++ b/src/io/dense_bin.hpp @@ -109,12 +109,9 @@ class DenseBin : public Bin { hist_t* grad = out; hist_t* hess = out + 1; hist_cnt_t* cnt = reinterpret_cast(hess); -//fprintf(stderr, " inside ConstructHistogramInnerDebug, i = %d\n", i); fflush(stderr); -//fprintf(stderr, " DEBUG: data(5503) = %d\n", data(5503)); for (; i < end; ++i) { const auto idx = i; const auto ti = static_cast(data(idx)) << 1; -//if (ti == 2) fprintf(stderr, " data(%4d) = %4d, adding %7.4lf\n", idx, data(idx), ordered_gradients[i]); fflush(stderr); if (USE_HESSIAN) { grad[ti] += ordered_gradients[i]; hess[ti] += ordered_hessians[i]; @@ -123,7 +120,6 @@ class DenseBin : public Bin { ++cnt[ti]; } } -//fprintf(stderr, " leaving ConstructHistogramInnerDebug, out[2/3] = 
%7.4lf %7.4lf\n", out[2], out[3]); } template @@ -175,10 +171,8 @@ class DenseBin : public Bin { data_size_t end, const score_t* ordered_gradients, const score_t* ordered_hessians, hist_t* out) const { -//fprintf(stderr, " calling ConstructHistogramInnerDebug\n"); fflush(stderr); ConstructHistogramInnerDebug( start, end, ordered_gradients, ordered_hessians, out); -//fprintf(stderr, " back from ConstructHistogramInnerDebug\n"); fflush(stderr); } void ConstructHistogram(const data_size_t* data_indices, data_size_t start, @@ -424,7 +418,6 @@ class DenseBin : public Bin { const void* memory, const std::vector& local_used_indices) override { const VAL_T* mem_data = reinterpret_cast(memory); -fprintf(stderr, "inside LoadFromMemory (in src/io/dense_bin.hpp), updating the FEATURE DATA, num_data_ = %d\n", (int) num_data_); if (!local_used_indices.empty()) { if (IS_4BIT) { const data_size_t rest = num_data_ & 1; @@ -468,7 +461,6 @@ fprintf(stderr, "inside LoadFromMemory (in src/io/dense_bin.hpp), updating the F void CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override { auto other_bin = dynamic_cast*>(full_bin); -fprintf(stderr, "inside CopySubrow (in src/io/dense_bin.hpp), updating the FEATURE DATA, num_used_indices = %d\n", (int) num_used_indices); if (IS_4BIT) { const data_size_t rest = num_used_indices & 1; for (int i = 0; i < num_used_indices - rest; i += 2) { diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index d59c60c3957..b8eca14f9b3 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -76,8 +76,6 @@ void CUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, // Initialize GPU buffers and kernels & LGBM_CUDA: get device info InitGPU(config_->num_gpu); // LGBM_CUDA - - } // some functions used for debugging the GPU histogram construction @@ -238,18 +236,17 @@ void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featu size_t output_size = num_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_; size_t host_output_offset = offset_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_; - //CUDASUCCESS_OR_FATAL(cudaMemcpyAsync((char*)host_histogram_outputs_ + host_output_offset, device_histogram_outputs_[device_id], output_size, cudaMemcpyDeviceToHost, stream_[device_id])); - CUDASUCCESS_OR_FATAL(cudaMemcpy((char*)host_histogram_outputs_ + host_output_offset, device_histogram_outputs_[device_id], output_size, cudaMemcpyDeviceToHost)); + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync((char*)host_histogram_outputs_ + host_output_offset, device_histogram_outputs_[device_id], output_size, cudaMemcpyDeviceToHost, stream_[device_id])); CUDASUCCESS_OR_FATAL(cudaEventRecord(histograms_wait_obj_[device_id], stream_[device_id])); } } template -void CUDATreeLearner::WaitAndGetHistograms(hist_t* histograms) { +void CUDATreeLearner::WaitAndGetHistograms(FeatureHistogram* leaf_histogram_array) { HistType* hist_outputs = (HistType*) host_histogram_outputs_; - //#pragma omp parallel for schedule(static, num_gpu_) + #pragma omp parallel for schedule(static, num_gpu_) for(int device_id = 0; device_id < num_gpu_; ++device_id) { // auto start_time = std::chrono::steady_clock::now(); @@ -265,7 +262,8 @@ void CUDATreeLearner::WaitAndGetHistograms(hist_t* histograms) { continue; } int dense_group_index = dense_feature_group_map_[i]; - auto old_histogram_array = histograms + 
train_data_->GroupBinBoundary(dense_group_index) * 2; + //auto old_histogram_array = histograms + train_data_->GroupBinBoundary(dense_group_index) * 2; + auto old_histogram_array = leaf_histogram_array[dense_group_index].RawData() - kHistOffset; int bin_size = train_data_->FeatureGroupNumBin(dense_group_index); for (int j = 0; j < bin_size; ++j) { @@ -644,7 +642,7 @@ void CUDATreeLearner::ResetTrainingDataInner(const Dataset* train_data, bool is_ int old_num_feature_groups = num_dense_feature_groups_; CountDenseFeatureGroups(); - if ((old_num_data < num_data_) && (old_num_feature_groups < num_dense_feature_groups_)) { + if ((old_num_data < num_data_) || (old_num_feature_groups < num_dense_feature_groups_)) { prevAllocateGPUMemory(); AllocateGPUMemory(); } else { @@ -863,22 +861,13 @@ bool CUDATreeLearner::ConstructGPUHistogramsAsync( return false; } -#if GPU_DEBUG >= 1 - printf("CudaTreeLearner::ConstructGPUHistogramsAsync() Feature masks: "); - for (unsigned int i = 0; i < feature_masks_.size(); ++i) { - printf("%d ", feature_masks_[i]); - } - printf("\n"); - printf("CudaTreeLearner::ConstructGPUHistogramsAsync() %d feature groups, %d used, %d use_all_features\n", num_dense_feature_groups_, used_dense_feature_groups, use_all_features); -#endif - // if not all feature groups are used, we need to transfer the feature mask to GPU // otherwise, we will use a specialized GPU kernel with all feature groups enabled // LGBM_CUDA FIXME: No waiting mark for feature mask // LGBM_CUDA We now copy even if all features are used. - //#pragma omp parallel for schedule(static, num_gpu_) + #pragma omp parallel for schedule(static, num_gpu_) for(int device_id = 0; device_id < num_gpu_; ++device_id) { int offset = offset_gpu_feature_groups_[device_id]; CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_feature_masks_[device_id], ptr_pinned_feature_masks_ + offset, num_gpu_feature_groups_[device_id] , cudaMemcpyHostToDevice, stream_[device_id])); @@ -919,14 +908,12 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - kHistOffset; // Check workgroups per feature4 tuple.. -// GCF Let's try this!!! -// int exp_workgroups_per_feature = GetNumWorkgroupsPerFeature(smaller_leaf_splits_->num_data_in_leaf()); + int exp_workgroups_per_feature = GetNumWorkgroupsPerFeature(smaller_leaf_splits_->num_data_in_leaf()); // if the workgroup per feature is 1 (2^0), return as the work is too small for a GPU -// GCF Let's try this!!! 
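The removed and re-added lines around this point restore the small-leaf guard that an earlier patch had commented out ("GCF Let's try this!!!"): when GetNumWorkgroupsPerFeature() returns 0, only 2^0 = 1 workgroup per feature group would be launched, so the leaf is too small to amortize a kernel launch and the histogram is built on the host through SerialTreeLearner::ConstructHistograms() instead. The host path fills the same interleaved per-bin layout that the ConstructHistogramDebug() helpers added earlier in this series walk with ti = bin << 1. Below is a stand-alone sketch of that layout (plain C++, compilable on its own, not LightGBM code; the bin values, type aliases and sample data are invented, only the 2*bin / 2*bin+1 addressing mirrors the dense-bin code):

    #include <cstdio>
    #include <cstdint>
    #include <vector>

    using hist_t = double;   // stand-in for the accumulator type
    using score_t = float;   // stand-in for the gradient/hessian type

    // Accumulate one (gradient, hessian) pair per bin, interleaved in `out`.
    void ConstructHistogramSketch(const std::vector<uint8_t>& bins,
                                  const std::vector<score_t>& gradients,
                                  const std::vector<score_t>& hessians,
                                  std::vector<hist_t>* out) {  // size = 2 * num_bins
      for (size_t i = 0; i < bins.size(); ++i) {
        const size_t ti = static_cast<size_t>(bins[i]) << 1;   // grad at ti, hess at ti + 1
        (*out)[ti] += gradients[i];
        (*out)[ti + 1] += hessians[i];
      }
    }

    int main() {
      const int num_bins = 4;
      std::vector<uint8_t> bins = {0, 1, 1, 3, 2, 1};
      std::vector<score_t> grad = {0.5f, -0.25f, 0.75f, 1.0f, -0.5f, 0.25f};
      std::vector<score_t> hess(bins.size(), 1.0f);
      std::vector<hist_t> hist(2 * num_bins, 0.0);
      ConstructHistogramSketch(bins, grad, hess, &hist);
      for (int b = 0; b < num_bins; ++b)
        printf("bin %d: grad=%6.2f hess=%4.1f\n", b, hist[2 * b], hist[2 * b + 1]);
      return 0;
    }

In the real code this is the buffer that Dataset::FixHistogram() later adjusts for the most frequent bin.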
-// if (exp_workgroups_per_feature == 0){ -// return SerialTreeLearner::ConstructHistograms(is_feature_used, use_subtract); -// } + if (exp_workgroups_per_feature == 0){ + return SerialTreeLearner::ConstructHistograms(is_feature_used, use_subtract); + } // ConstructGPUHistogramsAsync will return true if there are availabe feature gourps dispatched to GPU bool is_gpu_used = ConstructGPUHistogramsAsync(is_feature_used, @@ -954,11 +941,11 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ if (is_gpu_used) { if (config_->gpu_use_dp) { // use double precision - WaitAndGetHistograms(ptr_smaller_leaf_hist_data); + WaitAndGetHistograms(smaller_leaf_histogram_array_); } else { // use single precision - WaitAndGetHistograms(ptr_smaller_leaf_hist_data); + WaitAndGetHistograms(smaller_leaf_histogram_array_); } } @@ -1072,11 +1059,11 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ if (is_gpu_used) { if (config_->gpu_use_dp) { // use double precision - WaitAndGetHistograms(ptr_larger_leaf_hist_data); + WaitAndGetHistograms(larger_leaf_histogram_array_); } else { // use single precision - WaitAndGetHistograms(ptr_larger_leaf_hist_data); + WaitAndGetHistograms(larger_leaf_histogram_array_); } } } @@ -1119,8 +1106,6 @@ void CUDATreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* righ Log::Fatal("1 Bug in GPU histogram! split %d: %d, smaller_leaf: %d, larger_leaf: %d\n", best_split_info.left_count, best_split_info.right_count, smaller_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf()); } } else { - smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), best_split_info.right_sum_gradient, best_split_info.right_sum_hessian, best_split_info.right_output); - larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, best_split_info.left_sum_hessian, best_split_info.left_output); if ((best_split_info.left_count != larger_leaf_splits_->num_data_in_leaf()) || (best_split_info.right_count!= smaller_leaf_splits_->num_data_in_leaf())) { Log::Fatal("2 Bug in GPU histogram! split %d: %d, smaller_leaf: %d, larger_leaf: %d\n", best_split_info.left_count, best_split_info.right_count, smaller_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf()); diff --git a/src/treelearner/cuda_tree_learner.h b/src/treelearner/cuda_tree_learner.h index a84d6b6662f..7b256345c82 100644 --- a/src/treelearner/cuda_tree_learner.h +++ b/src/treelearner/cuda_tree_learner.h @@ -153,7 +153,7 @@ class CUDATreeLearner: public SerialTreeLearner { * \param histograms Destination of histogram results from GPU. */ template - void WaitAndGetHistograms(hist_t* histograms); + void WaitAndGetHistograms(FeatureHistogram* leaf_histogram_array); /*! * \brief Construct GPU histogram asynchronously. 
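Next comes the histogram kernel cleanup for this patch. Besides dropping most of the gtid-gated tracing, it keeps the indexing rule settled a few patches back: the feature value, the gradient and the hessian are all read through the same row index `ind`, which is the thread's own row when IGNORE_INDICES is defined and data_indices[...] when training on a bagged subset, never a mix of `ind` with the raw thread id or loop counter. The toy kernel below is a stand-alone illustration of that rule in CUDA C++; it is not the patched histogram256 kernel (no shared-memory sub-histograms, no prefetching, no 4-bit packing, no hessian/count handling), and every size and launch parameter in it is made up.

    #include <cstdio>
    #include <vector>
    #include <cuda_runtime.h>

    // Toy grid-stride histogram: both the bin and the gradient are read through `ind`.
    __global__ void toy_histogram(const unsigned char* bins,   // per-row feature bin
                                  const float* gradients,      // per-row gradient
                                  const int* data_indices,     // nullptr => use rows directly
                                  int num_rows,
                                  float* hist) {               // one accumulator per bin
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      const int stride = gridDim.x * blockDim.x;
      for (; i < num_rows; i += stride) {
        const int ind = (data_indices == nullptr) ? i : data_indices[i];
        atomicAdd(&hist[bins[ind]], gradients[ind]);   // both loads go through `ind`
      }
    }

    int main() {
      const int n = 1 << 16, num_bins = 256;
      std::vector<unsigned char> h_bins(n);
      std::vector<float> h_grad(n, 1.0f);
      for (int i = 0; i < n; ++i) h_bins[i] = static_cast<unsigned char>(i % num_bins);

      unsigned char* d_bins; float* d_grad; float* d_hist; int* d_idx;
      cudaMalloc(reinterpret_cast<void**>(&d_bins), n);
      cudaMalloc(reinterpret_cast<void**>(&d_grad), n * sizeof(float));
      cudaMalloc(reinterpret_cast<void**>(&d_hist), num_bins * sizeof(float));
      cudaMalloc(reinterpret_cast<void**>(&d_idx), n * sizeof(int));
      cudaMemcpy(d_bins, h_bins.data(), n, cudaMemcpyHostToDevice);
      cudaMemcpy(d_grad, h_grad.data(), n * sizeof(float), cudaMemcpyHostToDevice);

      // pass 1: rows used in order (the IGNORE_INDICES case)
      cudaMemset(d_hist, 0, num_bins * sizeof(float));
      toy_histogram<<<64, 256>>>(d_bins, d_grad, nullptr, n, d_hist);
      std::vector<float> h_hist(num_bins);
      cudaMemcpy(h_hist.data(), d_hist, num_bins * sizeof(float), cudaMemcpyDeviceToHost);
      printf("bin 0, direct rows:      %.0f (expected %d)\n", h_hist[0], n / num_bins);

      // pass 2: rows gathered through an index array (the bagged-subset case);
      // any permutation gives the same totals when gradients follow the same index
      std::vector<int> h_idx(n);
      for (int i = 0; i < n; ++i) h_idx[i] = n - 1 - i;
      cudaMemcpy(d_idx, h_idx.data(), n * sizeof(int), cudaMemcpyHostToDevice);
      cudaMemset(d_hist, 0, num_bins * sizeof(float));
      toy_histogram<<<64, 256>>>(d_bins, d_grad, d_idx, n, d_hist);
      cudaMemcpy(h_hist.data(), d_hist, num_bins * sizeof(float), cudaMemcpyDeviceToHost);
      printf("bin 0, via data_indices: %.0f (same total)\n", h_hist[0]);

      cudaFree(d_bins); cudaFree(d_grad); cudaFree(d_hist); cudaFree(d_idx);
      return 0;
    }

The same rule holds whether the accumulators are single or double precision; the real kernels additionally stage partial histograms in shared memory and merge them in within_kernel_reduction256x4().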
diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index 4007a26ba47..991444fbf62 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -759,9 +759,7 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ acc_type* __restrict__ local_hist, const size_t power_feature_workgroups) { const ushort ltid = threadIdx.x; -//#ifdef IGNORE_INDICES -// const uint gtid = blockIdx.x * blockDim.x + threadIdx.x; -//#endif + // TODO: try to avoid bank conflict here acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; @@ -775,10 +773,6 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ } ushort i; -//#ifdef IGNORE_INDICES -//if (gtid == 1) printf(" skip_id = %d, grad_bin = %7.4lf\n", skip_id, grad_bin); -//#endif - if (power_feature_workgroups != 0) { // add all sub-histograms for feature const acc_type* __restrict__ p = feature_sub_hist + ltid; @@ -792,14 +786,12 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ p += 3 * NUM_BINS; for (i = i + 1; i < num_sub_hist; ++i) { -//#ifdef IGNORE_INDICES -//if (gtid == 1) printf(" adding %7.4lf\n", *p); -//#endif grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; cont_bin += as_acc_int_type(*p); p += NUM_BINS; } } + __syncthreads(); output_buf[ltid * 2 + 0] = grad_bin; @@ -808,13 +800,6 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ #else output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); #endif - -//#ifdef IGNORE_INDICES -//__syncthreads(); -//if (gtid == 1) printf("KERNEL returning %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf\n", output_buf[0], output_buf[1], output_buf[2], output_buf[3], output_buf[4], output_buf[5]); -//__syncthreads(); -//#endif - } #if USE_CONSTANT_BUF == 1 @@ -859,7 +844,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, const ushort lsize = NUM_BINS; // get_local_size(0); const ushort group_id = blockIdx.x; -if (gtid == 5503) { +if (gtid == 2048) { #if USE_CONSTANT_BUF == 1 #ifdef IGNORE_INDICES #if CONST_HESSIAN == 0 @@ -877,17 +862,13 @@ printf("KERNEL USE_CONSTANT_BUF CONST_HESSIAN\n"); #else #ifdef IGNORE_INDICES #if CONST_HESSIAN == 0 -printf("KERNEL IGNORE_INDICES (exp = %d)\n", (int) power_feature_workgroups); +printf("KERNEL IGNORE_INDICES (exp = %d) (feature_size = %d)\n", (int) power_feature_workgroups, (int) feature_size); #else printf("KERNEL IGNORE_INDICES CONST_HESSIAN\n"); #endif #else #if CONST_HESSIAN == 0 -printf("KERNEL (exp = %d)\n", (int) power_feature_workgroups); -//for (int i=0; i<5000; ++i) if (feature_data_base[i] == 1) printf("found '1' in feature_data_base array, at index %d\n", i); -//printf(" data_indices = %3d %3d %3d %3d %3d %3d %3d %3d\n", (int) data_indices[0], (int) data_indices[1], (int) data_indices[2], (int) data_indices[3], (int) data_indices[4], (int) data_indices[5], (int) data_indices[6], (int) data_indices[7]); -//printf(" gradients = %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf\n", ordered_gradients[data_indices[0]], ordered_gradients[data_indices[1]], ordered_gradients[data_indices[2]], ordered_gradients[data_indices[3]], ordered_gradients[data_indices[4]], ordered_gradients[data_indices[5]], ordered_gradients[data_indices[6]], ordered_gradients[data_indices[7]]); -//printf(" hessians = %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf\n", 
ordered_hessians[data_indices[0]], ordered_hessians[data_indices[1]], ordered_hessians[data_indices[2]], ordered_hessians[data_indices[3]], ordered_hessians[data_indices[4]], ordered_hessians[data_indices[5]], ordered_hessians[data_indices[6]], ordered_hessians[data_indices[7]]); +printf("KERNEL (exp = %d) (feature_size = %d)\n", (int) power_feature_workgroups, (int) feature_size); #else printf("KERNEL CONST_HESSIAN\n"); #endif @@ -923,9 +904,6 @@ printf("KERNEL CONST_HESSIAN\n"); // each 2^POWER_FEATURE_WORKGROUPS workgroups process on one feature (compile-time constant) // feature_size is the number of examples per feature const uchar *feature_data = feature_data_base + feature_id * feature_size; -//#ifdef IGNORE_INDICES -//if (gtid == 5503) printf("feature_id = %d, feature_size = %d\n", feature_id, feature_size); -//#endif // size of threads that process this feature4 const uint subglobal_size = lsize * (1 << power_feature_workgroups); @@ -933,16 +911,10 @@ printf("KERNEL CONST_HESSIAN\n"); // equavalent thread ID in this subgroup for this feature4 const uint subglobal_tid = gtid - feature_id * subglobal_size; -//#ifdef IGNORE_INDICES -//if (gtid == 5503) for (int i=0; i<5600; ++i) if(feature_data_base[i] == 1) printf("found 1 at %d in feature_data_BASE\n", i); -//if (gtid == 5503) for (int i=0; i<5600; ++i) if(feature_data[i] == 1) printf("found 1 at %d in feature_data\n", i); -//#endif - data_size_t ind; data_size_t ind_next; #ifdef IGNORE_INDICES ind = subglobal_tid; -//if (gtid == 5503) printf("gtid = %d (0x%08x), subglobal_tid = %d (0x%08x), ind = %d (0x%08x)\n", gtid, gtid, subglobal_tid, subglobal_tid, ind, ind); #else ind = data_indices[subglobal_tid]; #endif @@ -965,11 +937,7 @@ printf("KERNEL CONST_HESSIAN\n"); ushort bin; feature = feature_data[ind >> feature_mask]; -//#ifdef IGNORE_INDICES -//if (gtid == 5503) printf("gtid = %d (0x%08x), ind = %d (0x%08x), feature_mask = %d, feature = %d\n", gtid, gtid, ind, ind, feature_mask, feature_data[ind >> feature_mask]); -//if (gtid == 5503) printf("gtid = %d (0x%08x), ind = %d (0x%08x), feature_mask = %d, BASE feature = %d\n", gtid, gtid, ind, ind, feature_mask, feature_data_base[ind >> feature_mask]); -//#endif - if (feature_mask) { + if (feature_mask) { feature = (feature >> ((ind & 1) << 2)) & 0xf; } bin = feature; @@ -980,15 +948,11 @@ printf("KERNEL CONST_HESSIAN\n"); score_t grad, hess; score_t grad_next, hess_next; // LGBM_CUDA v5.1 -//#ifdef IGNORE_INDICES -//if (gtid == 5503) printf("gtid = %d (0x%08x), gradient by 'i': %lf, gradient by 'subglobal_tid': %lf\n", gtid, gtid, ordered_gradients[gtid], ordered_gradients[subglobal_tid]); -//#endif grad = ordered_gradients[ind]; #if CONST_HESSIAN == 0 hess = ordered_hessians[ind]; #endif - // there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4 for (uint i = subglobal_tid; i < num_data; i += subglobal_size) { // prefetch the next iteration variables @@ -997,7 +961,6 @@ printf("KERNEL CONST_HESSIAN\n"); #ifdef IGNORE_INDICES // we need to check to bounds here ind_next = i_next < num_data ? 
i_next : i; -//if (gtid == 5503) printf("gtid = %d (0x%08x), i = %d (0x%08x), i_next = %d (0x%08x), ind_next = %d (0x%08x)\n", gtid, gtid, i, i, i_next, i_next, ind_next, ind_next); #else ind_next = data_indices[i_next]; #endif @@ -1007,17 +970,10 @@ printf("KERNEL CONST_HESSIAN\n"); #if CONST_HESSIAN == 0 hess_next = ordered_hessians[ind_next]; #endif -//#ifdef IGNORE_INDICES -//if (gtid == 5503) printf("gtid = %d (0x%08x), i = %d (0x%08x), i_next = %d (0x%08x), grad_next = %lf\n", gtid, gtid, i, i, i_next, i_next, grad_next); -//#endif - // STAGE 2: accumulate gradient and hessian if (bin != feature) { addr_bin = gh_hist + bin * 2 + is_hessian_first; #if CONST_HESSIAN == 0 -//#ifdef IGNORE_INDICES -//if (gtid == 5503 && bin == 1) printf("gtid = %d (0x%08x), prepping to accumulate: is_hessian_first = %d\n", gtid, gtid, is_hessian_first); -//#endif acc_type acc_bin = is_hessian_first? hess_bin : grad_bin; atomic_local_add_f(addr_bin, acc_bin); @@ -1030,71 +986,42 @@ printf("KERNEL CONST_HESSIAN\n"); #endif bin = feature; -//#ifdef IGNORE_INDICES -//if (gtid == 5503 && bin == 1) printf("gtid = %d (0x%08x), setting bin = feature 1, grad = %lf, grad_bin = %lf\n", gtid, gtid, grad, grad_bin); -//#endif + grad_bin = grad; hess_bin = hess; } else { -//#ifdef IGNORE_INDICES -//if (gtid == 5503 && bin == 1) printf("gtid = %d (0x%08x), grad = %lf, grad_bin = %lf\n", gtid, gtid, grad, grad_bin); -//#endif + + grad_bin += grad; hess_bin += hess; } // prefetch the next iteration variables feature_next = feature_data[ind_next >> feature_mask]; -//#ifdef IGNORE_INDICES -//if (gtid == 5503) printf("gtid = %d (0x%08x), ind_next = %d (0x%08x), feature_mask = %d, feature = %d\n", gtid, gtid, ind_next, ind_next, feature_mask, feature_data[ind_next >> feature_mask]); -//if (gtid == 5503) printf("gtid = %d (0x%08x), ind_next = %d (0x%08x), feature_mask = %d, BASE feature = %d\n", gtid, gtid, ind_next, ind_next, feature_mask, feature_data_base[ind_next >> feature_mask]); -//#endif // STAGE 3: accumulate counter atomicAdd(cnt_hist + feature, 1); -//#ifdef IGNORE_INDICES -//if (gtid == 5503 && feature == 1) printf("gtid = %d (0x%08x), adding feature 1 to cnt_hist!\n", gtid, gtid); -//#endif // STAGE 4: update next stat grad = grad_next; -//#ifdef IGNORE_INDICES -//if (gtid == 5503 && feature == 1) printf("gtid = %d (0x%08x), moved grad_next to grad = %lf\n", gtid, gtid, grad); -//#endif hess = hess_next; // LGBM_CUDA: v4.2 if (!feature_mask) { -//#ifdef IGNORE_INDICES -//if (gtid == 5503 && feature_next == 1) printf("gtid = %d (0x%08x), moving feature_next 1 into feature 1!\n", gtid, gtid); -//#endif feature = feature_next; } else { feature = (feature_next >> ((ind_next & 1) << 2)) & 0xf; } -//#ifdef IGNORE_INDICES -//if (gtid == 5503) printf("gtid = %d (0x%08x), at end of loop, i = %d, num_data = %d, subglobal_size = %d, feature = %d\n", gtid, gtid, i, num_data, subglobal_size, feature); -//#endif } -//#ifdef IGNORE_INDICES -//if (gtid == 5503 && bin == 1) printf("gtid = %d (0x%08x), grad = %lf\n", gtid, gtid, grad); -//#endif - addr_bin = gh_hist + bin * 2 + is_hessian_first; #if CONST_HESSIAN == 0 -//#ifdef IGNORE_INDICES -//if (gtid == 5503 && bin == 1) printf("gtid = %d (0x%08x), prepping to accumulate: is_hessian_first = %d\n", gtid, gtid, is_hessian_first); -//#endif acc_type acc_bin = is_hessian_first? hess_bin : grad_bin; atomic_local_add_f(addr_bin, acc_bin); addr_bin = addr_bin + 1 - 2 * is_hessian_first; acc_bin = is_hessian_first? 
grad_bin : hess_bin; -//#ifdef IGNORE_INDICES -//if (gtid == 5503 && bin == 1) printf("gtid = %d (0x%08x), adding %lf to offset %d\n", gtid, gtid, acc_bin, (int) (addr_bin - gh_hist)); -//#endif atomic_local_add_f(addr_bin, acc_bin); #elif CONST_HESSIAN == 1 @@ -1183,8 +1110,6 @@ printf("KERNEL CONST_HESSIAN\n"); uint skip_id = group_id - output_offset; // locate output histogram location for this feature4 acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS; - - within_kernel_reduction256x4(feature_subhists, skip_id, old_val, 1 << power_feature_workgroups, hist_buf, (acc_type *)shared_array, power_feature_workgroups); } } diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index ae7bf52ce30..230452d7c78 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -93,7 +93,6 @@ void SerialTreeLearner::GetShareStates(const Dataset* dataset, void SerialTreeLearner::ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) { -//fprintf(stderr, "inside SerialTreeLearner::ResetTrainingDataInner\n"); fflush(stderr); train_data_ = train_data; num_data_ = train_data_->num_data(); CHECK_EQ(num_features_, train_data_->num_features()); @@ -153,9 +152,6 @@ void SerialTreeLearner::ResetConfig(const Config* config) { Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians, bool is_constant_hessian, Json& forced_split_json) { Common::FunctionTimer fun_timer("SerialTreeLearner::Train", global_timer); - -fprintf(stderr, "in SerialTreeLearner::Train\n"); fflush(stderr); -fprintf(stderr, "first few gradients: %lf %lf %lf %lf\n", (double) gradients[0], (double) gradients[1], (double) gradients[2], (double) gradients[3]); gradients_ = gradients; hessians_ = hessians; is_constant_hessian_ = is_constant_hessian; @@ -185,14 +181,13 @@ fprintf(stderr, "first few gradients: %lf %lf %lf %lf\n", (double) gradients[0], int init_splits = 0; bool aborted_last_force_split = false; if (!forced_split_json.is_null()) { -//fprintf(stderr, "we're calling ForceSplits\n"); fflush(stderr); init_splits = ForceSplits(tree.get(), forced_split_json, &left_leaf, &right_leaf, &cur_depth, &aborted_last_force_split); } -//fprintf(stderr, "loop start value = %d, loop end value = %d\n", init_splits, config_->num_leaves - 1); fflush(stderr); for (int split = init_splits; split < config_->num_leaves - 1; ++split) { // some initial works before finding best split + if (BeforeFindBestSplit(tree_prt, left_leaf, right_leaf)) { // find best threshold for every feature FindBestSplits(tree_prt); @@ -207,27 +202,10 @@ fprintf(stderr, "first few gradients: %lf %lf %lf %lf\n", (double) gradients[0], break; } // split tree with best leaf - -//fprintf(stderr, "%3d ", best_split_per_leaf_[0].feature); -//fprintf(stderr, "%3d ", best_split_per_leaf_[0].threshold); -//fprintf(stderr, "%3d ", best_split_per_leaf_[0].left_count); -//fprintf(stderr, "%3d ", best_split_per_leaf_[0].right_count); -//fprintf(stderr, "%3d ", best_split_per_leaf_[0].num_cat_threshold); -//fprintf(stderr, "%8.5lf ", best_split_per_leaf_[0].left_output); -//fprintf(stderr, "%8.5lf ", best_split_per_leaf_[0].right_output); -//fprintf(stderr, "%8.5lf ", best_split_per_leaf_[0].gain); -//fprintf(stderr, "%8.5lf ", best_split_per_leaf_[0].left_sum_gradient); -//fprintf(stderr, "%8.5lf ", best_split_per_leaf_[0].left_sum_hessian); -//fprintf(stderr, "%8.5lf ", best_split_per_leaf_[0].right_sum_gradient); 
-//fprintf(stderr, "%8.5lf ", best_split_per_leaf_[0].right_sum_hessian); -//fprintf(stderr, "\n"); - -//fprintf(stderr, "Calling Split, best_leaf = %d\n", best_leaf); Split(tree_prt, best_leaf, &left_leaf, &right_leaf); cur_depth = std::max(cur_depth, tree->leaf_depth(left_leaf)); } Log::Debug("Trained a tree with leaves = %d and max_depth = %d", tree->num_leaves(), cur_depth); -fprintf(stderr, "Leaving SerialTreeLearner::Train\n"); fflush(stderr); return tree.release(); } @@ -345,7 +323,6 @@ bool SerialTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int void SerialTreeLearner::FindBestSplits(const Tree* tree) { std::vector is_feature_used(num_features_, 0); -//fprintf(stderr, "in FindBestSplits, num_features_ = %d\n", num_features_); fflush(stderr); #pragma omp parallel for schedule(static, 256) if (num_features_ >= 512) for (int feature_index = 0; feature_index < num_features_; ++feature_index) { if (!col_sampler_.is_feature_used_bytree()[feature_index]) continue; @@ -357,23 +334,16 @@ void SerialTreeLearner::FindBestSplits(const Tree* tree) { is_feature_used[feature_index] = 1; } bool use_subtract = parent_leaf_histogram_array_ != nullptr; -//for (int i=0; iConstructHistograms(smaller)\n"); fflush(stderr); train_data_->ConstructHistograms( is_feature_used, smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(), gradients_, hessians_, ordered_gradients_.data(), ordered_hessians_.data(), share_state_.get(), ptr_smaller_leaf_hist_data); -//fprintf(stderr, "back from train_data_->ConstructHistograms(smaller)\n"); fflush(stderr); if (larger_leaf_histogram_array_ != nullptr && !use_subtract) { // construct larger leaf hist_t* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - kHistOffset; -//fprintf(stderr, "calling train_data_->ConstructHistograms(larger)\n"); fflush(stderr); train_data_->ConstructHistograms( is_feature_used, larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(), gradients_, hessians_, ordered_gradients_.data(), ordered_hessians_.data(), share_state_.get(), ptr_larger_leaf_hist_data); -//fprintf(stderr, "back from train_data_->ConstructHistograms(larger)\n"); fflush(stderr); } } @@ -410,9 +376,6 @@ void SerialTreeLearner::FindBestSplitsFromHistograms( const std::vector& is_feature_used, bool use_subtract, const Tree* tree) { Common::FunctionTimer fun_timer( "SerialTreeLearner::FindBestSplitsFromHistograms", global_timer); -fflush(stdout); -fflush(stderr); -//fprintf(stderr, "inside FindBestSplitsFromHistograms, num_threads = %d\n", (int) share_state_->num_threads); fflush(stderr); std::vector smaller_best(share_state_->num_threads); std::vector larger_best(share_state_->num_threads); std::vector smaller_node_used_features = col_sampler_.GetByNode(tree, smaller_leaf_splits_->leaf_index()); @@ -429,6 +392,7 @@ fflush(stderr); continue; } const int tid = omp_get_thread_num(); + train_data_->FixHistogram( feature_index, smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians(), @@ -462,12 +426,12 @@ fflush(stderr); larger_node_used_features[feature_index], larger_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_.get(), &larger_best[tid]); - OMP_LOOP_EX_END(); } OMP_THROW_EX(); auto smaller_best_idx = ArrayArgs::ArgMax(smaller_best); int leaf = smaller_leaf_splits_->leaf_index(); + best_split_per_leaf_[leaf] = smaller_best[smaller_best_idx]; if (larger_leaf_splits_ != nullptr && @@ -476,7 +440,6 @@ fflush(stderr); auto larger_best_idx = ArrayArgs::ArgMax(larger_best); 
best_split_per_leaf_[leaf] = larger_best[larger_best_idx]; } -//fprintf(stderr, "leaving FindBestSplitsFromHistograms\n"); fflush(stderr); } int32_t SerialTreeLearner::ForceSplits(Tree* tree, Json& forced_split_json, int* left_leaf, @@ -682,7 +645,6 @@ void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf, CHECK(*right_leaf == next_leaf_id); #endif -fprintf(stderr, "arrived at the assert, leaves = %d %d, sum = %d\n", best_split_info.left_count, best_split_info.right_count, best_split_info.left_count + best_split_info.right_count); fflush(stderr); // init the leaves that used on next iteration if (best_split_info.left_count < best_split_info.right_count) { CHECK_GT(best_split_info.left_count, 0); @@ -760,6 +722,7 @@ void SerialTreeLearner::ComputeBestSplitForFeature( FeatureHistogram* histogram_array_, int feature_index, int real_fidx, bool is_feature_used, int num_data, const LeafSplits* leaf_splits, SplitInfo* best_split) { + if (!is_feature_used) { return; } @@ -774,9 +737,11 @@ void SerialTreeLearner::ComputeBestSplitForFeature( } else { parent_output = leaf_splits->weight(); } + histogram_array_[feature_index].FindBestThreshold( leaf_splits->sum_gradients(), leaf_splits->sum_hessians(), num_data, constraints_->Get(leaf_splits->leaf_index()), parent_output, &new_split); + new_split.feature = real_fidx; if (cegb_ != nullptr) { new_split.gain -= diff --git a/tests/python_package_test/test_consistency.py b/tests/python_package_test/test_consistency.py index f6e955ee48d..0e3fc509144 100644 --- a/tests/python_package_test/test_consistency.py +++ b/tests/python_package_test/test_consistency.py @@ -39,7 +39,8 @@ def load_cpp_result(self, result_file='LightGBM_predict_result.txt'): def train_predict_check(self, lgb_train, X_test, X_test_fn, sk_pred): params = dict(self.params) - params['force_row_wise'] = True + # KNOWN BUG (the CUDA kernel cannot handle "row wise", so we disable it in this test) + # params['force_row_wise'] = True gbm = lgb.train(params, lgb_train) y_pred = gbm.predict(X_test) cpp_pred = gbm.predict(X_test_fn) From 8fa83181d2021431821b584d494af26b4f79ec2d Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Wed, 3 Jun 2020 15:26:27 +0000 Subject: [PATCH 049/119] Initial CUDA work --- src/boosting/rf.hpp | 2 -- src/io/dataset.cpp | 4 ++-- src/treelearner/feature_histogram.hpp | 4 ++-- src/treelearner/kernels/histogram_16_64_256.cu | 3 --- 4 files changed, 4 insertions(+), 9 deletions(-) diff --git a/src/boosting/rf.hpp b/src/boosting/rf.hpp index 6912e0757d6..e64bf6cb4d8 100644 --- a/src/boosting/rf.hpp +++ b/src/boosting/rf.hpp @@ -102,9 +102,7 @@ class RF : public GBDT { bool TrainOneIter(const score_t* gradients, const score_t* hessians) override { // bagging logic -fprintf(stderr, "calling Bagging in TrainOneIter\n"); fflush(stderr); Bagging(iter_); -fprintf(stderr, "back from Bagging in TrainOneIter\n"); fflush(stderr); CHECK_EQ(gradients, nullptr); CHECK_EQ(hessians, nullptr); diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index fc71aeb43cd..ccdf0b21576 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -1312,11 +1312,11 @@ void Dataset::ConstructHistogramsInner( TrainingShareStates* share_state, hist_t* hist_data) const { if (!share_state->is_colwise) { -fprintf(stderr, "CPU ('multival') hist_data = %p\n", hist_data); fflush(stderr); +//fprintf(stderr, "CPU ('multival') hist_data = %p\n", hist_data); fflush(stderr); return ConstructHistogramsMultiVal( data_indices, num_data, gradients, hessians, share_state, hist_data); } 
-fprintf(stderr, "CPU (not 'multival')\n"); fflush(stderr); +//fprintf(stderr, "CPU (not 'multival')\n"); fflush(stderr); std::vector used_dense_group; int multi_val_groud_id = -1; diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp index bf3d81c53d8..c7371d6a31a 100644 --- a/src/treelearner/feature_histogram.hpp +++ b/src/treelearner/feature_histogram.hpp @@ -1201,8 +1201,8 @@ class HistogramPool { for (int i = old_cache_size; i < cache_size; ++i) { OMP_LOOP_EX_BEGIN(); pool_[i].reset(new FeatureHistogram[train_data->num_features()]); - //data_[i].resize(num_total_bin * 2); - data_[i].resize(num_total_bin * 3); // GCF HACK to avoid mysterious core dumps + data_[i].resize(num_total_bin * 2); + //data_[i].resize(num_total_bin * 3); // GCF HACK to avoid mysterious core dumps for (int j = 0; j < train_data->num_features(); ++j) { pool_[i][j].Init(data_[i].data() + offsets[j] * 2, &feature_metas_[j]); } diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index 991444fbf62..8828e55ac11 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -986,13 +986,10 @@ printf("KERNEL CONST_HESSIAN\n"); #endif bin = feature; - grad_bin = grad; hess_bin = hess; } else { - - grad_bin += grad; hess_bin += hess; } From f70beb7470869ff4fb5498f0b9c99cb236e555b8 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Wed, 3 Jun 2020 17:37:05 +0000 Subject: [PATCH 050/119] Initial CUDA work --- src/io/dataset.cpp | 4 ++++ tests/python_package_test/test_consistency.py | 3 +-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index ccdf0b21576..820e34e5856 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -627,6 +627,10 @@ TrainingShareStates* Dataset::GetShareStates( share_state->is_constant_hessian = is_constant_hessian; return share_state; } +#ifdef USE_CUDA + force_colwise = 1; + force_rowwise = 0; +#endif if (force_colwise) { TrainingShareStates* share_state = new TrainingShareStates(); share_state->SetMultiValBin(GetMultiBinFromSparseFeatures()); diff --git a/tests/python_package_test/test_consistency.py b/tests/python_package_test/test_consistency.py index 0e3fc509144..f6e955ee48d 100644 --- a/tests/python_package_test/test_consistency.py +++ b/tests/python_package_test/test_consistency.py @@ -39,8 +39,7 @@ def load_cpp_result(self, result_file='LightGBM_predict_result.txt'): def train_predict_check(self, lgb_train, X_test, X_test_fn, sk_pred): params = dict(self.params) - # KNOWN BUG (the CUDA kernel cannot handle "row wise", so we disable it in this test) - # params['force_row_wise'] = True + params['force_row_wise'] = True gbm = lgb.train(params, lgb_train) y_pred = gbm.predict(X_test) cpp_pred = gbm.predict(X_test_fn) From af49c3202e8e89e12b61fd34b8113898e045a571 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Thu, 4 Jun 2020 14:44:15 +0000 Subject: [PATCH 051/119] Initial CUDA work --- src/treelearner/cuda_tree_learner.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index b8eca14f9b3..78182ac2a90 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -206,7 +206,7 @@ void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featu if (num_workgroups > preallocd_max_num_wg_[device_id]) { preallocd_max_num_wg_.at(device_id) = 
num_workgroups; CUDASUCCESS_OR_FATAL(cudaFree(device_subhistograms_[device_id])); - cudaMalloc(&(device_subhistograms_[device_id]), (size_t) num_workgroups * dword_features_ * device_bin_size_ * (3 * hist_bin_entry_sz_ / 2)); + CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_subhistograms_[device_id]), (size_t) num_workgroups * dword_features_ * device_bin_size_ * (3 * hist_bin_entry_sz_ / 2))); } //set thread_data SetThreadData(thread_data, device_id, histogram_size_, leaf_num_data, use_all_features, @@ -396,7 +396,7 @@ void CUDATreeLearner::AllocateGPUMemory() { // copy indices to the device - if (device_feature_masks_[device_id] != NULL){ + if (device_data_indices_[device_id] != NULL){ CUDASUCCESS_OR_FATAL(cudaFree(device_data_indices_[device_id])); } @@ -453,6 +453,7 @@ void CUDATreeLearner::copyDenseFeature() { // set device info int device_id = 0; uint8_t* device_features = device_features_[device_id]; + CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); Log::Debug("Started copying dense features from CPU to GPU - 1"); for (int i = 0; i < num_feature_groups_; ++i) { @@ -499,6 +500,9 @@ void CUDATreeLearner::InitGPU(int num_gpu) { printf("bin_size: "); #endif for (int i = 0; i < num_feature_groups_; ++i) { + if (train_data_->IsMultiGroup(i)) { + continue; + } #if GPU_DEBUG >= 1 printf("%d, ", train_data_->FeatureGroupNumBin(i)); #endif From 038128ddde974ca1ff8eee1f2c62f82db5b25363 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Thu, 4 Jun 2020 17:39:30 +0000 Subject: [PATCH 052/119] Initial CUDA work --- include/LightGBM/bin.h | 5 --- src/io/dataset.cpp | 10 ++---- src/io/dense_bin.hpp | 30 ------------------ src/io/sparse_bin.hpp | 31 ------------------- .../kernels/histogram_16_64_256.cu | 4 ++- src/treelearner/serial_tree_learner.cpp | 1 - 6 files changed, 5 insertions(+), 76 deletions(-) diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index c09cde3c809..96ae6a8d641 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -309,11 +309,6 @@ class Bin { * \param out Output Result */ - virtual void ConstructHistogramDebug( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, const score_t* ordered_hessians, - hist_t* out) const = 0; - virtual void ConstructHistogram( const data_size_t* data_indices, data_size_t start, data_size_t end, const score_t* ordered_gradients, const score_t* ordered_hessians, diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 820e34e5856..0f7e0401285 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -1382,14 +1382,8 @@ void Dataset::ConstructHistogramsInner( data_indices, 0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr); } else { - if (gi == 0) { - feature_groups_[group]->bin_data_->ConstructHistogramDebug( - 0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr); - } - else { - feature_groups_[group]->bin_data_->ConstructHistogram( - 0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr); - } + feature_groups_[group]->bin_data_->ConstructHistogram( + 0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr); } } else { if (USE_INDICES) { diff --git a/src/io/dense_bin.hpp b/src/io/dense_bin.hpp index fc0fe8fbd57..99feadf9f7f 100644 --- a/src/io/dense_bin.hpp +++ b/src/io/dense_bin.hpp @@ -100,28 +100,6 @@ class DenseBin : public Bin { BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override; - template - void ConstructHistogramInnerDebug(data_size_t start, data_size_t end, - const score_t* ordered_gradients, - const score_t* ordered_hessians, - 
hist_t* out) const { - data_size_t i = start; - hist_t* grad = out; - hist_t* hess = out + 1; - hist_cnt_t* cnt = reinterpret_cast(hess); - for (; i < end; ++i) { - const auto idx = i; - const auto ti = static_cast(data(idx)) << 1; - if (USE_HESSIAN) { - grad[ti] += ordered_gradients[i]; - hess[ti] += ordered_hessians[i]; - } else { - grad[ti] += ordered_gradients[i]; - ++cnt[ti]; - } - } - } - template void ConstructHistogramInner(const data_size_t* data_indices, data_size_t start, data_size_t end, @@ -167,14 +145,6 @@ class DenseBin : public Bin { } } - void ConstructHistogramDebug(data_size_t start, - data_size_t end, const score_t* ordered_gradients, - const score_t* ordered_hessians, - hist_t* out) const { - ConstructHistogramInnerDebug( - start, end, ordered_gradients, ordered_hessians, out); - } - void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end, const score_t* ordered_gradients, const score_t* ordered_hessians, diff --git a/src/io/sparse_bin.hpp b/src/io/sparse_bin.hpp index 74cdb08c82b..c56cd6da99d 100644 --- a/src/io/sparse_bin.hpp +++ b/src/io/sparse_bin.hpp @@ -98,37 +98,6 @@ class SparseBin : public Bin { hist[ti] += g; \ hist[ti + 1] += h; - void ConstructHistogramDebug(data_size_t start, - data_size_t end, const score_t* ordered_gradients, - const score_t* ordered_hessians, - hist_t* out) const { - data_size_t i_delta, cur_pos; - InitIndex(start, &i_delta, &cur_pos); - data_size_t i = start; - for (;;) { - if (cur_pos < i) { - cur_pos += deltas_[++i_delta]; - if (i_delta >= num_vals_) { - break; - } - } else if (cur_pos > i) { - if (++i >= end) { - break; - } - } else { - const VAL_T bin = vals_[i_delta]; - ACC_GH(out, bin, ordered_gradients[i], ordered_hessians[i]); - if (++i >= end) { - break; - } - cur_pos += deltas_[++i_delta]; - if (i_delta >= num_vals_) { - break; - } - } - } - } - void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end, const score_t* ordered_gradients, const score_t* ordered_hessians, diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index 8828e55ac11..7002ac71ded 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -844,7 +844,8 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, const ushort lsize = NUM_BINS; // get_local_size(0); const ushort group_id = blockIdx.x; -if (gtid == 2048) { +#if 0 +if (gtid == 0) { #if USE_CONSTANT_BUF == 1 #ifdef IGNORE_INDICES #if CONST_HESSIAN == 0 @@ -875,6 +876,7 @@ printf("KERNEL CONST_HESSIAN\n"); #endif #endif } +#endif // local memory per workgroup is 3 KB // clear local memory diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 230452d7c78..e5b6626a6bd 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -337,7 +337,6 @@ void SerialTreeLearner::FindBestSplits(const Tree* tree) { #ifdef USE_CUDA if (LGBM_config_::current_learner == use_cpu_learner){ - Log::Info("LightGBM-CUDA using CPU ConstructHistograms()"); SerialTreeLearner::ConstructHistograms(is_feature_used, use_subtract); } else{ From 3fd7618d7aca3381582fe9b95e03b989e308e835 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Thu, 4 Jun 2020 20:43:02 +0000 Subject: [PATCH 053/119] Initial CUDA work --- include/LightGBM/config.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h 
index 9622814832b..562ae79d388 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -955,8 +955,8 @@ struct Config { bool gpu_use_dp = false; // desc = number of gpus (CUDA implementation only) LGBM_CUDA - // desc = default value is 1 - int num_gpu = 1; + // desc = default value is 4 + int num_gpu = 4; #pragma endregion From 7e692c24b8cb50f1030e7d450aff16a336202eaf Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Thu, 4 Jun 2020 21:07:26 +0000 Subject: [PATCH 054/119] Initial CUDA work --- include/LightGBM/config.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 562ae79d388..51e4ea44c32 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -956,7 +956,11 @@ struct Config { // desc = number of gpus (CUDA implementation only) LGBM_CUDA // desc = default value is 4 +#ifdef USE_CUDA int num_gpu = 4; +#else + int num_gpu = 1; +#endif #pragma endregion From b27b7e1fa4f6f2583f3620604455bd354fe306f7 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Thu, 4 Jun 2020 22:47:54 +0000 Subject: [PATCH 055/119] Initial CUDA work --- include/LightGBM/config.h | 4 +--- src/io/config_auto.cpp | 4 ++++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 51e4ea44c32..c7be0e89884 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -954,12 +954,10 @@ struct Config { // desc = set this to ``true`` to use double precision math on GPU (by default single precision is used) bool gpu_use_dp = false; +#ifdef USE_CUDA // desc = number of gpus (CUDA implementation only) LGBM_CUDA // desc = default value is 4 -#ifdef USE_CUDA int num_gpu = 4; -#else - int num_gpu = 1; #endif #pragma endregion diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index 46d95b0df8f..9408a97c70f 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -294,7 +294,9 @@ const std::unordered_set& Config::parameter_set() { "gpu_platform_id", "gpu_device_id", "gpu_use_dp", +#ifdef USE_CUDA "num_gpu", /* LGBM_CUDA */ +#endif }); return params; } @@ -613,9 +615,11 @@ void Config::GetMembersFromString(const std::unordered_map 0); +#endif } From 80a8f43e18ee0bb1643f40141f11d610aa61f3de Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Mon, 8 Jun 2020 14:30:32 +0000 Subject: [PATCH 056/119] Initial CUDA work --- include/LightGBM/config.h | 4 ++-- src/io/dataset.cpp | 2 -- src/treelearner/cuda_tree_learner.cpp | 11 ++--------- src/treelearner/feature_histogram.hpp | 1 - 4 files changed, 4 insertions(+), 14 deletions(-) diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index c7be0e89884..162c7583dc7 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -956,8 +956,8 @@ struct Config { #ifdef USE_CUDA // desc = number of gpus (CUDA implementation only) LGBM_CUDA - // desc = default value is 4 - int num_gpu = 4; + // desc = default value is 1 + int num_gpu = 1; #endif #pragma endregion diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 0f7e0401285..c96a83516bf 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -1316,11 +1316,9 @@ void Dataset::ConstructHistogramsInner( TrainingShareStates* share_state, hist_t* hist_data) const { if (!share_state->is_colwise) { -//fprintf(stderr, "CPU ('multival') hist_data = %p\n", hist_data); fflush(stderr); return ConstructHistogramsMultiVal( data_indices, num_data, gradients, hessians, share_state, hist_data); } -//fprintf(stderr, "CPU (not 
'multival')\n"); fflush(stderr); std::vector used_dense_group; int multi_val_groud_id = -1; diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 78182ac2a90..f488e5c4a75 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -919,26 +919,19 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ return SerialTreeLearner::ConstructHistograms(is_feature_used, use_subtract); } - // ConstructGPUHistogramsAsync will return true if there are availabe feature gourps dispatched to GPU + // ConstructGPUHistogramsAsync will return true if there are availabe feature groups dispatched to GPU bool is_gpu_used = ConstructGPUHistogramsAsync(is_feature_used, nullptr, smaller_leaf_splits_->num_data_in_leaf()); // then construct sparse features on CPU // We set data_indices to null to avoid rebuilding ordered gradients/hessians if (num_sparse_features > 0){ -// train_data_->ConstructHistograms(is_sparse_feature_used, -// nullptr, smaller_leaf_splits_->num_data_in_leaf(), -// smaller_leaf_splits_->leaf_index(), -// ordered_bins_, gradients_, hessians_, -// ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_, -// ptr_smaller_leaf_hist_data); - train_data_->ConstructHistograms(is_sparse_feature_used, + train_data_->ConstructHistograms(is_sparse_feature_used, smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(), gradients_, hessians_, ordered_gradients_.data(), ordered_hessians_.data(), share_state_.get(), ptr_smaller_leaf_hist_data); - } // wait for GPU to finish, only if GPU is actually used diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp index c7371d6a31a..8916ee48fd4 100644 --- a/src/treelearner/feature_histogram.hpp +++ b/src/treelearner/feature_histogram.hpp @@ -1202,7 +1202,6 @@ class HistogramPool { OMP_LOOP_EX_BEGIN(); pool_[i].reset(new FeatureHistogram[train_data->num_features()]); data_[i].resize(num_total_bin * 2); - //data_[i].resize(num_total_bin * 3); // GCF HACK to avoid mysterious core dumps for (int j = 0; j < train_data->num_features(); ++j) { pool_[i][j].Init(data_[i].data() + offsets[j] * 2, &feature_metas_[j]); } From baf6f792495e68cdb87945392f7a3dd4cbcfaf8a Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Mon, 8 Jun 2020 15:32:49 +0000 Subject: [PATCH 057/119] Initial CUDA work --- src/boosting/gbdt.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 3f521ebf333..24264c3c175 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -382,8 +382,8 @@ double GBDT::BoostFromAverage(int class_id, bool update_scorer) { // LGBM_CUDA bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { - // LGBM_CUDA invoke baggging during the first iteration - if (config_->device_type == std::string("cuda")) { + // LGBM_CUDA invoke bagging during the first iteration + if (config_->device_type == std::string("cuda") && (iter_ == 0)) { // auto start_time = std::chrono::steady_clock::now(); From 944a3e57a93c56f9e3c5c243e5e392f4092059b4 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 8 Jun 2020 16:33:30 +0000 Subject: [PATCH 058/119] Initial CUDA work --- src/treelearner/cuda_tree_learner.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index f488e5c4a75..f45319ae818 100644 --- 
a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -297,7 +297,7 @@ void CUDATreeLearner::prevAllocateGPUMemory() { // leave some safe margin for prefetching // 256 work-items per workgroup. Each work-item prefetches one tuple for that feature - allocated_num_data_ = num_data_ + 256 * (1 << kMaxLogWorkgroupsPerFeature); + allocated_num_data_ = std::max(num_data_ + 256 * (1 << kMaxLogWorkgroupsPerFeature), allocated_num_data_); // clear sparse/dense maps @@ -594,6 +594,7 @@ void CUDATreeLearner::InitGPU(int num_gpu) { CUDASUCCESS_OR_FATAL(cudaEventCreate(&(histograms_wait_obj_[i]))); } + allocated_num_data_ = 0; prevAllocateGPUMemory(); AllocateGPUMemory(); @@ -626,7 +627,7 @@ Tree* CUDATreeLearner::Train(const score_t* gradients, const score_t *hessians, void CUDATreeLearner::ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) { // LGBM_CUDA: check data size - data_size_t old_num_data = num_data_; + data_size_t old_allocated_num_data = allocated_num_data_; SerialTreeLearner::ResetTrainingDataInner(train_data, is_constant_hessian, reset_multi_val_bin); @@ -646,7 +647,7 @@ void CUDATreeLearner::ResetTrainingDataInner(const Dataset* train_data, bool is_ int old_num_feature_groups = num_dense_feature_groups_; CountDenseFeatureGroups(); - if ((old_num_data < num_data_) || (old_num_feature_groups < num_dense_feature_groups_)) { + if ((old_allocated_num_data < (num_data_ + 256 * (1 << kMaxLogWorkgroupsPerFeature))) || (old_num_feature_groups < num_dense_feature_groups_)) { prevAllocateGPUMemory(); AllocateGPUMemory(); } else { From f34ec350b2d7a39fc56ea0dc23e7707e50be37e4 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Mon, 8 Jun 2020 17:13:25 +0000 Subject: [PATCH 059/119] Initial CUDA work --- src/io/config.cpp | 10 ++++++++++ src/io/dataset.cpp | 4 ---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/io/config.cpp b/src/io/config.cpp index 963ef084578..18c0562a676 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -321,11 +321,21 @@ void Config::CheckParamConflict() { num_leaves = static_cast(full_num_leaves); } } + // force col-wise for gpu if (device_type == std::string("gpu")) { force_col_wise = true; force_row_wise = false; } + +#ifdef USE_CUDA + // force col-wise for CUDA + if (device_type == std::string("cuda")) { + force_col_wise = true; + force_row_wise = false; + } +#endif + // min_data_in_leaf must be at least 2 if path smoothing is active. This is because when the split is calculated // the count is calculated using the proportion of hessian in the leaf which is rounded up to nearest int, so it can // be 1 when there is actually no data in the leaf. 
In rare cases this can cause a bug because with path smoothing the diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index c96a83516bf..edae575f345 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -627,10 +627,6 @@ TrainingShareStates* Dataset::GetShareStates( share_state->is_constant_hessian = is_constant_hessian; return share_state; } -#ifdef USE_CUDA - force_colwise = 1; - force_rowwise = 0; -#endif if (force_colwise) { TrainingShareStates* share_state = new TrainingShareStates(); share_state->SetMultiValBin(GetMultiBinFromSparseFeatures()); From 44ce402d27b99ce828d92498399af7383339b1db Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Mon, 8 Jun 2020 17:17:17 +0000 Subject: [PATCH 060/119] Initial CUDA work --- .../kernels/histogram_16_64_256.cu | 34 ------------------- 1 file changed, 34 deletions(-) diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index 7002ac71ded..a0780f913c9 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -844,40 +844,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, const ushort lsize = NUM_BINS; // get_local_size(0); const ushort group_id = blockIdx.x; -#if 0 -if (gtid == 0) { -#if USE_CONSTANT_BUF == 1 -#ifdef IGNORE_INDICES -#if CONST_HESSIAN == 0 -printf("KERNEL USE_CONSTANT_BUF IGNORE_INDICES\n"); -#else -printf("KERNEL USE_CONSTANT_BUF IGNORE_INDICES CONST_HESSIAN\n"); -#endif -#else -#if CONST_HESSIAN == 0 -printf("KERNEL USE_CONSTANT_BUF \n"); -#else -printf("KERNEL USE_CONSTANT_BUF CONST_HESSIAN\n"); -#endif -#endif -#else -#ifdef IGNORE_INDICES -#if CONST_HESSIAN == 0 -printf("KERNEL IGNORE_INDICES (exp = %d) (feature_size = %d)\n", (int) power_feature_workgroups, (int) feature_size); -#else -printf("KERNEL IGNORE_INDICES CONST_HESSIAN\n"); -#endif -#else -#if CONST_HESSIAN == 0 -printf("KERNEL (exp = %d) (feature_size = %d)\n", (int) power_feature_workgroups, (int) feature_size); -#else -printf("KERNEL CONST_HESSIAN\n"); -#endif -#endif -#endif -} -#endif - // local memory per workgroup is 3 KB // clear local memory uint *ptr = (uint *) shared_array; From d7e34de5ef668e5c1da8a2bba3add18ac5fcd69d Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Tue, 9 Jun 2020 13:12:19 +0000 Subject: [PATCH 061/119] Initial CUDA work --- src/c_api.cpp | 168 +++++++++----------------------------------------- 1 file changed, 28 insertions(+), 140 deletions(-) diff --git a/src/c_api.cpp b/src/c_api.cpp index 03a3db597bb..6a54b5f1788 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -657,6 +657,20 @@ int LGBM_GetDeviceType() { #endif } +//LGBM_CUDA +void AdditionalConfig(Config *config) +{ +#ifdef USE_CUDA + if (config->device_type == std::string("cuda")){ + LightGBM::LGBM_config_::current_device=lgbm_device_cuda; + + config->is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ + if (config->bagging_fraction == 1.0){config->bagging_fraction = 0.8;} + if (config->bagging_freq == 0) {config->bagging_freq = 1;} + } +#endif +} + int LGBM_DatasetCreateFromFile(const char* filename, const char* parameters, const DatasetHandle reference, @@ -669,16 +683,7 @@ int LGBM_DatasetCreateFromFile(const char* filename, omp_set_num_threads(config.num_threads); } -//LGBM_CUDA -#ifdef USE_CUDA - if (config.device_type == std::string("cuda")){ - LightGBM::LGBM_config_::current_device=lgbm_device_cuda; - - config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is 
true) */ - if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} - if (config.bagging_freq == 0) {config.bagging_freq = 1;} - } -#endif + AdditionalConfig(&config); DatasetLoader loader(config, nullptr, 1, filename); if (reference == nullptr) { @@ -711,16 +716,7 @@ int LGBM_DatasetCreateFromSampledColumn(double** sample_data, omp_set_num_threads(config.num_threads); } -//LGBM_CUDA -#ifdef USE_CUDA - if (config.device_type == std::string("cuda")){ - LightGBM::LGBM_config_::current_device=lgbm_device_cuda; - - config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ - if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} - if (config.bagging_freq == 0) {config.bagging_freq = 1;} - } -#endif + AdditionalConfig(&config); DatasetLoader loader(config, nullptr, 1, nullptr); *out = loader.CostructFromSampleData(sample_data, sample_indices, ncol, num_per_col, @@ -834,16 +830,7 @@ int LGBM_DatasetCreateFromMats(int32_t nmat, omp_set_num_threads(config.num_threads); } -//LGBM_CUDA -#ifdef USE_CUDA - if (config.device_type == std::string("cuda")){ - LightGBM::LGBM_config_::current_device=lgbm_device_cuda; - - config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ - if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} - if (config.bagging_freq == 0) {config.bagging_freq = 1;} - } -#endif + AdditionalConfig(&config); std::unique_ptr ret; int32_t total_nrow = 0; @@ -937,16 +924,7 @@ int LGBM_DatasetCreateFromCSR(const void* indptr, omp_set_num_threads(config.num_threads); } -//LGBM_CUDA -#ifdef USE_CUDA - if (config.device_type == std::string("cuda")){ - LightGBM::LGBM_config_::current_device=lgbm_device_cuda; - - config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ - if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} - if (config.bagging_freq == 0) {config.bagging_freq = 1;} - } -#endif + AdditionalConfig(&config); std::unique_ptr ret; auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, nelem); @@ -1016,16 +994,7 @@ int LGBM_DatasetCreateFromCSRFunc(void* get_row_funptr, omp_set_num_threads(config.num_threads); } -//LGBM_CUDA -#ifdef USE_CUDA - if (config.device_type == std::string("cuda")){ - LightGBM::LGBM_config_::current_device=lgbm_device_cuda; - - config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ - if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} - if (config.bagging_freq == 0) {config.bagging_freq = 1;} - } -#endif + AdditionalConfig(&config); std::unique_ptr ret; int32_t nrow = num_rows; @@ -1099,16 +1068,7 @@ int LGBM_DatasetCreateFromCSC(const void* col_ptr, omp_set_num_threads(config.num_threads); } -//LGBM_CUDA -#ifdef USE_CUDA - if (config.device_type == std::string("cuda")){ - LightGBM::LGBM_config_::current_device=lgbm_device_cuda; - - config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ - if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} - if (config.bagging_freq == 0) {config.bagging_freq = 1;} - } -#endif + AdditionalConfig(&config); std::unique_ptr ret; int32_t nrow = static_cast(num_row); @@ -1194,16 +1154,7 @@ int LGBM_DatasetGetSubset( omp_set_num_threads(config.num_threads); } -//LGBM_CUDA -#ifdef USE_CUDA - if (config.device_type == std::string("cuda")){ - 
LightGBM::LGBM_config_::current_device=lgbm_device_cuda; - - config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ - if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} - if (config.bagging_freq == 0) {config.bagging_freq = 1;} - } -#endif + AdditionalConfig(&config); auto full_dataset = reinterpret_cast(handle); CHECK_GT(num_used_row_indices, 0); @@ -1601,16 +1552,7 @@ int LGBM_BoosterPredictForFile(BoosterHandle handle, omp_set_num_threads(config.num_threads); } -//LGBM_CUDA -#ifdef USE_CUDA - if (config.device_type == std::string("cuda")){ - LightGBM::LGBM_config_::current_device=lgbm_device_cuda; - - config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ - if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} - if (config.bagging_freq == 0) {config.bagging_freq = 1;} - } -#endif + AdditionalConfig(&config); Booster* ref_booster = reinterpret_cast(handle); ref_booster->Predict(num_iteration, predict_type, data_filename, data_has_header, @@ -1657,16 +1599,7 @@ int LGBM_BoosterPredictForCSR(BoosterHandle handle, omp_set_num_threads(config.num_threads); } -//LGBM_CUDA -#ifdef USE_CUDA - if (config.device_type == std::string("cuda")){ - LightGBM::LGBM_config_::current_device=lgbm_device_cuda; - - config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ - if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} - if (config.bagging_freq == 0) {config.bagging_freq = 1;} - } -#endif + AdditionalConfig(&config); Booster* ref_booster = reinterpret_cast(handle); auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, nelem); @@ -1703,16 +1636,7 @@ int LGBM_BoosterPredictForCSRSingleRow(BoosterHandle handle, omp_set_num_threads(config.num_threads); } -//LGBM_CUDA -#ifdef USE_CUDA - if (config.device_type == std::string("cuda")){ - LightGBM::LGBM_config_::current_device=lgbm_device_cuda; - - config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ - if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} - if (config.bagging_freq == 0) {config.bagging_freq = 1;} - } -#endif + AdditionalConfig(&config); Booster* ref_booster = reinterpret_cast(handle); auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, nelem); @@ -1744,16 +1668,7 @@ int LGBM_BoosterPredictForCSC(BoosterHandle handle, omp_set_num_threads(config.num_threads); } -//LGBM_CUDA -#ifdef USE_CUDA - if (config.device_type == std::string("cuda")){ - LightGBM::LGBM_config_::current_device=lgbm_device_cuda; - - config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ - if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} - if (config.bagging_freq == 0) {config.bagging_freq = 1;} - } -#endif + AdditionalConfig(&config); int num_threads = OMP_NUM_THREADS(); int ncol = static_cast(ncol_ptr - 1); @@ -1800,16 +1715,7 @@ int LGBM_BoosterPredictForMat(BoosterHandle handle, omp_set_num_threads(config.num_threads); } -//LGBM_CUDA -#ifdef USE_CUDA - if (config.device_type == std::string("cuda")){ - LightGBM::LGBM_config_::current_device=lgbm_device_cuda; - - config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ - if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} - if (config.bagging_freq == 0) {config.bagging_freq = 1;} - } 
-#endif + AdditionalConfig(&config); Booster* ref_booster = reinterpret_cast(handle); auto get_row_fun = RowPairFunctionFromDenseMatric(data, nrow, ncol, data_type, is_row_major); @@ -1836,16 +1742,7 @@ int LGBM_BoosterPredictForMatSingleRow(BoosterHandle handle, omp_set_num_threads(config.num_threads); } -//LGBM_CUDA -#ifdef USE_CUDA - if (config.device_type == std::string("cuda")){ - LightGBM::LGBM_config_::current_device=lgbm_device_cuda; - - config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ - if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} - if (config.bagging_freq == 0) {config.bagging_freq = 1;} - } -#endif + AdditionalConfig(&config); Booster* ref_booster = reinterpret_cast(handle); auto get_row_fun = RowPairFunctionFromDenseMatric(data, 1, ncol, data_type, is_row_major); @@ -1872,16 +1769,7 @@ int LGBM_BoosterPredictForMats(BoosterHandle handle, omp_set_num_threads(config.num_threads); } -//LGBM_CUDA -#ifdef USE_CUDA - if (config.device_type == std::string("cuda")){ - LightGBM::LGBM_config_::current_device=lgbm_device_cuda; - - config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ - if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} - if (config.bagging_freq == 0) {config.bagging_freq = 1;} - } -#endif + AdditionalConfig(&config); Booster* ref_booster = reinterpret_cast(handle); auto get_row_fun = RowPairFunctionFromDenseRows(data, ncol, data_type); From 8af3738907029d4e78d78ba0f450fb2d280df7e1 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Tue, 9 Jun 2020 16:55:03 +0000 Subject: [PATCH 062/119] Initial CUDA work --- src/c_api.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/c_api.cpp b/src/c_api.cpp index 6a54b5f1788..ba6ef705573 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -668,6 +668,8 @@ void AdditionalConfig(Config *config) if (config->bagging_fraction == 1.0){config->bagging_fraction = 0.8;} if (config->bagging_freq == 0) {config->bagging_freq = 1;} } +#else + (void)(config); // UNUSED #endif } From 903e52b752c2f0ed54e9a61edb6c0a8871cd6f77 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Tue, 9 Jun 2020 18:30:42 +0000 Subject: [PATCH 063/119] Initial CUDA work --- src/c_api.cpp | 53 ++++++++++++++++++--------------------------------- 1 file changed, 19 insertions(+), 34 deletions(-) diff --git a/src/c_api.cpp b/src/c_api.cpp index ba6ef705573..9ccac3893e5 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -43,6 +43,22 @@ inline int LGBM_APIHandleException(const std::string& ex) { return -1; } +//LGBM_CUDA +inline void AdditionalConfig(Config *config) +{ +#ifdef USE_CUDA + if (config->device_type == std::string("cuda")){ + LightGBM::LGBM_config_::current_device=lgbm_device_cuda; + + config->is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ + if (config->bagging_fraction == 1.0){config->bagging_fraction = 0.8;} + if (config->bagging_freq == 0) {config->bagging_freq = 1;} + } +#else + (void)(config); // UNUSED +#endif +} + #define API_BEGIN() try { #define API_END() } \ catch(std::exception& ex) { return LGBM_APIHandleException(ex); } \ @@ -120,16 +136,10 @@ class Booster { if (train_data->num_data() < 2048){ config_.device_type = std::string("cpu"); } - - if (config_.device_type == std::string("cuda")){ - LightGBM::LGBM_config_::current_device=lgbm_device_cuda; - - config_.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ - if 
(config_.bagging_fraction == 1.0){config_.bagging_fraction = 0.8;} - if (config_.bagging_freq == 0) {config_.bagging_freq = 1;} - } #endif + AdditionalConfig(&config_); + // create boosting if (config_.input_model.size() > 0) { Log::Warning("Continued train from model is not supported for c_api,\n" @@ -323,16 +333,7 @@ class Booster { omp_set_num_threads(config_.num_threads); } -//LGBM_CUDA -#ifdef USE_CUDA - if (config_.device_type == std::string("cuda")){ - LightGBM::LGBM_config_::current_device=lgbm_device_cuda; - - config_.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ - if (config_.bagging_fraction == 1.0){config_.bagging_fraction = 0.8;} - if (config_.bagging_freq == 0) {config_.bagging_freq = 1;} - } -#endif + AdditionalConfig(&config_); if (param.count("objective")) { // create objective function @@ -657,22 +658,6 @@ int LGBM_GetDeviceType() { #endif } -//LGBM_CUDA -void AdditionalConfig(Config *config) -{ -#ifdef USE_CUDA - if (config->device_type == std::string("cuda")){ - LightGBM::LGBM_config_::current_device=lgbm_device_cuda; - - config->is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ - if (config->bagging_fraction == 1.0){config->bagging_fraction = 0.8;} - if (config->bagging_freq == 0) {config->bagging_freq = 1;} - } -#else - (void)(config); // UNUSED -#endif -} - int LGBM_DatasetCreateFromFile(const char* filename, const char* parameters, const DatasetHandle reference, From 1efcad08a3fae62fd611ca7d601745a81c965287 Mon Sep 17 00:00:00 2001 From: Guolin Ke Date: Thu, 21 May 2020 02:50:58 +0800 Subject: [PATCH 064/119] redirect log to python console (#3090) * redir log to python console * fix pylint * Apply suggestions from code review * Update basic.py * Apply suggestions from code review Co-authored-by: Nikita Titov * Update c_api.h * Apply suggestions from code review * Apply suggestions from code review * super-minor: better wording Co-authored-by: Nikita Titov Co-authored-by: StrikerRUS --- src/c_api.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/c_api.cpp b/src/c_api.cpp index 9ccac3893e5..54fabba1665 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -658,6 +658,12 @@ int LGBM_GetDeviceType() { #endif } +int LGBM_RegisterLogCallback(void (*callback)(const char*)) { + API_BEGIN(); + Log::ResetCallBack(callback); + API_END(); +} + int LGBM_DatasetCreateFromFile(const char* filename, const char* parameters, const DatasetHandle reference, From 13c6450a84c484de0177c089d77e93cd3c0ddf9a Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 1 Jun 2020 20:13:32 +0100 Subject: [PATCH 065/119] re-order includes (fixes #3132) (#3133) --- R-package/src/lightgbm_R.cpp | 14 +++++++------- include/LightGBM/application.h | 6 +++--- include/LightGBM/bin.h | 8 ++++---- include/LightGBM/boosting.h | 6 +++--- include/LightGBM/c_api.h | 4 ++-- include/LightGBM/config.h | 10 +++++----- include/LightGBM/dataset.h | 14 +++++++------- include/LightGBM/dataset_loader.h | 4 ++-- include/LightGBM/feature_group.h | 8 ++++---- include/LightGBM/metric.h | 6 +++--- include/LightGBM/network.h | 8 ++++---- include/LightGBM/objective_function.h | 6 +++--- include/LightGBM/prediction_early_stop.h | 4 ++-- include/LightGBM/tree.h | 6 +++--- include/LightGBM/tree_learner.h | 6 +++--- include/LightGBM/utils/array_args.h | 6 +++--- include/LightGBM/utils/common.h | 5 ++--- include/LightGBM/utils/openmp_wrapper.h | 8 ++++---- include/LightGBM/utils/pipeline_reader.h | 6 +++--- 
include/LightGBM/utils/text_reader.h | 8 ++++---- include/LightGBM/utils/threading.h | 8 ++++---- src/application/application.cpp | 17 +++++++++-------- src/application/predictor.hpp | 12 ++++++------ src/boosting/dart.hpp | 4 ++-- src/boosting/gbdt.cpp | 9 +++++---- src/boosting/gbdt.h | 16 ++++++++-------- src/boosting/gbdt_model_text.cpp | 9 +++++---- src/boosting/gbdt_prediction.cpp | 4 ++-- src/boosting/goss.hpp | 8 ++++---- src/boosting/prediction_early_stop.cpp | 7 ++++--- src/boosting/rf.hpp | 6 +++--- src/boosting/score_updater.hpp | 6 +++--- src/c_api.cpp | 17 +++++++++++++---- src/io/bin.cpp | 11 ++++++----- src/io/config.cpp | 5 +++-- src/io/dataset.cpp | 13 +++++++------ src/io/dataset_loader.cpp | 5 +++-- src/io/dense_bin.hpp | 4 ++-- src/io/file_io.cpp | 7 ++++--- src/io/json11.cpp | 7 ++++--- src/io/metadata.cpp | 5 +++-- src/io/multi_val_dense_bin.hpp | 6 +++--- src/io/multi_val_sparse_bin.hpp | 6 +++--- src/io/parser.cpp | 3 ++- src/io/parser.hpp | 8 ++++---- src/io/sparse_bin.hpp | 8 ++++---- src/io/tree.cpp | 9 +++++---- src/main.cpp | 3 ++- src/metric/binary_metric.hpp | 8 ++++---- src/metric/dcg_calculator.cpp | 5 +++-- src/metric/map_metric.hpp | 10 +++++----- src/metric/multiclass_metric.hpp | 6 +++--- src/metric/rank_metric.hpp | 8 ++++---- src/metric/regression_metric.hpp | 6 +++--- src/metric/xentropy_metric.hpp | 10 +++++----- src/network/linker_topo.cpp | 7 ++++--- src/network/linkers.h | 10 +++++----- src/network/linkers_socket.cpp | 8 ++++---- src/network/network.cpp | 7 ++++--- src/network/socket_wrapper.hpp | 4 ++-- src/objective/binary_objective.hpp | 6 +++--- src/objective/multiclass_objective.hpp | 6 +++--- src/objective/rank_objective.hpp | 6 +++--- src/objective/regression_objective.hpp | 8 ++++---- src/objective/xentropy_objective.hpp | 8 ++++---- src/treelearner/col_sampler.hpp | 3 +++ .../cost_effective_gradient_boosting.hpp | 4 ++-- src/treelearner/data_partition.hpp | 8 ++++---- src/treelearner/feature_histogram.hpp | 8 ++++---- src/treelearner/gpu_tree_learner.cpp | 4 ++-- src/treelearner/gpu_tree_learner.h | 12 ++++++------ src/treelearner/leaf_splits.hpp | 4 ++-- src/treelearner/parallel_tree_learner.h | 6 +++--- src/treelearner/serial_tree_learner.cpp | 10 +++++----- src/treelearner/serial_tree_learner.h | 14 +++++++------- src/treelearner/split_info.hpp | 4 ++-- .../voting_parallel_tree_learner.cpp | 3 ++- 77 files changed, 299 insertions(+), 270 deletions(-) diff --git a/R-package/src/lightgbm_R.cpp b/R-package/src/lightgbm_R.cpp index f3165e1fa1a..14609272fa3 100644 --- a/R-package/src/lightgbm_R.cpp +++ b/R-package/src/lightgbm_R.cpp @@ -5,13 +5,6 @@ #include "lightgbm_R.h" -#include -#include -#include -#include - -#include - #include #include #include @@ -19,6 +12,13 @@ #include #include +#include +#include +#include +#include + +#include + #define COL_MAJOR (0) #define R_API_BEGIN() \ diff --git a/include/LightGBM/application.h b/include/LightGBM/application.h index 911dedd7d94..53f9732edea 100644 --- a/include/LightGBM/application.h +++ b/include/LightGBM/application.h @@ -5,12 +5,12 @@ #ifndef LIGHTGBM_APPLICATION_H_ #define LIGHTGBM_APPLICATION_H_ -#include -#include - #include #include +#include +#include + namespace LightGBM { class DatasetLoader; diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index 96ae6a8d641..fab69d9ba89 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -5,10 +5,6 @@ #ifndef LIGHTGBM_BIN_H_ #define LIGHTGBM_BIN_H_ -#include -#include -#include - #include #include #include @@ -16,6 
+12,10 @@ #include #include +#include +#include +#include + namespace LightGBM { enum BinType { diff --git a/include/LightGBM/boosting.h b/include/LightGBM/boosting.h index 31bb430f0ae..f456d798977 100644 --- a/include/LightGBM/boosting.h +++ b/include/LightGBM/boosting.h @@ -5,14 +5,14 @@ #ifndef LIGHTGBM_BOOSTING_H_ #define LIGHTGBM_BOOSTING_H_ -#include -#include - #include #include #include #include +#include +#include + namespace LightGBM { /*! \brief forward declaration */ diff --git a/include/LightGBM/c_api.h b/include/LightGBM/c_api.h index 3fbccdac075..553982eefed 100644 --- a/include/LightGBM/c_api.h +++ b/include/LightGBM/c_api.h @@ -13,12 +13,12 @@ #ifndef LIGHTGBM_C_API_H_ #define LIGHTGBM_C_API_H_ -#include - #include #include #include +#include + typedef void* DatasetHandle; /*!< \brief Handle of dataset. */ typedef void* BoosterHandle; /*!< \brief Handle of booster. */ diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 162c7583dc7..bbb62727623 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -11,11 +11,6 @@ #ifndef LIGHTGBM_CONFIG_H_ #define LIGHTGBM_CONFIG_H_ -#include -#include -#include -#include - #include #include #include @@ -23,6 +18,11 @@ #include #include +#include +#include +#include +#include + namespace LightGBM { /*! \brief Types of tasks */ diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index e4c5dc56511..bd0143daffd 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -5,13 +5,6 @@ #ifndef LIGHTGBM_DATASET_H_ #define LIGHTGBM_DATASET_H_ -#include -#include -#include -#include -#include -#include - #include #include #include @@ -20,6 +13,13 @@ #include #include +#include +#include +#include +#include +#include +#include + namespace LightGBM { /*! \brief forward declaration */ diff --git a/include/LightGBM/dataset_loader.h b/include/LightGBM/dataset_loader.h index 8d5b20b481f..88443d62472 100644 --- a/include/LightGBM/dataset_loader.h +++ b/include/LightGBM/dataset_loader.h @@ -5,12 +5,12 @@ #ifndef LIGHTGBM_DATASET_LOADER_H_ #define LIGHTGBM_DATASET_LOADER_H_ -#include - #include #include #include +#include + namespace LightGBM { class DatasetLoader { diff --git a/include/LightGBM/feature_group.h b/include/LightGBM/feature_group.h index d949beec20e..c21ad33b6a4 100644 --- a/include/LightGBM/feature_group.h +++ b/include/LightGBM/feature_group.h @@ -5,14 +5,14 @@ #ifndef LIGHTGBM_FEATURE_GROUP_H_ #define LIGHTGBM_FEATURE_GROUP_H_ -#include -#include -#include - #include #include #include +#include +#include +#include + namespace LightGBM { class Dataset; diff --git a/include/LightGBM/metric.h b/include/LightGBM/metric.h index 61d9fc99ea8..56fec3aad77 100644 --- a/include/LightGBM/metric.h +++ b/include/LightGBM/metric.h @@ -5,6 +5,9 @@ #ifndef LIGHTGBM_METRIC_H_ #define LIGHTGBM_METRIC_H_ +#include +#include + #include #include #include @@ -12,9 +15,6 @@ #include #include -#include -#include - namespace LightGBM { /*! diff --git a/include/LightGBM/network.h b/include/LightGBM/network.h index 32c24fe6984..40373508eb5 100644 --- a/include/LightGBM/network.h +++ b/include/LightGBM/network.h @@ -5,14 +5,14 @@ #ifndef LIGHTGBM_NETWORK_H_ #define LIGHTGBM_NETWORK_H_ -#include -#include -#include - #include #include #include +#include +#include +#include + namespace LightGBM { /*! 
\brief forward declaration */ diff --git a/include/LightGBM/objective_function.h b/include/LightGBM/objective_function.h index 5ea838dece2..76b3f7145ef 100644 --- a/include/LightGBM/objective_function.h +++ b/include/LightGBM/objective_function.h @@ -5,13 +5,13 @@ #ifndef LIGHTGBM_OBJECTIVE_FUNCTION_H_ #define LIGHTGBM_OBJECTIVE_FUNCTION_H_ +#include +#include + #include #include #include -#include -#include - namespace LightGBM { /*! * \brief The interface of Objective Function. diff --git a/include/LightGBM/prediction_early_stop.h b/include/LightGBM/prediction_early_stop.h index 1d3e510981f..40db533325f 100644 --- a/include/LightGBM/prediction_early_stop.h +++ b/include/LightGBM/prediction_early_stop.h @@ -5,11 +5,11 @@ #ifndef LIGHTGBM_PREDICTION_EARLY_STOP_H_ #define LIGHTGBM_PREDICTION_EARLY_STOP_H_ -#include - #include #include +#include + namespace LightGBM { struct PredictionEarlyStopInstance { diff --git a/include/LightGBM/tree.h b/include/LightGBM/tree.h index 5ce3ff9b3eb..f370bc74213 100644 --- a/include/LightGBM/tree.h +++ b/include/LightGBM/tree.h @@ -5,15 +5,15 @@ #ifndef LIGHTGBM_TREE_H_ #define LIGHTGBM_TREE_H_ -#include -#include - #include #include #include #include #include +#include +#include + namespace LightGBM { #define kCategoricalMask (1) diff --git a/include/LightGBM/tree_learner.h b/include/LightGBM/tree_learner.h index 6c549a5ed71..cdb3d2823b8 100644 --- a/include/LightGBM/tree_learner.h +++ b/include/LightGBM/tree_learner.h @@ -5,13 +5,13 @@ #ifndef LIGHTGBM_TREE_LEARNER_H_ #define LIGHTGBM_TREE_LEARNER_H_ +#include +#include + #include #include #include -#include -#include - namespace LightGBM { using json11::Json; diff --git a/include/LightGBM/utils/array_args.h b/include/LightGBM/utils/array_args.h index 0183ecc22dd..a071247fb28 100644 --- a/include/LightGBM/utils/array_args.h +++ b/include/LightGBM/utils/array_args.h @@ -5,13 +5,13 @@ #ifndef LIGHTGBM_UTILS_ARRAY_AGRS_H_ #define LIGHTGBM_UTILS_ARRAY_AGRS_H_ -#include -#include - #include #include #include +#include +#include + namespace LightGBM { /*! diff --git a/include/LightGBM/utils/common.h b/include/LightGBM/utils/common.h index bdc769e5222..663ea1730d3 100644 --- a/include/LightGBM/utils/common.h +++ b/include/LightGBM/utils/common.h @@ -5,9 +5,6 @@ #ifndef LIGHTGBM_UTILS_COMMON_FUN_H_ #define LIGHTGBM_UTILS_COMMON_FUN_H_ -#include -#include - #include #include #include @@ -30,6 +27,8 @@ #include #pragma intrinsic(_BitScanReverse) #endif +#include +#include #if defined(_MSC_VER) #include diff --git a/include/LightGBM/utils/openmp_wrapper.h b/include/LightGBM/utils/openmp_wrapper.h index 075c991371c..fdd4b3850fb 100644 --- a/include/LightGBM/utils/openmp_wrapper.h +++ b/include/LightGBM/utils/openmp_wrapper.h @@ -6,16 +6,16 @@ #define LIGHTGBM_OPENMP_WRAPPER_H_ #ifdef _OPENMP -#include - -#include - #include #include #include #include #include +#include + +#include + inline int OMP_NUM_THREADS() { int ret = 1; #pragma omp parallel diff --git a/include/LightGBM/utils/pipeline_reader.h b/include/LightGBM/utils/pipeline_reader.h index f02500c9751..4e07b8b3674 100644 --- a/include/LightGBM/utils/pipeline_reader.h +++ b/include/LightGBM/utils/pipeline_reader.h @@ -5,9 +5,6 @@ #ifndef LIGHTGBM_UTILS_PIPELINE_READER_H_ #define LIGHTGBM_UTILS_PIPELINE_READER_H_ -#include -#include - #include #include #include @@ -16,6 +13,9 @@ #include #include +#include +#include + namespace LightGBM { /*! 
diff --git a/include/LightGBM/utils/text_reader.h b/include/LightGBM/utils/text_reader.h index 638bb268362..7aaf7f8153a 100644 --- a/include/LightGBM/utils/text_reader.h +++ b/include/LightGBM/utils/text_reader.h @@ -5,16 +5,16 @@ #ifndef LIGHTGBM_UTILS_TEXT_READER_H_ #define LIGHTGBM_UTILS_TEXT_READER_H_ -#include -#include -#include - #include #include #include #include #include +#include +#include +#include + namespace LightGBM { const size_t kGbs = size_t(1024) * 1024 * 1024; diff --git a/include/LightGBM/utils/threading.h b/include/LightGBM/utils/threading.h index d293fc811eb..dcf4f7608af 100644 --- a/include/LightGBM/utils/threading.h +++ b/include/LightGBM/utils/threading.h @@ -6,14 +6,14 @@ #ifndef LIGHTGBM_UTILS_THREADING_H_ #define LIGHTGBM_UTILS_THREADING_H_ -#include -#include -#include - #include #include #include +#include +#include +#include + namespace LightGBM { class Threading { diff --git a/src/application/application.cpp b/src/application/application.cpp index 1b9eabf8a12..a46cf419c53 100644 --- a/src/application/application.cpp +++ b/src/application/application.cpp @@ -2,6 +2,15 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ + +#include +#include +#include +#include +#include +#include +#include + #include #include @@ -15,14 +24,6 @@ #include #include -#include -#include -#include -#include -#include -#include -#include - #include "predictor.hpp" #ifdef USE_CUDA diff --git a/src/application/predictor.hpp b/src/application/predictor.hpp index 1c56cfa5eb2..ab775d42913 100644 --- a/src/application/predictor.hpp +++ b/src/application/predictor.hpp @@ -5,12 +5,6 @@ #ifndef LIGHTGBM_PREDICTOR_HPP_ #define LIGHTGBM_PREDICTOR_HPP_ -#include -#include -#include -#include -#include - #include #include #include @@ -21,6 +15,12 @@ #include #include +#include +#include +#include +#include +#include + namespace LightGBM { /*! diff --git a/src/boosting/dart.hpp b/src/boosting/dart.hpp index e2481e79772..b9dca6a78f2 100644 --- a/src/boosting/dart.hpp +++ b/src/boosting/dart.hpp @@ -5,14 +5,14 @@ #ifndef LIGHTGBM_BOOSTING_DART_H_ #define LIGHTGBM_BOOSTING_DART_H_ -#include - #include #include #include #include #include +#include + #include "gbdt.h" #include "score_updater.hpp" diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 24264c3c175..6199d726df9 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -2,8 +2,13 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. 
*/ + #include "gbdt.h" +#include +#include +#include + #include #include #include @@ -11,10 +16,6 @@ #include #include -#include -#include -#include - namespace LightGBM { #ifdef USE_CUDA diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index d460894d44b..420d5479e04 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -5,12 +5,6 @@ #ifndef LIGHTGBM_BOOSTING_GBDT_H_ #define LIGHTGBM_BOOSTING_GBDT_H_ -#include -#include -#include -#include -#include - #include #include #include @@ -22,12 +16,18 @@ #include #include +#include +#include +#include +#include +#include + +#include "score_updater.hpp" + #ifdef USE_CUDA #include //LGBM_CUDA #endif -#include "score_updater.hpp" - namespace LightGBM { using json11::Json; diff --git a/src/boosting/gbdt_model_text.cpp b/src/boosting/gbdt_model_text.cpp index 5ce26bca95c..9ac4b269ac1 100644 --- a/src/boosting/gbdt_model_text.cpp +++ b/src/boosting/gbdt_model_text.cpp @@ -2,16 +2,17 @@ * Copyright (c) 2017 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ + +#include +#include +#include + #include #include #include #include #include -#include -#include -#include - #include "gbdt.h" namespace LightGBM { diff --git a/src/boosting/gbdt_prediction.cpp b/src/boosting/gbdt_prediction.cpp index b4711f7c01a..e906bc0aaca 100644 --- a/src/boosting/gbdt_prediction.cpp +++ b/src/boosting/gbdt_prediction.cpp @@ -2,12 +2,12 @@ * Copyright (c) 2017 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ +#include "gbdt.h" + #include #include #include -#include "gbdt.h" - namespace LightGBM { void GBDT::PredictRaw(const double* features, double* output, const PredictionEarlyStopInstance* early_stop) const { diff --git a/src/boosting/goss.hpp b/src/boosting/goss.hpp index 2af6dee14f6..d3a3c6a344c 100644 --- a/src/boosting/goss.hpp +++ b/src/boosting/goss.hpp @@ -5,10 +5,6 @@ #ifndef LIGHTGBM_BOOSTING_GOSS_H_ #define LIGHTGBM_BOOSTING_GOSS_H_ -#include -#include -#include - #include #include #include @@ -16,6 +12,10 @@ #include #include +#include +#include +#include + #include "gbdt.h" #include "score_updater.hpp" diff --git a/src/boosting/prediction_early_stop.cpp b/src/boosting/prediction_early_stop.cpp index 7e21141f685..7eda08f00d6 100644 --- a/src/boosting/prediction_early_stop.cpp +++ b/src/boosting/prediction_early_stop.cpp @@ -2,15 +2,16 @@ * Copyright (c) 2017 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. 
*/ -#include - -#include #include #include #include #include +#include + +#include + namespace LightGBM { PredictionEarlyStopInstance CreateNone(const PredictionEarlyStopConfig&) { diff --git a/src/boosting/rf.hpp b/src/boosting/rf.hpp index e64bf6cb4d8..8bbc1a3ebee 100644 --- a/src/boosting/rf.hpp +++ b/src/boosting/rf.hpp @@ -5,9 +5,6 @@ #ifndef LIGHTGBM_BOOSTING_RF_H_ #define LIGHTGBM_BOOSTING_RF_H_ -#include -#include - #include #include #include @@ -15,6 +12,9 @@ #include #include +#include +#include + #include "gbdt.h" #include "score_updater.hpp" diff --git a/src/boosting/score_updater.hpp b/src/boosting/score_updater.hpp index 7446691a470..231de245068 100644 --- a/src/boosting/score_updater.hpp +++ b/src/boosting/score_updater.hpp @@ -5,15 +5,15 @@ #ifndef LIGHTGBM_BOOSTING_SCORE_UPDATER_HPP_ #define LIGHTGBM_BOOSTING_SCORE_UPDATER_HPP_ +#include +#include + #include #include #include #include #include -#include -#include - namespace LightGBM { /*! * \brief Used to store and update score for data diff --git a/src/c_api.cpp b/src/c_api.cpp index 54fabba1665..979ab104b74 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -2,6 +2,15 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ + +#include +#include +#include +#include +#include +#include +#include + #include #include @@ -18,10 +27,6 @@ #include #include -#ifdef USE_CUDA -#include -#endif - #include #include #include @@ -32,6 +37,10 @@ #include "application/predictor.hpp" +#ifdef USE_CUDA +#include +#endif + namespace LightGBM { inline int LGBM_APIHandleException(const std::exception& ex) { diff --git a/src/io/bin.cpp b/src/io/bin.cpp index 367edaa3f7b..9ead232fda0 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -2,17 +2,18 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ -#include - -#include -#include -#include #include #include #include #include +#include + +#include +#include +#include + #include "dense_bin.hpp" #include "multi_val_dense_bin.hpp" #include "multi_val_sparse_bin.hpp" diff --git a/src/io/config.cpp b/src/io/config.cpp index 18c0562a676..4e4d8dbc794 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -2,14 +2,15 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ + +#include + #include #include #include #include -#include - namespace LightGBM { void Config::KV2Map(std::unordered_map* params, const char* kv) { diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index edae575f345..1001a9432ce 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -3,12 +3,6 @@ * Licensed under the MIT License. See LICENSE file in the project root for * license information. */ -#include - -#include -#include -#include -#include #ifdef USE_CUDA #include @@ -20,6 +14,13 @@ #include #include +#include + +#include +#include +#include +#include + namespace LightGBM { const char* Dataset::binary_file_token = diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index c0b2edf1a8c..33ce1df7eb0 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -2,6 +2,9 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. 
*/ + +#include + #include #include @@ -10,8 +13,6 @@ #include #include -#include - namespace LightGBM { using json11::Json; diff --git a/src/io/dense_bin.hpp b/src/io/dense_bin.hpp index 99feadf9f7f..10d988b68e0 100644 --- a/src/io/dense_bin.hpp +++ b/src/io/dense_bin.hpp @@ -6,8 +6,6 @@ #ifndef LIGHTGBM_IO_DENSE_BIN_HPP_ #define LIGHTGBM_IO_DENSE_BIN_HPP_ -#include - #include #include #include @@ -18,6 +16,8 @@ #include // LGBM_CUDA +#include + namespace LightGBM { template diff --git a/src/io/file_io.cpp b/src/io/file_io.cpp index a205964287e..67a719de0a8 100644 --- a/src/io/file_io.cpp +++ b/src/io/file_io.cpp @@ -3,14 +3,15 @@ * Licensed under the MIT License. See LICENSE file in the project root for * license information. */ -#include - -#include #include #include #include +#include + +#include + #ifdef USE_HDFS #include #endif diff --git a/src/io/json11.cpp b/src/io/json11.cpp index db21c6aab54..a3fec7724b5 100644 --- a/src/io/json11.cpp +++ b/src/io/json11.cpp @@ -18,15 +18,16 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ -#include - -#include #include #include #include #include +#include + +#include + namespace json11 { static const int max_depth = 200; diff --git a/src/io/metadata.cpp b/src/io/metadata.cpp index ea0d5b08def..9b540045650 100644 --- a/src/io/metadata.cpp +++ b/src/io/metadata.cpp @@ -2,12 +2,13 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ -#include -#include #include #include +#include +#include + namespace LightGBM { Metadata::Metadata() { diff --git a/src/io/multi_val_dense_bin.hpp b/src/io/multi_val_dense_bin.hpp index 7645530d774..d7c6599c381 100644 --- a/src/io/multi_val_dense_bin.hpp +++ b/src/io/multi_val_dense_bin.hpp @@ -5,14 +5,14 @@ #ifndef LIGHTGBM_IO_MULTI_VAL_DENSE_BIN_HPP_ #define LIGHTGBM_IO_MULTI_VAL_DENSE_BIN_HPP_ -#include -#include - #include #include #include #include +#include +#include + namespace LightGBM { template diff --git a/src/io/multi_val_sparse_bin.hpp b/src/io/multi_val_sparse_bin.hpp index ec3f64a11a0..09c13420c66 100644 --- a/src/io/multi_val_sparse_bin.hpp +++ b/src/io/multi_val_sparse_bin.hpp @@ -5,14 +5,14 @@ #ifndef LIGHTGBM_IO_MULTI_VAL_SPARSE_BIN_HPP_ #define LIGHTGBM_IO_MULTI_VAL_SPARSE_BIN_HPP_ -#include -#include - #include #include #include #include +#include +#include + namespace LightGBM { template diff --git a/src/io/parser.cpp b/src/io/parser.cpp index df14ea87a99..c30da4305f9 100644 --- a/src/io/parser.cpp +++ b/src/io/parser.cpp @@ -2,7 +2,6 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. 
*/ -#include "parser.hpp" #include #include @@ -11,6 +10,8 @@ #include #include +#include "parser.hpp" + namespace LightGBM { void GetStatistic(const char* str, int* comma_cnt, int* tab_cnt, int* colon_cnt) { diff --git a/src/io/parser.hpp b/src/io/parser.hpp index 1cfde0635a5..43764a68e75 100644 --- a/src/io/parser.hpp +++ b/src/io/parser.hpp @@ -5,14 +5,14 @@ #ifndef LIGHTGBM_IO_PARSER_HPP_ #define LIGHTGBM_IO_PARSER_HPP_ -#include -#include -#include - #include #include #include +#include +#include +#include + namespace LightGBM { class CSVParser: public Parser { diff --git a/src/io/sparse_bin.hpp b/src/io/sparse_bin.hpp index c56cd6da99d..730ea161a2a 100644 --- a/src/io/sparse_bin.hpp +++ b/src/io/sparse_bin.hpp @@ -6,10 +6,6 @@ #ifndef LIGHTGBM_IO_SPARSE_BIN_HPP_ #define LIGHTGBM_IO_SPARSE_BIN_HPP_ -#include -#include -#include - #include #include #include @@ -17,6 +13,10 @@ #include #include +#include +#include +#include + namespace LightGBM { template diff --git a/src/io/tree.cpp b/src/io/tree.cpp index 63641311787..759d334ef98 100644 --- a/src/io/tree.cpp +++ b/src/io/tree.cpp @@ -2,16 +2,17 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ + +#include +#include +#include + #include #include #include #include -#include -#include -#include - namespace LightGBM { Tree::Tree(int max_leaves, bool track_branch_features) diff --git a/src/main.cpp b/src/main.cpp index ef277ac0c1f..0a8931ae4ff 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -2,10 +2,11 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ -#include #include +#include + #include "network/linkers.h" int main(int argc, char** argv) { diff --git a/src/metric/binary_metric.hpp b/src/metric/binary_metric.hpp index 00a51d6cd24..5bde453cdba 100644 --- a/src/metric/binary_metric.hpp +++ b/src/metric/binary_metric.hpp @@ -5,15 +5,15 @@ #ifndef LIGHTGBM_METRIC_BINARY_METRIC_HPP_ #define LIGHTGBM_METRIC_BINARY_METRIC_HPP_ -#include -#include -#include - #include #include #include #include +#include +#include +#include + namespace LightGBM { /*! diff --git a/src/metric/dcg_calculator.cpp b/src/metric/dcg_calculator.cpp index 58843d89f9e..cd477612bdc 100644 --- a/src/metric/dcg_calculator.cpp +++ b/src/metric/dcg_calculator.cpp @@ -2,13 +2,14 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ -#include -#include #include #include #include +#include +#include + namespace LightGBM { /*! 
\brief Declaration for some static members */ diff --git a/src/metric/map_metric.hpp b/src/metric/map_metric.hpp index 18539ee44ee..b373a02b56f 100644 --- a/src/metric/map_metric.hpp +++ b/src/metric/map_metric.hpp @@ -5,16 +5,16 @@ #ifndef LIGHTGBM_METRIC_MAP_METRIC_HPP_ #define LIGHTGBM_METRIC_MAP_METRIC_HPP_ -#include -#include -#include -#include - #include #include #include #include +#include +#include +#include +#include + namespace LightGBM { class MapMetric:public Metric { diff --git a/src/metric/multiclass_metric.hpp b/src/metric/multiclass_metric.hpp index 59548cd3a79..f60588cac3f 100644 --- a/src/metric/multiclass_metric.hpp +++ b/src/metric/multiclass_metric.hpp @@ -5,14 +5,14 @@ #ifndef LIGHTGBM_METRIC_MULTICLASS_METRIC_HPP_ #define LIGHTGBM_METRIC_MULTICLASS_METRIC_HPP_ -#include -#include - #include #include #include #include +#include +#include + namespace LightGBM { /*! * \brief Metric for multiclass task. diff --git a/src/metric/rank_metric.hpp b/src/metric/rank_metric.hpp index 3b3afb547eb..d9227502009 100644 --- a/src/metric/rank_metric.hpp +++ b/src/metric/rank_metric.hpp @@ -5,15 +5,15 @@ #ifndef LIGHTGBM_METRIC_RANK_METRIC_HPP_ #define LIGHTGBM_METRIC_RANK_METRIC_HPP_ +#include +#include +#include + #include #include #include #include -#include -#include -#include - namespace LightGBM { class NDCGMetric:public Metric { diff --git a/src/metric/regression_metric.hpp b/src/metric/regression_metric.hpp index 4d1a3662142..3085bc941b3 100644 --- a/src/metric/regression_metric.hpp +++ b/src/metric/regression_metric.hpp @@ -5,14 +5,14 @@ #ifndef LIGHTGBM_METRIC_REGRESSION_METRIC_HPP_ #define LIGHTGBM_METRIC_REGRESSION_METRIC_HPP_ -#include -#include - #include #include #include #include +#include +#include + namespace LightGBM { /*! * \brief Metric for regression task. diff --git a/src/metric/xentropy_metric.hpp b/src/metric/xentropy_metric.hpp index bec611d28e5..1b86e60e640 100644 --- a/src/metric/xentropy_metric.hpp +++ b/src/metric/xentropy_metric.hpp @@ -5,16 +5,16 @@ #ifndef LIGHTGBM_METRIC_XENTROPY_METRIC_HPP_ #define LIGHTGBM_METRIC_XENTROPY_METRIC_HPP_ -#include -#include -#include -#include - #include #include #include #include +#include +#include +#include +#include + /* * Implements three related metrics: * diff --git a/src/network/linker_topo.cpp b/src/network/linker_topo.cpp index 1d7b2990f0e..102fdc993cd 100644 --- a/src/network/linker_topo.cpp +++ b/src/network/linker_topo.cpp @@ -2,14 +2,15 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. 
*/ -#include -#include -#include #include #include #include +#include +#include +#include + namespace LightGBM { diff --git a/src/network/linkers.h b/src/network/linkers.h index 8ddbb902a15..5a91d40f73d 100644 --- a/src/network/linkers.h +++ b/src/network/linkers.h @@ -5,11 +5,6 @@ #ifndef LIGHTGBM_NETWORK_LINKERS_H_ #define LIGHTGBM_NETWORK_LINKERS_H_ -#include -#include -#include -#include - #include #include #include @@ -18,6 +13,11 @@ #include #include +#include +#include +#include +#include + #ifdef USE_SOCKET #include "socket_wrapper.hpp" #endif diff --git a/src/network/linkers_socket.cpp b/src/network/linkers_socket.cpp index 40c6de7aab6..708e3ae9bad 100644 --- a/src/network/linkers_socket.cpp +++ b/src/network/linkers_socket.cpp @@ -4,10 +4,6 @@ */ #ifdef USE_SOCKET -#include -#include -#include - #include #include #include @@ -16,6 +12,10 @@ #include #include +#include +#include +#include + #include "linkers.h" namespace LightGBM { diff --git a/src/network/network.cpp b/src/network/network.cpp index 3976d72a692..cbc241d88c6 100644 --- a/src/network/network.cpp +++ b/src/network/network.cpp @@ -2,13 +2,14 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ -#include - -#include #include #include +#include + +#include + #include "linkers.h" namespace LightGBM { diff --git a/src/network/socket_wrapper.hpp b/src/network/socket_wrapper.hpp index 70f9586b99c..87bd88f934a 100644 --- a/src/network/socket_wrapper.hpp +++ b/src/network/socket_wrapper.hpp @@ -6,13 +6,13 @@ #define LIGHTGBM_NETWORK_SOCKET_WRAPPER_HPP_ #ifdef USE_SOCKET -#include - #include #include #include #include +#include + #if defined(_WIN32) #ifdef _MSC_VER diff --git a/src/objective/binary_objective.hpp b/src/objective/binary_objective.hpp index 4861bd1b83f..dff3a9c1097 100644 --- a/src/objective/binary_objective.hpp +++ b/src/objective/binary_objective.hpp @@ -5,15 +5,15 @@ #ifndef LIGHTGBM_OBJECTIVE_BINARY_OBJECTIVE_HPP_ #define LIGHTGBM_OBJECTIVE_BINARY_OBJECTIVE_HPP_ -#include -#include - #include #include #include #include #include +#include +#include + namespace LightGBM { /*! * \brief Objective function for binary classification diff --git a/src/objective/multiclass_objective.hpp b/src/objective/multiclass_objective.hpp index c133e1f75fd..da604baeb49 100644 --- a/src/objective/multiclass_objective.hpp +++ b/src/objective/multiclass_objective.hpp @@ -5,9 +5,6 @@ #ifndef LIGHTGBM_OBJECTIVE_MULTICLASS_OBJECTIVE_HPP_ #define LIGHTGBM_OBJECTIVE_MULTICLASS_OBJECTIVE_HPP_ -#include -#include - #include #include #include @@ -15,6 +12,9 @@ #include #include +#include +#include + #include "binary_objective.hpp" namespace LightGBM { diff --git a/src/objective/rank_objective.hpp b/src/objective/rank_objective.hpp index 1b20721e238..730d358f030 100644 --- a/src/objective/rank_objective.hpp +++ b/src/objective/rank_objective.hpp @@ -6,9 +6,6 @@ #ifndef LIGHTGBM_OBJECTIVE_RANK_OBJECTIVE_HPP_ #define LIGHTGBM_OBJECTIVE_RANK_OBJECTIVE_HPP_ -#include -#include - #include #include #include @@ -17,6 +14,9 @@ #include #include +#include +#include + namespace LightGBM { /*! 
diff --git a/src/objective/regression_objective.hpp b/src/objective/regression_objective.hpp index 53fa7020be5..fe391f2c15f 100644 --- a/src/objective/regression_objective.hpp +++ b/src/objective/regression_objective.hpp @@ -5,14 +5,14 @@ #ifndef LIGHTGBM_OBJECTIVE_REGRESSION_OBJECTIVE_HPP_ #define LIGHTGBM_OBJECTIVE_REGRESSION_OBJECTIVE_HPP_ -#include -#include -#include - #include #include #include +#include +#include +#include + namespace LightGBM { #define PercentileFun(T, data_reader, cnt_data, alpha) \ diff --git a/src/objective/xentropy_objective.hpp b/src/objective/xentropy_objective.hpp index cad2b8faafd..3a145bd2cbb 100644 --- a/src/objective/xentropy_objective.hpp +++ b/src/objective/xentropy_objective.hpp @@ -5,16 +5,16 @@ #ifndef LIGHTGBM_OBJECTIVE_XENTROPY_OBJECTIVE_HPP_ #define LIGHTGBM_OBJECTIVE_XENTROPY_OBJECTIVE_HPP_ -#include -#include -#include - #include #include #include #include #include +#include +#include +#include + /* * Implements gradients and hessians for the following point losses. * Target y is anything in interval [0, 1]. diff --git a/src/treelearner/col_sampler.hpp b/src/treelearner/col_sampler.hpp index cd288481255..4876827eb39 100644 --- a/src/treelearner/col_sampler.hpp +++ b/src/treelearner/col_sampler.hpp @@ -6,6 +6,9 @@ #ifndef LIGHTGBM_TREELEARNER_COL_SAMPLER_HPP_ #define LIGHTGBM_TREELEARNER_COL_SAMPLER_HPP_ +#include +#include + #include #include #include diff --git a/src/treelearner/cost_effective_gradient_boosting.hpp b/src/treelearner/cost_effective_gradient_boosting.hpp index fad966e6487..fda2bffc820 100644 --- a/src/treelearner/cost_effective_gradient_boosting.hpp +++ b/src/treelearner/cost_effective_gradient_boosting.hpp @@ -5,13 +5,13 @@ #ifndef LIGHTGBM_TREELEARNER_COST_EFFECTIVE_GRADIENT_BOOSTING_HPP_ #define LIGHTGBM_TREELEARNER_COST_EFFECTIVE_GRADIENT_BOOSTING_HPP_ +#include + #include #include #include #include -#include - #include "data_partition.hpp" #include "serial_tree_learner.h" #include "split_info.hpp" diff --git a/src/treelearner/data_partition.hpp b/src/treelearner/data_partition.hpp index 01c5d2606e7..bdae960c1d9 100644 --- a/src/treelearner/data_partition.hpp +++ b/src/treelearner/data_partition.hpp @@ -5,15 +5,15 @@ #ifndef LIGHTGBM_TREELEARNER_DATA_PARTITION_HPP_ #define LIGHTGBM_TREELEARNER_DATA_PARTITION_HPP_ +#include +#include +#include + #include #include #include #include -#include -#include -#include - namespace LightGBM { /*! * \brief DataPartition is used to store the the partition of data on tree. 
diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp index 8916ee48fd4..4dd5f29af0a 100644 --- a/src/treelearner/feature_histogram.hpp +++ b/src/treelearner/feature_histogram.hpp @@ -6,10 +6,6 @@ #ifndef LIGHTGBM_TREELEARNER_FEATURE_HISTOGRAM_HPP_ #define LIGHTGBM_TREELEARNER_FEATURE_HISTOGRAM_HPP_ -#include -#include -#include - #include #include #include @@ -17,6 +13,10 @@ #include #include +#include +#include +#include + #include "monotone_constraints.hpp" #include "split_info.hpp" diff --git a/src/treelearner/gpu_tree_learner.cpp b/src/treelearner/gpu_tree_learner.cpp index fad02e1c044..42967e138c7 100644 --- a/src/treelearner/gpu_tree_learner.cpp +++ b/src/treelearner/gpu_tree_learner.cpp @@ -6,12 +6,12 @@ #include "gpu_tree_learner.h" +#include + #include #include #include -#include - #include "../io/dense_bin.hpp" #define GPU_DEBUG 0 diff --git a/src/treelearner/gpu_tree_learner.h b/src/treelearner/gpu_tree_learner.h index 598e8d40ac9..c0607cbe4a2 100644 --- a/src/treelearner/gpu_tree_learner.h +++ b/src/treelearner/gpu_tree_learner.h @@ -5,12 +5,6 @@ #ifndef LIGHTGBM_TREELEARNER_GPU_TREE_LEARNER_H_ #define LIGHTGBM_TREELEARNER_GPU_TREE_LEARNER_H_ -#include -#include -#include -#include -#include - #include #include #include @@ -18,6 +12,12 @@ #include #include +#include +#include +#include +#include +#include + #include "data_partition.hpp" #include "feature_histogram.hpp" #include "leaf_splits.hpp" diff --git a/src/treelearner/leaf_splits.hpp b/src/treelearner/leaf_splits.hpp index b0a753eafe1..5c94846eb7b 100644 --- a/src/treelearner/leaf_splits.hpp +++ b/src/treelearner/leaf_splits.hpp @@ -5,11 +5,11 @@ #ifndef LIGHTGBM_TREELEARNER_LEAF_SPLITS_HPP_ #define LIGHTGBM_TREELEARNER_LEAF_SPLITS_HPP_ -#include - #include #include +#include + #include "data_partition.hpp" namespace LightGBM { diff --git a/src/treelearner/parallel_tree_learner.h b/src/treelearner/parallel_tree_learner.h index 2fdf542d421..5fbad7352e0 100644 --- a/src/treelearner/parallel_tree_learner.h +++ b/src/treelearner/parallel_tree_learner.h @@ -5,13 +5,13 @@ #ifndef LIGHTGBM_TREELEARNER_PARALLEL_TREE_LEARNER_H_ #define LIGHTGBM_TREELEARNER_PARALLEL_TREE_LEARNER_H_ -#include -#include - #include #include #include +#include +#include + #include "gpu_tree_learner.h" #include "serial_tree_learner.h" #include "cuda_tree_learner.h" diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index e5b6626a6bd..9aac1bdf1fd 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -4,16 +4,16 @@ */ #include "serial_tree_learner.h" -#include -#include -#include -#include - #include #include #include #include +#include +#include +#include +#include + #include "cost_effective_gradient_boosting.hpp" namespace LightGBM { diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 367c262192c..6fc0c2cb968 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -5,13 +5,6 @@ #ifndef LIGHTGBM_TREELEARNER_SERIAL_TREE_LEARNER_H_ #define LIGHTGBM_TREELEARNER_SERIAL_TREE_LEARNER_H_ -#include -#include -#include -#include -#include -#include - #include #include #include @@ -19,6 +12,13 @@ #include #include +#include +#include +#include +#include +#include +#include + #include "col_sampler.hpp" #include "data_partition.hpp" #include "feature_histogram.hpp" diff --git a/src/treelearner/split_info.hpp b/src/treelearner/split_info.hpp index 
492434d5160..72dd3fa324c 100644 --- a/src/treelearner/split_info.hpp +++ b/src/treelearner/split_info.hpp @@ -5,8 +5,6 @@ #ifndef LIGHTGBM_TREELEARNER_SPLIT_INFO_HPP_ #define LIGHTGBM_TREELEARNER_SPLIT_INFO_HPP_ -#include - #include #include #include @@ -14,6 +12,8 @@ #include #include +#include + namespace LightGBM { /*! diff --git a/src/treelearner/voting_parallel_tree_learner.cpp b/src/treelearner/voting_parallel_tree_learner.cpp index 58f5b88d6b0..043bf0e11b3 100644 --- a/src/treelearner/voting_parallel_tree_learner.cpp +++ b/src/treelearner/voting_parallel_tree_learner.cpp @@ -2,12 +2,13 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ -#include #include #include #include +#include + #include "parallel_tree_learner.h" namespace LightGBM { From 7a6bbb5bd37f7baaf0d929f45f2e5989f786dcfa Mon Sep 17 00:00:00 2001 From: Nikita Titov Date: Fri, 5 Jun 2020 18:53:07 +0300 Subject: [PATCH 066/119] Revert "re-order includes (fixes #3132) (#3133)" (#3153) This reverts commit 656d2676c2174781c91747ba158cb6d27f4cacbd. --- R-package/src/lightgbm_R.cpp | 14 +++++++------- include/LightGBM/application.h | 6 +++--- include/LightGBM/bin.h | 8 ++++---- include/LightGBM/boosting.h | 6 +++--- include/LightGBM/c_api.h | 4 ++-- include/LightGBM/config.h | 10 +++++----- include/LightGBM/dataset.h | 14 +++++++------- include/LightGBM/dataset_loader.h | 4 ++-- include/LightGBM/feature_group.h | 8 ++++---- include/LightGBM/metric.h | 6 +++--- include/LightGBM/network.h | 8 ++++---- include/LightGBM/objective_function.h | 6 +++--- include/LightGBM/prediction_early_stop.h | 4 ++-- include/LightGBM/tree.h | 6 +++--- include/LightGBM/tree_learner.h | 6 +++--- include/LightGBM/utils/array_args.h | 6 +++--- include/LightGBM/utils/common.h | 3 +++ include/LightGBM/utils/openmp_wrapper.h | 8 ++++---- include/LightGBM/utils/pipeline_reader.h | 6 +++--- include/LightGBM/utils/text_reader.h | 8 ++++---- include/LightGBM/utils/threading.h | 8 ++++---- src/application/application.cpp | 17 ++++++++--------- src/application/predictor.hpp | 12 ++++++------ src/boosting/dart.hpp | 4 ++-- src/boosting/gbdt.cpp | 9 ++++----- src/boosting/gbdt.h | 12 ++++++------ src/boosting/gbdt_model_text.cpp | 9 ++++----- src/boosting/gbdt_prediction.cpp | 4 ++-- src/boosting/goss.hpp | 8 ++++---- src/boosting/prediction_early_stop.cpp | 7 +++---- src/boosting/rf.hpp | 6 +++--- src/boosting/score_updater.hpp | 6 +++--- src/c_api.cpp | 9 --------- src/io/bin.cpp | 11 +++++------ src/io/config.cpp | 5 ++--- src/io/dataset.cpp | 18 +++++++++--------- src/io/dataset_loader.cpp | 5 ++--- src/io/dense_bin.hpp | 4 ++-- src/io/file_io.cpp | 7 +++---- src/io/json11.cpp | 7 +++---- src/io/metadata.cpp | 5 ++--- src/io/multi_val_dense_bin.hpp | 6 +++--- src/io/multi_val_sparse_bin.hpp | 6 +++--- src/io/parser.cpp | 3 +-- src/io/parser.hpp | 8 ++++---- src/io/sparse_bin.hpp | 8 ++++---- src/io/tree.cpp | 9 ++++----- src/main.cpp | 3 +-- src/metric/binary_metric.hpp | 8 ++++---- src/metric/dcg_calculator.cpp | 5 ++--- src/metric/map_metric.hpp | 10 +++++----- src/metric/multiclass_metric.hpp | 6 +++--- src/metric/rank_metric.hpp | 8 ++++---- src/metric/regression_metric.hpp | 6 +++--- src/metric/xentropy_metric.hpp | 10 +++++----- src/network/linker_topo.cpp | 7 +++---- src/network/linkers.h | 10 +++++----- src/network/linkers_socket.cpp | 8 ++++---- src/network/network.cpp | 7 +++---- src/network/socket_wrapper.hpp | 4 ++-- 
src/objective/binary_objective.hpp | 6 +++--- src/objective/multiclass_objective.hpp | 6 +++--- src/objective/rank_objective.hpp | 6 +++--- src/objective/regression_objective.hpp | 8 ++++---- src/objective/xentropy_objective.hpp | 8 ++++---- src/treelearner/col_sampler.hpp | 3 --- .../cost_effective_gradient_boosting.hpp | 4 ++-- src/treelearner/data_partition.hpp | 8 ++++---- src/treelearner/feature_histogram.hpp | 8 ++++---- src/treelearner/gpu_tree_learner.cpp | 4 ++-- src/treelearner/gpu_tree_learner.h | 12 ++++++------ src/treelearner/leaf_splits.hpp | 4 ++-- src/treelearner/parallel_tree_learner.h | 6 +++--- src/treelearner/serial_tree_learner.cpp | 10 +++++----- src/treelearner/serial_tree_learner.h | 14 +++++++------- src/treelearner/split_info.hpp | 4 ++-- .../voting_parallel_tree_learner.cpp | 3 +-- 77 files changed, 267 insertions(+), 293 deletions(-) diff --git a/R-package/src/lightgbm_R.cpp b/R-package/src/lightgbm_R.cpp index 14609272fa3..f3165e1fa1a 100644 --- a/R-package/src/lightgbm_R.cpp +++ b/R-package/src/lightgbm_R.cpp @@ -5,13 +5,6 @@ #include "lightgbm_R.h" -#include -#include -#include -#include -#include -#include - #include #include #include @@ -19,6 +12,13 @@ #include +#include +#include +#include +#include +#include +#include + #define COL_MAJOR (0) #define R_API_BEGIN() \ diff --git a/include/LightGBM/application.h b/include/LightGBM/application.h index 53f9732edea..911dedd7d94 100644 --- a/include/LightGBM/application.h +++ b/include/LightGBM/application.h @@ -5,12 +5,12 @@ #ifndef LIGHTGBM_APPLICATION_H_ #define LIGHTGBM_APPLICATION_H_ -#include -#include - #include #include +#include +#include + namespace LightGBM { class DatasetLoader; diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index fab69d9ba89..96ae6a8d641 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -5,6 +5,10 @@ #ifndef LIGHTGBM_BIN_H_ #define LIGHTGBM_BIN_H_ +#include +#include +#include + #include #include #include @@ -12,10 +16,6 @@ #include #include -#include -#include -#include - namespace LightGBM { enum BinType { diff --git a/include/LightGBM/boosting.h b/include/LightGBM/boosting.h index f456d798977..31bb430f0ae 100644 --- a/include/LightGBM/boosting.h +++ b/include/LightGBM/boosting.h @@ -5,14 +5,14 @@ #ifndef LIGHTGBM_BOOSTING_H_ #define LIGHTGBM_BOOSTING_H_ +#include +#include + #include #include #include #include -#include -#include - namespace LightGBM { /*! \brief forward declaration */ diff --git a/include/LightGBM/c_api.h b/include/LightGBM/c_api.h index 553982eefed..3fbccdac075 100644 --- a/include/LightGBM/c_api.h +++ b/include/LightGBM/c_api.h @@ -13,12 +13,12 @@ #ifndef LIGHTGBM_C_API_H_ #define LIGHTGBM_C_API_H_ +#include + #include #include #include -#include - typedef void* DatasetHandle; /*!< \brief Handle of dataset. */ typedef void* BoosterHandle; /*!< \brief Handle of booster. */ diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index bbb62727623..162c7583dc7 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -11,6 +11,11 @@ #ifndef LIGHTGBM_CONFIG_H_ #define LIGHTGBM_CONFIG_H_ +#include +#include +#include +#include + #include #include #include @@ -18,11 +23,6 @@ #include #include -#include -#include -#include -#include - namespace LightGBM { /*! 
\brief Types of tasks */ diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index bd0143daffd..e4c5dc56511 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -5,6 +5,13 @@ #ifndef LIGHTGBM_DATASET_H_ #define LIGHTGBM_DATASET_H_ +#include +#include +#include +#include +#include +#include + #include #include #include @@ -13,13 +20,6 @@ #include #include -#include -#include -#include -#include -#include -#include - namespace LightGBM { /*! \brief forward declaration */ diff --git a/include/LightGBM/dataset_loader.h b/include/LightGBM/dataset_loader.h index 88443d62472..8d5b20b481f 100644 --- a/include/LightGBM/dataset_loader.h +++ b/include/LightGBM/dataset_loader.h @@ -5,12 +5,12 @@ #ifndef LIGHTGBM_DATASET_LOADER_H_ #define LIGHTGBM_DATASET_LOADER_H_ +#include + #include #include #include -#include - namespace LightGBM { class DatasetLoader { diff --git a/include/LightGBM/feature_group.h b/include/LightGBM/feature_group.h index c21ad33b6a4..d949beec20e 100644 --- a/include/LightGBM/feature_group.h +++ b/include/LightGBM/feature_group.h @@ -5,14 +5,14 @@ #ifndef LIGHTGBM_FEATURE_GROUP_H_ #define LIGHTGBM_FEATURE_GROUP_H_ -#include -#include -#include - #include #include #include +#include +#include +#include + namespace LightGBM { class Dataset; diff --git a/include/LightGBM/metric.h b/include/LightGBM/metric.h index 56fec3aad77..61d9fc99ea8 100644 --- a/include/LightGBM/metric.h +++ b/include/LightGBM/metric.h @@ -5,9 +5,6 @@ #ifndef LIGHTGBM_METRIC_H_ #define LIGHTGBM_METRIC_H_ -#include -#include - #include #include #include @@ -15,6 +12,9 @@ #include #include +#include +#include + namespace LightGBM { /*! diff --git a/include/LightGBM/network.h b/include/LightGBM/network.h index 40373508eb5..32c24fe6984 100644 --- a/include/LightGBM/network.h +++ b/include/LightGBM/network.h @@ -5,14 +5,14 @@ #ifndef LIGHTGBM_NETWORK_H_ #define LIGHTGBM_NETWORK_H_ -#include -#include -#include - #include #include #include +#include +#include +#include + namespace LightGBM { /*! \brief forward declaration */ diff --git a/include/LightGBM/objective_function.h b/include/LightGBM/objective_function.h index 76b3f7145ef..5ea838dece2 100644 --- a/include/LightGBM/objective_function.h +++ b/include/LightGBM/objective_function.h @@ -5,13 +5,13 @@ #ifndef LIGHTGBM_OBJECTIVE_FUNCTION_H_ #define LIGHTGBM_OBJECTIVE_FUNCTION_H_ -#include -#include - #include #include #include +#include +#include + namespace LightGBM { /*! * \brief The interface of Objective Function. 
diff --git a/include/LightGBM/prediction_early_stop.h b/include/LightGBM/prediction_early_stop.h index 40db533325f..1d3e510981f 100644 --- a/include/LightGBM/prediction_early_stop.h +++ b/include/LightGBM/prediction_early_stop.h @@ -5,11 +5,11 @@ #ifndef LIGHTGBM_PREDICTION_EARLY_STOP_H_ #define LIGHTGBM_PREDICTION_EARLY_STOP_H_ +#include + #include #include -#include - namespace LightGBM { struct PredictionEarlyStopInstance { diff --git a/include/LightGBM/tree.h b/include/LightGBM/tree.h index f370bc74213..5ce3ff9b3eb 100644 --- a/include/LightGBM/tree.h +++ b/include/LightGBM/tree.h @@ -5,15 +5,15 @@ #ifndef LIGHTGBM_TREE_H_ #define LIGHTGBM_TREE_H_ +#include +#include + #include #include #include #include #include -#include -#include - namespace LightGBM { #define kCategoricalMask (1) diff --git a/include/LightGBM/tree_learner.h b/include/LightGBM/tree_learner.h index cdb3d2823b8..6c549a5ed71 100644 --- a/include/LightGBM/tree_learner.h +++ b/include/LightGBM/tree_learner.h @@ -5,13 +5,13 @@ #ifndef LIGHTGBM_TREE_LEARNER_H_ #define LIGHTGBM_TREE_LEARNER_H_ -#include -#include - #include #include #include +#include +#include + namespace LightGBM { using json11::Json; diff --git a/include/LightGBM/utils/array_args.h b/include/LightGBM/utils/array_args.h index a071247fb28..0183ecc22dd 100644 --- a/include/LightGBM/utils/array_args.h +++ b/include/LightGBM/utils/array_args.h @@ -5,13 +5,13 @@ #ifndef LIGHTGBM_UTILS_ARRAY_AGRS_H_ #define LIGHTGBM_UTILS_ARRAY_AGRS_H_ +#include +#include + #include #include #include -#include -#include - namespace LightGBM { /*! diff --git a/include/LightGBM/utils/common.h b/include/LightGBM/utils/common.h index 663ea1730d3..0e26ee84faa 100644 --- a/include/LightGBM/utils/common.h +++ b/include/LightGBM/utils/common.h @@ -5,6 +5,9 @@ #ifndef LIGHTGBM_UTILS_COMMON_FUN_H_ #define LIGHTGBM_UTILS_COMMON_FUN_H_ +#include +#include + #include #include #include diff --git a/include/LightGBM/utils/openmp_wrapper.h b/include/LightGBM/utils/openmp_wrapper.h index fdd4b3850fb..075c991371c 100644 --- a/include/LightGBM/utils/openmp_wrapper.h +++ b/include/LightGBM/utils/openmp_wrapper.h @@ -6,16 +6,16 @@ #define LIGHTGBM_OPENMP_WRAPPER_H_ #ifdef _OPENMP +#include + +#include + #include #include #include #include #include -#include - -#include - inline int OMP_NUM_THREADS() { int ret = 1; #pragma omp parallel diff --git a/include/LightGBM/utils/pipeline_reader.h b/include/LightGBM/utils/pipeline_reader.h index 4e07b8b3674..f02500c9751 100644 --- a/include/LightGBM/utils/pipeline_reader.h +++ b/include/LightGBM/utils/pipeline_reader.h @@ -5,6 +5,9 @@ #ifndef LIGHTGBM_UTILS_PIPELINE_READER_H_ #define LIGHTGBM_UTILS_PIPELINE_READER_H_ +#include +#include + #include #include #include @@ -13,9 +16,6 @@ #include #include -#include -#include - namespace LightGBM { /*! 
diff --git a/include/LightGBM/utils/text_reader.h b/include/LightGBM/utils/text_reader.h index 7aaf7f8153a..638bb268362 100644 --- a/include/LightGBM/utils/text_reader.h +++ b/include/LightGBM/utils/text_reader.h @@ -5,16 +5,16 @@ #ifndef LIGHTGBM_UTILS_TEXT_READER_H_ #define LIGHTGBM_UTILS_TEXT_READER_H_ +#include +#include +#include + #include #include #include #include #include -#include -#include -#include - namespace LightGBM { const size_t kGbs = size_t(1024) * 1024 * 1024; diff --git a/include/LightGBM/utils/threading.h b/include/LightGBM/utils/threading.h index dcf4f7608af..d293fc811eb 100644 --- a/include/LightGBM/utils/threading.h +++ b/include/LightGBM/utils/threading.h @@ -6,14 +6,14 @@ #ifndef LIGHTGBM_UTILS_THREADING_H_ #define LIGHTGBM_UTILS_THREADING_H_ -#include -#include -#include - #include #include #include +#include +#include +#include + namespace LightGBM { class Threading { diff --git a/src/application/application.cpp b/src/application/application.cpp index a46cf419c53..1b9eabf8a12 100644 --- a/src/application/application.cpp +++ b/src/application/application.cpp @@ -2,15 +2,6 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ - -#include -#include -#include -#include -#include -#include -#include - #include #include @@ -24,6 +15,14 @@ #include #include +#include +#include +#include +#include +#include +#include +#include + #include "predictor.hpp" #ifdef USE_CUDA diff --git a/src/application/predictor.hpp b/src/application/predictor.hpp index ab775d42913..1c56cfa5eb2 100644 --- a/src/application/predictor.hpp +++ b/src/application/predictor.hpp @@ -5,6 +5,12 @@ #ifndef LIGHTGBM_PREDICTOR_HPP_ #define LIGHTGBM_PREDICTOR_HPP_ +#include +#include +#include +#include +#include + #include #include #include @@ -15,12 +21,6 @@ #include #include -#include -#include -#include -#include -#include - namespace LightGBM { /*! diff --git a/src/boosting/dart.hpp b/src/boosting/dart.hpp index b9dca6a78f2..e2481e79772 100644 --- a/src/boosting/dart.hpp +++ b/src/boosting/dart.hpp @@ -5,14 +5,14 @@ #ifndef LIGHTGBM_BOOSTING_DART_H_ #define LIGHTGBM_BOOSTING_DART_H_ +#include + #include #include #include #include #include -#include - #include "gbdt.h" #include "score_updater.hpp" diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 6199d726df9..24264c3c175 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -2,13 +2,8 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. 
*/ - #include "gbdt.h" -#include -#include -#include - #include #include #include @@ -16,6 +11,10 @@ #include #include +#include +#include +#include + namespace LightGBM { #ifdef USE_CUDA diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index 420d5479e04..02476f810a8 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -5,6 +5,12 @@ #ifndef LIGHTGBM_BOOSTING_GBDT_H_ #define LIGHTGBM_BOOSTING_GBDT_H_ +#include +#include +#include +#include +#include + #include #include #include @@ -16,12 +22,6 @@ #include #include -#include -#include -#include -#include -#include - #include "score_updater.hpp" #ifdef USE_CUDA diff --git a/src/boosting/gbdt_model_text.cpp b/src/boosting/gbdt_model_text.cpp index 9ac4b269ac1..5ce26bca95c 100644 --- a/src/boosting/gbdt_model_text.cpp +++ b/src/boosting/gbdt_model_text.cpp @@ -2,17 +2,16 @@ * Copyright (c) 2017 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ - -#include -#include -#include - #include #include #include #include #include +#include +#include +#include + #include "gbdt.h" namespace LightGBM { diff --git a/src/boosting/gbdt_prediction.cpp b/src/boosting/gbdt_prediction.cpp index e906bc0aaca..b4711f7c01a 100644 --- a/src/boosting/gbdt_prediction.cpp +++ b/src/boosting/gbdt_prediction.cpp @@ -2,12 +2,12 @@ * Copyright (c) 2017 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ -#include "gbdt.h" - #include #include #include +#include "gbdt.h" + namespace LightGBM { void GBDT::PredictRaw(const double* features, double* output, const PredictionEarlyStopInstance* early_stop) const { diff --git a/src/boosting/goss.hpp b/src/boosting/goss.hpp index d3a3c6a344c..2af6dee14f6 100644 --- a/src/boosting/goss.hpp +++ b/src/boosting/goss.hpp @@ -5,6 +5,10 @@ #ifndef LIGHTGBM_BOOSTING_GOSS_H_ #define LIGHTGBM_BOOSTING_GOSS_H_ +#include +#include +#include + #include #include #include @@ -12,10 +16,6 @@ #include #include -#include -#include -#include - #include "gbdt.h" #include "score_updater.hpp" diff --git a/src/boosting/prediction_early_stop.cpp b/src/boosting/prediction_early_stop.cpp index 7eda08f00d6..7e21141f685 100644 --- a/src/boosting/prediction_early_stop.cpp +++ b/src/boosting/prediction_early_stop.cpp @@ -2,16 +2,15 @@ * Copyright (c) 2017 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ +#include + +#include #include #include #include #include -#include - -#include - namespace LightGBM { PredictionEarlyStopInstance CreateNone(const PredictionEarlyStopConfig&) { diff --git a/src/boosting/rf.hpp b/src/boosting/rf.hpp index 8bbc1a3ebee..e64bf6cb4d8 100644 --- a/src/boosting/rf.hpp +++ b/src/boosting/rf.hpp @@ -5,6 +5,9 @@ #ifndef LIGHTGBM_BOOSTING_RF_H_ #define LIGHTGBM_BOOSTING_RF_H_ +#include +#include + #include #include #include @@ -12,9 +15,6 @@ #include #include -#include -#include - #include "gbdt.h" #include "score_updater.hpp" diff --git a/src/boosting/score_updater.hpp b/src/boosting/score_updater.hpp index 231de245068..7446691a470 100644 --- a/src/boosting/score_updater.hpp +++ b/src/boosting/score_updater.hpp @@ -5,15 +5,15 @@ #ifndef LIGHTGBM_BOOSTING_SCORE_UPDATER_HPP_ #define LIGHTGBM_BOOSTING_SCORE_UPDATER_HPP_ -#include -#include - #include #include #include #include #include +#include +#include + namespace LightGBM { /*! 
* \brief Used to store and update score for data diff --git a/src/c_api.cpp b/src/c_api.cpp index 979ab104b74..4820690fd7e 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -2,15 +2,6 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ - -#include -#include -#include -#include -#include -#include -#include - #include #include diff --git a/src/io/bin.cpp b/src/io/bin.cpp index 9ead232fda0..367edaa3f7b 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -2,18 +2,17 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ - -#include -#include -#include -#include - #include #include #include #include +#include +#include +#include +#include + #include "dense_bin.hpp" #include "multi_val_dense_bin.hpp" #include "multi_val_sparse_bin.hpp" diff --git a/src/io/config.cpp b/src/io/config.cpp index 4e4d8dbc794..18c0562a676 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -2,15 +2,14 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ - -#include - #include #include #include #include +#include + namespace LightGBM { void Config::KV2Map(std::unordered_map* params, const char* kv) { diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 1001a9432ce..3d4e29be1fe 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -4,9 +4,12 @@ * license information. */ -#ifdef USE_CUDA -#include -#endif +#include + +#include +#include +#include +#include #include #include @@ -14,12 +17,9 @@ #include #include -#include - -#include -#include -#include -#include +#ifdef USE_CUDA +#include +#endif namespace LightGBM { diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index 33ce1df7eb0..c0b2edf1a8c 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -2,9 +2,6 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ - -#include - #include #include @@ -13,6 +10,8 @@ #include #include +#include + namespace LightGBM { using json11::Json; diff --git a/src/io/dense_bin.hpp b/src/io/dense_bin.hpp index 10d988b68e0..99feadf9f7f 100644 --- a/src/io/dense_bin.hpp +++ b/src/io/dense_bin.hpp @@ -6,6 +6,8 @@ #ifndef LIGHTGBM_IO_DENSE_BIN_HPP_ #define LIGHTGBM_IO_DENSE_BIN_HPP_ +#include + #include #include #include @@ -16,8 +18,6 @@ #include // LGBM_CUDA -#include - namespace LightGBM { template diff --git a/src/io/file_io.cpp b/src/io/file_io.cpp index 67a719de0a8..a205964287e 100644 --- a/src/io/file_io.cpp +++ b/src/io/file_io.cpp @@ -3,15 +3,14 @@ * Licensed under the MIT License. See LICENSE file in the project root for * license information. */ +#include + +#include #include #include #include -#include - -#include - #ifdef USE_HDFS #include #endif diff --git a/src/io/json11.cpp b/src/io/json11.cpp index a3fec7724b5..db21c6aab54 100644 --- a/src/io/json11.cpp +++ b/src/io/json11.cpp @@ -18,16 +18,15 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. 
*/ +#include + +#include #include #include #include #include -#include - -#include - namespace json11 { static const int max_depth = 200; diff --git a/src/io/metadata.cpp b/src/io/metadata.cpp index 9b540045650..ea0d5b08def 100644 --- a/src/io/metadata.cpp +++ b/src/io/metadata.cpp @@ -2,13 +2,12 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ +#include +#include #include #include -#include -#include - namespace LightGBM { Metadata::Metadata() { diff --git a/src/io/multi_val_dense_bin.hpp b/src/io/multi_val_dense_bin.hpp index d7c6599c381..7645530d774 100644 --- a/src/io/multi_val_dense_bin.hpp +++ b/src/io/multi_val_dense_bin.hpp @@ -5,14 +5,14 @@ #ifndef LIGHTGBM_IO_MULTI_VAL_DENSE_BIN_HPP_ #define LIGHTGBM_IO_MULTI_VAL_DENSE_BIN_HPP_ +#include +#include + #include #include #include #include -#include -#include - namespace LightGBM { template diff --git a/src/io/multi_val_sparse_bin.hpp b/src/io/multi_val_sparse_bin.hpp index 09c13420c66..ec3f64a11a0 100644 --- a/src/io/multi_val_sparse_bin.hpp +++ b/src/io/multi_val_sparse_bin.hpp @@ -5,14 +5,14 @@ #ifndef LIGHTGBM_IO_MULTI_VAL_SPARSE_BIN_HPP_ #define LIGHTGBM_IO_MULTI_VAL_SPARSE_BIN_HPP_ +#include +#include + #include #include #include #include -#include -#include - namespace LightGBM { template diff --git a/src/io/parser.cpp b/src/io/parser.cpp index c30da4305f9..df14ea87a99 100644 --- a/src/io/parser.cpp +++ b/src/io/parser.cpp @@ -2,6 +2,7 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ +#include "parser.hpp" #include #include @@ -10,8 +11,6 @@ #include #include -#include "parser.hpp" - namespace LightGBM { void GetStatistic(const char* str, int* comma_cnt, int* tab_cnt, int* colon_cnt) { diff --git a/src/io/parser.hpp b/src/io/parser.hpp index 43764a68e75..1cfde0635a5 100644 --- a/src/io/parser.hpp +++ b/src/io/parser.hpp @@ -5,14 +5,14 @@ #ifndef LIGHTGBM_IO_PARSER_HPP_ #define LIGHTGBM_IO_PARSER_HPP_ -#include -#include -#include - #include #include #include +#include +#include +#include + namespace LightGBM { class CSVParser: public Parser { diff --git a/src/io/sparse_bin.hpp b/src/io/sparse_bin.hpp index 730ea161a2a..c56cd6da99d 100644 --- a/src/io/sparse_bin.hpp +++ b/src/io/sparse_bin.hpp @@ -6,6 +6,10 @@ #ifndef LIGHTGBM_IO_SPARSE_BIN_HPP_ #define LIGHTGBM_IO_SPARSE_BIN_HPP_ +#include +#include +#include + #include #include #include @@ -13,10 +17,6 @@ #include #include -#include -#include -#include - namespace LightGBM { template diff --git a/src/io/tree.cpp b/src/io/tree.cpp index 759d334ef98..63641311787 100644 --- a/src/io/tree.cpp +++ b/src/io/tree.cpp @@ -2,17 +2,16 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ - -#include -#include -#include - #include #include #include #include +#include +#include +#include + namespace LightGBM { Tree::Tree(int max_leaves, bool track_branch_features) diff --git a/src/main.cpp b/src/main.cpp index 0a8931ae4ff..ef277ac0c1f 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -2,11 +2,10 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. 
*/ +#include #include -#include - #include "network/linkers.h" int main(int argc, char** argv) { diff --git a/src/metric/binary_metric.hpp b/src/metric/binary_metric.hpp index 5bde453cdba..00a51d6cd24 100644 --- a/src/metric/binary_metric.hpp +++ b/src/metric/binary_metric.hpp @@ -5,15 +5,15 @@ #ifndef LIGHTGBM_METRIC_BINARY_METRIC_HPP_ #define LIGHTGBM_METRIC_BINARY_METRIC_HPP_ +#include +#include +#include + #include #include #include #include -#include -#include -#include - namespace LightGBM { /*! diff --git a/src/metric/dcg_calculator.cpp b/src/metric/dcg_calculator.cpp index cd477612bdc..58843d89f9e 100644 --- a/src/metric/dcg_calculator.cpp +++ b/src/metric/dcg_calculator.cpp @@ -2,14 +2,13 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ +#include +#include #include #include #include -#include -#include - namespace LightGBM { /*! \brief Declaration for some static members */ diff --git a/src/metric/map_metric.hpp b/src/metric/map_metric.hpp index b373a02b56f..18539ee44ee 100644 --- a/src/metric/map_metric.hpp +++ b/src/metric/map_metric.hpp @@ -5,16 +5,16 @@ #ifndef LIGHTGBM_METRIC_MAP_METRIC_HPP_ #define LIGHTGBM_METRIC_MAP_METRIC_HPP_ -#include -#include -#include -#include - #include #include #include #include +#include +#include +#include +#include + namespace LightGBM { class MapMetric:public Metric { diff --git a/src/metric/multiclass_metric.hpp b/src/metric/multiclass_metric.hpp index f60588cac3f..59548cd3a79 100644 --- a/src/metric/multiclass_metric.hpp +++ b/src/metric/multiclass_metric.hpp @@ -5,14 +5,14 @@ #ifndef LIGHTGBM_METRIC_MULTICLASS_METRIC_HPP_ #define LIGHTGBM_METRIC_MULTICLASS_METRIC_HPP_ +#include +#include + #include #include #include #include -#include -#include - namespace LightGBM { /*! * \brief Metric for multiclass task. diff --git a/src/metric/rank_metric.hpp b/src/metric/rank_metric.hpp index d9227502009..3b3afb547eb 100644 --- a/src/metric/rank_metric.hpp +++ b/src/metric/rank_metric.hpp @@ -5,15 +5,15 @@ #ifndef LIGHTGBM_METRIC_RANK_METRIC_HPP_ #define LIGHTGBM_METRIC_RANK_METRIC_HPP_ -#include -#include -#include - #include #include #include #include +#include +#include +#include + namespace LightGBM { class NDCGMetric:public Metric { diff --git a/src/metric/regression_metric.hpp b/src/metric/regression_metric.hpp index 3085bc941b3..4d1a3662142 100644 --- a/src/metric/regression_metric.hpp +++ b/src/metric/regression_metric.hpp @@ -5,14 +5,14 @@ #ifndef LIGHTGBM_METRIC_REGRESSION_METRIC_HPP_ #define LIGHTGBM_METRIC_REGRESSION_METRIC_HPP_ +#include +#include + #include #include #include #include -#include -#include - namespace LightGBM { /*! * \brief Metric for regression task. diff --git a/src/metric/xentropy_metric.hpp b/src/metric/xentropy_metric.hpp index 1b86e60e640..bec611d28e5 100644 --- a/src/metric/xentropy_metric.hpp +++ b/src/metric/xentropy_metric.hpp @@ -5,16 +5,16 @@ #ifndef LIGHTGBM_METRIC_XENTROPY_METRIC_HPP_ #define LIGHTGBM_METRIC_XENTROPY_METRIC_HPP_ -#include -#include -#include -#include - #include #include #include #include +#include +#include +#include +#include + /* * Implements three related metrics: * diff --git a/src/network/linker_topo.cpp b/src/network/linker_topo.cpp index 102fdc993cd..1d7b2990f0e 100644 --- a/src/network/linker_topo.cpp +++ b/src/network/linker_topo.cpp @@ -2,15 +2,14 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. 
See LICENSE file in the project root for license information. */ +#include +#include +#include #include #include #include -#include -#include -#include - namespace LightGBM { diff --git a/src/network/linkers.h b/src/network/linkers.h index 5a91d40f73d..8ddbb902a15 100644 --- a/src/network/linkers.h +++ b/src/network/linkers.h @@ -5,6 +5,11 @@ #ifndef LIGHTGBM_NETWORK_LINKERS_H_ #define LIGHTGBM_NETWORK_LINKERS_H_ +#include +#include +#include +#include + #include #include #include @@ -13,11 +18,6 @@ #include #include -#include -#include -#include -#include - #ifdef USE_SOCKET #include "socket_wrapper.hpp" #endif diff --git a/src/network/linkers_socket.cpp b/src/network/linkers_socket.cpp index 708e3ae9bad..40c6de7aab6 100644 --- a/src/network/linkers_socket.cpp +++ b/src/network/linkers_socket.cpp @@ -4,6 +4,10 @@ */ #ifdef USE_SOCKET +#include +#include +#include + #include #include #include @@ -12,10 +16,6 @@ #include #include -#include -#include -#include - #include "linkers.h" namespace LightGBM { diff --git a/src/network/network.cpp b/src/network/network.cpp index cbc241d88c6..3976d72a692 100644 --- a/src/network/network.cpp +++ b/src/network/network.cpp @@ -2,14 +2,13 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ - -#include -#include - #include #include +#include +#include + #include "linkers.h" namespace LightGBM { diff --git a/src/network/socket_wrapper.hpp b/src/network/socket_wrapper.hpp index 87bd88f934a..70f9586b99c 100644 --- a/src/network/socket_wrapper.hpp +++ b/src/network/socket_wrapper.hpp @@ -6,13 +6,13 @@ #define LIGHTGBM_NETWORK_SOCKET_WRAPPER_HPP_ #ifdef USE_SOCKET +#include + #include #include #include #include -#include - #if defined(_WIN32) #ifdef _MSC_VER diff --git a/src/objective/binary_objective.hpp b/src/objective/binary_objective.hpp index dff3a9c1097..4861bd1b83f 100644 --- a/src/objective/binary_objective.hpp +++ b/src/objective/binary_objective.hpp @@ -5,15 +5,15 @@ #ifndef LIGHTGBM_OBJECTIVE_BINARY_OBJECTIVE_HPP_ #define LIGHTGBM_OBJECTIVE_BINARY_OBJECTIVE_HPP_ +#include +#include + #include #include #include #include #include -#include -#include - namespace LightGBM { /*! * \brief Objective function for binary classification diff --git a/src/objective/multiclass_objective.hpp b/src/objective/multiclass_objective.hpp index da604baeb49..c133e1f75fd 100644 --- a/src/objective/multiclass_objective.hpp +++ b/src/objective/multiclass_objective.hpp @@ -5,6 +5,9 @@ #ifndef LIGHTGBM_OBJECTIVE_MULTICLASS_OBJECTIVE_HPP_ #define LIGHTGBM_OBJECTIVE_MULTICLASS_OBJECTIVE_HPP_ +#include +#include + #include #include #include @@ -12,9 +15,6 @@ #include #include -#include -#include - #include "binary_objective.hpp" namespace LightGBM { diff --git a/src/objective/rank_objective.hpp b/src/objective/rank_objective.hpp index 730d358f030..1b20721e238 100644 --- a/src/objective/rank_objective.hpp +++ b/src/objective/rank_objective.hpp @@ -6,6 +6,9 @@ #ifndef LIGHTGBM_OBJECTIVE_RANK_OBJECTIVE_HPP_ #define LIGHTGBM_OBJECTIVE_RANK_OBJECTIVE_HPP_ +#include +#include + #include #include #include @@ -14,9 +17,6 @@ #include #include -#include -#include - namespace LightGBM { /*! 
diff --git a/src/objective/regression_objective.hpp b/src/objective/regression_objective.hpp index fe391f2c15f..53fa7020be5 100644 --- a/src/objective/regression_objective.hpp +++ b/src/objective/regression_objective.hpp @@ -5,14 +5,14 @@ #ifndef LIGHTGBM_OBJECTIVE_REGRESSION_OBJECTIVE_HPP_ #define LIGHTGBM_OBJECTIVE_REGRESSION_OBJECTIVE_HPP_ -#include -#include -#include - #include #include #include +#include +#include +#include + namespace LightGBM { #define PercentileFun(T, data_reader, cnt_data, alpha) \ diff --git a/src/objective/xentropy_objective.hpp b/src/objective/xentropy_objective.hpp index 3a145bd2cbb..cad2b8faafd 100644 --- a/src/objective/xentropy_objective.hpp +++ b/src/objective/xentropy_objective.hpp @@ -5,16 +5,16 @@ #ifndef LIGHTGBM_OBJECTIVE_XENTROPY_OBJECTIVE_HPP_ #define LIGHTGBM_OBJECTIVE_XENTROPY_OBJECTIVE_HPP_ +#include +#include +#include + #include #include #include #include #include -#include -#include -#include - /* * Implements gradients and hessians for the following point losses. * Target y is anything in interval [0, 1]. diff --git a/src/treelearner/col_sampler.hpp b/src/treelearner/col_sampler.hpp index 4876827eb39..cd288481255 100644 --- a/src/treelearner/col_sampler.hpp +++ b/src/treelearner/col_sampler.hpp @@ -6,9 +6,6 @@ #ifndef LIGHTGBM_TREELEARNER_COL_SAMPLER_HPP_ #define LIGHTGBM_TREELEARNER_COL_SAMPLER_HPP_ -#include -#include - #include #include #include diff --git a/src/treelearner/cost_effective_gradient_boosting.hpp b/src/treelearner/cost_effective_gradient_boosting.hpp index fda2bffc820..fad966e6487 100644 --- a/src/treelearner/cost_effective_gradient_boosting.hpp +++ b/src/treelearner/cost_effective_gradient_boosting.hpp @@ -5,13 +5,13 @@ #ifndef LIGHTGBM_TREELEARNER_COST_EFFECTIVE_GRADIENT_BOOSTING_HPP_ #define LIGHTGBM_TREELEARNER_COST_EFFECTIVE_GRADIENT_BOOSTING_HPP_ -#include - #include #include #include #include +#include + #include "data_partition.hpp" #include "serial_tree_learner.h" #include "split_info.hpp" diff --git a/src/treelearner/data_partition.hpp b/src/treelearner/data_partition.hpp index bdae960c1d9..01c5d2606e7 100644 --- a/src/treelearner/data_partition.hpp +++ b/src/treelearner/data_partition.hpp @@ -5,15 +5,15 @@ #ifndef LIGHTGBM_TREELEARNER_DATA_PARTITION_HPP_ #define LIGHTGBM_TREELEARNER_DATA_PARTITION_HPP_ -#include -#include -#include - #include #include #include #include +#include +#include +#include + namespace LightGBM { /*! * \brief DataPartition is used to store the the partition of data on tree. 
diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp index 4dd5f29af0a..8916ee48fd4 100644 --- a/src/treelearner/feature_histogram.hpp +++ b/src/treelearner/feature_histogram.hpp @@ -6,6 +6,10 @@ #ifndef LIGHTGBM_TREELEARNER_FEATURE_HISTOGRAM_HPP_ #define LIGHTGBM_TREELEARNER_FEATURE_HISTOGRAM_HPP_ +#include +#include +#include + #include #include #include @@ -13,10 +17,6 @@ #include #include -#include -#include -#include - #include "monotone_constraints.hpp" #include "split_info.hpp" diff --git a/src/treelearner/gpu_tree_learner.cpp b/src/treelearner/gpu_tree_learner.cpp index 42967e138c7..fad02e1c044 100644 --- a/src/treelearner/gpu_tree_learner.cpp +++ b/src/treelearner/gpu_tree_learner.cpp @@ -6,12 +6,12 @@ #include "gpu_tree_learner.h" -#include - #include #include #include +#include + #include "../io/dense_bin.hpp" #define GPU_DEBUG 0 diff --git a/src/treelearner/gpu_tree_learner.h b/src/treelearner/gpu_tree_learner.h index c0607cbe4a2..598e8d40ac9 100644 --- a/src/treelearner/gpu_tree_learner.h +++ b/src/treelearner/gpu_tree_learner.h @@ -5,6 +5,12 @@ #ifndef LIGHTGBM_TREELEARNER_GPU_TREE_LEARNER_H_ #define LIGHTGBM_TREELEARNER_GPU_TREE_LEARNER_H_ +#include +#include +#include +#include +#include + #include #include #include @@ -12,12 +18,6 @@ #include #include -#include -#include -#include -#include -#include - #include "data_partition.hpp" #include "feature_histogram.hpp" #include "leaf_splits.hpp" diff --git a/src/treelearner/leaf_splits.hpp b/src/treelearner/leaf_splits.hpp index 5c94846eb7b..b0a753eafe1 100644 --- a/src/treelearner/leaf_splits.hpp +++ b/src/treelearner/leaf_splits.hpp @@ -5,11 +5,11 @@ #ifndef LIGHTGBM_TREELEARNER_LEAF_SPLITS_HPP_ #define LIGHTGBM_TREELEARNER_LEAF_SPLITS_HPP_ +#include + #include #include -#include - #include "data_partition.hpp" namespace LightGBM { diff --git a/src/treelearner/parallel_tree_learner.h b/src/treelearner/parallel_tree_learner.h index 5fbad7352e0..2fdf542d421 100644 --- a/src/treelearner/parallel_tree_learner.h +++ b/src/treelearner/parallel_tree_learner.h @@ -5,13 +5,13 @@ #ifndef LIGHTGBM_TREELEARNER_PARALLEL_TREE_LEARNER_H_ #define LIGHTGBM_TREELEARNER_PARALLEL_TREE_LEARNER_H_ +#include +#include + #include #include #include -#include -#include - #include "gpu_tree_learner.h" #include "serial_tree_learner.h" #include "cuda_tree_learner.h" diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 9aac1bdf1fd..e5b6626a6bd 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -4,16 +4,16 @@ */ #include "serial_tree_learner.h" -#include -#include -#include -#include - #include #include #include #include +#include +#include +#include +#include + #include "cost_effective_gradient_boosting.hpp" namespace LightGBM { diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 6fc0c2cb968..367c262192c 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -5,13 +5,6 @@ #ifndef LIGHTGBM_TREELEARNER_SERIAL_TREE_LEARNER_H_ #define LIGHTGBM_TREELEARNER_SERIAL_TREE_LEARNER_H_ -#include -#include -#include -#include -#include -#include - #include #include #include @@ -19,6 +12,13 @@ #include #include +#include +#include +#include +#include +#include +#include + #include "col_sampler.hpp" #include "data_partition.hpp" #include "feature_histogram.hpp" diff --git a/src/treelearner/split_info.hpp b/src/treelearner/split_info.hpp index 
72dd3fa324c..492434d5160 100644 --- a/src/treelearner/split_info.hpp +++ b/src/treelearner/split_info.hpp @@ -5,6 +5,8 @@ #ifndef LIGHTGBM_TREELEARNER_SPLIT_INFO_HPP_ #define LIGHTGBM_TREELEARNER_SPLIT_INFO_HPP_ +#include + #include #include #include @@ -12,8 +14,6 @@ #include #include -#include - namespace LightGBM { /*! diff --git a/src/treelearner/voting_parallel_tree_learner.cpp b/src/treelearner/voting_parallel_tree_learner.cpp index 043bf0e11b3..58f5b88d6b0 100644 --- a/src/treelearner/voting_parallel_tree_learner.cpp +++ b/src/treelearner/voting_parallel_tree_learner.cpp @@ -2,13 +2,12 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ +#include #include #include #include -#include - #include "parallel_tree_learner.h" namespace LightGBM { From 55f24ccbb3b3d2e8f598f8d2616c217f556e7084 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Thu, 11 Jun 2020 19:11:12 +0000 Subject: [PATCH 067/119] Missing change from previous rebase --- src/c_api.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/c_api.cpp b/src/c_api.cpp index 4820690fd7e..38957a13fc2 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -642,12 +642,6 @@ const char* LGBM_GetLastError() { return LastErrorMsg(); } -int LGBM_RegisterLogCallback(void (*callback)(const char*)) { - API_BEGIN(); - Log::ResetCallBack(callback); - API_END(); -} - int LGBM_GetDeviceType() { #ifdef USE_GPU return 1; From 8e028f3319ed2819814b39e28facaa33b569cba3 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Thu, 11 Jun 2020 19:38:52 +0000 Subject: [PATCH 068/119] Minor cleanup and removal of development scripts. --- CMakeLists.txt | 8 ++------ build_LGBM.232.sh | 7 ------- install_LGBM.232.sh | 7 ------- src/boosting/gbdt.cpp | 5 ----- test_LGBM.232.sh | 5 ----- 5 files changed, 2 insertions(+), 30 deletions(-) delete mode 100755 build_LGBM.232.sh delete mode 100755 install_LGBM.232.sh delete mode 100755 test_LGBM.232.sh diff --git a/CMakeLists.txt b/CMakeLists.txt index 7bcd068f3ca..3e237da69c9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -173,9 +173,6 @@ if(USE_CUDA) -DIGNORE_INDICES ) - #string(REPLACE ";" " " BASE_DEFINES "${BASE_DEFINES}") - #string(REPLACE ";" " " ALLFEATS_DEFINES "${ALLFEATS_DEFINES}") - #string(REPLACE ";" " " FULLDATA_DEFINES "${FULLDATA_DEFINES}") message(STATUS ALLFEATS_DEFINES: ${ALLFEATS_DEFINES}) message(STATUS FULLDATA_DEFINES: ${FULLDATA_DEFINES}) @@ -193,7 +190,6 @@ if(USE_CUDA) ) endfunction() - #foreach (hsize 16 64 256) foreach (hsize _16_64_256) add_histogram("${hsize}" "_sp_const" "True" "1" "${BASE_DEFINES}") add_histogram("${hsize}" "_sp" "True" "0" "${BASE_DEFINES}") @@ -310,9 +306,9 @@ file(GLOB SOURCES src/objective/*.cpp src/network/*.cpp src/treelearner/*.cpp -#ifdef USE_CUDA +if(USE_CUDA) src/treelearner/*.cu -#endif +endif(USE_CUDA) ) add_executable(lightgbm src/main.cpp ${SOURCES}) diff --git a/build_LGBM.232.sh b/build_LGBM.232.sh deleted file mode 100755 index f785d6556e6..00000000000 --- a/build_LGBM.232.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/usr/bin/bash -rm -rf build -mkdir build -cd build -cmake -DUSE_CUDA=1 .. -#cmake .. -make -j40 diff --git a/install_LGBM.232.sh b/install_LGBM.232.sh deleted file mode 100755 index 7af586f4722..00000000000 --- a/install_LGBM.232.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/usr/bin/bash -cd python-package -python setup.py bdist_wheel -pip uninstall -y lightgbm -cd dist -pip install lightgbm-*.whl -cd ../.. 
diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 24264c3c175..854e2af240c 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -68,11 +68,6 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective if (config_->device_type == std::string("cuda")) { // LGBM_config_::current_device=lgbm_device_cuda; moved to application.cpp LGBM_config_::current_learner=use_cuda_learner; - - /* Following are needed to ensure bagging required by the CUDA implementation */ -// if (config_->bagging_fraction == 1.0){config_->bagging_fraction = 0.8;} moved to application.cpp -// if (config_->bagging_freq == 0) {config_->bagging_freq = 1;} moved to application.cpp - } #endif diff --git a/test_LGBM.232.sh b/test_LGBM.232.sh deleted file mode 100755 index cd5146f959f..00000000000 --- a/test_LGBM.232.sh +++ /dev/null @@ -1,5 +0,0 @@ -python -m unittest tests/python_package_test/test_basic.py -python -m unittest tests/python_package_test/test_consistency.py -python -m unittest tests/python_package_test/test_engine.py -python -m unittest tests/python_package_test/test_plotting.py -python -m unittest tests/python_package_test/test_sklearn.py From f4725e1a1307a565422152f1411ee5f598fa1371 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 15 Jun 2020 17:20:04 +0000 Subject: [PATCH 069/119] Only set gpu_use_dp on by default for CUDA. Other minor change. --- src/boosting/gbdt.cpp | 5 ++++- src/io/config_auto.cpp | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 854e2af240c..546da0c0cbe 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -270,7 +270,10 @@ void GBDT::Bagging(int iter) { tmp_hessians_.resize(total_size); } - tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(), bag_data_cnt_, false); + tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(), + bag_data_cnt_, false); + tree_learner_->SetBaggingData(tmp_subset_.get(), bag_data_indices_.data(), + bag_data_cnt_); tree_learner_->ResetTrainingData(tmp_subset_.get(), is_constant_hessian_); } diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index 9408a97c70f..b0cd57deb69 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -613,9 +613,9 @@ void Config::GetMembersFromString(const std::unordered_map 0); From 0e84c152869d0d5b9b6440718cb9dbc770e7b874 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 15 Jun 2020 18:03:32 +0000 Subject: [PATCH 070/119] Fix python lint indentation problem. 
--- tests/python_package_test/test_engine.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 51ab32a239b..791ef94a4be 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -991,7 +991,7 @@ def train_and_get_predictions(features, labels): 'min_data': 5, } if lgb.get_device_type() == 2: - lgb_params["device"] = "cuda" + lgb_params["device"] = "cuda" gbm = lgb.train( params=lgb_params, train_set=dataset, @@ -1723,10 +1723,10 @@ def train_booster(params=params_obj_verbose, **kwargs): params_obj_class_3_verbose["device"] = "cuda" params_obj_class_1_verbose = {'objective': obj_multi_alias, 'num_class': 1, 'verbose': -1} if lgb.get_device_type() == 2: - params_obj_class_1_verbose["device"] = "cuda" + params_obj_class_1_verbose["device"] = "cuda" params_obj_verbose = {'objective': obj_multi_alias, 'verbose': -1} if lgb.get_device_type() == 2: - params_obj_verbose["device"] = "cuda" + params_obj_verbose["device"] = "cuda" # multiclass default metric res = get_cv_result(params_obj_class_3_verbose) self.assertEqual(len(res), 2) From ccf7602e7cb824d26128801f0329589524e8d482 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 15 Jun 2020 18:36:53 +0000 Subject: [PATCH 071/119] More python lint issues. --- python-package/setup.py | 2 +- tests/python_package_test/test_engine.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/python-package/setup.py b/python-package/setup.py index eca56783713..1e0500f180c 100644 --- a/python-package/setup.py +++ b/python-package/setup.py @@ -225,7 +225,7 @@ def initialize_options(self): self.openmp_library = None self.mpi = 0 self.hdfs = 0 - #self.precompile = 0 #TODO: revert this + # self.precompile = 0 # TODO: revert this self.precompile = 1 self.nomp = 0 self.bit32 = 0 diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 791ef94a4be..b5de6a9a4c7 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1429,6 +1429,7 @@ def test_metrics(self): def get_cv_result(params=params_obj_verbose, **kwargs): return lgb.cv(params, lgb_train, num_boost_round=2, verbose_eval=False, **kwargs) + def train_booster(params=params_obj_verbose, **kwargs): lgb.train(params, lgb_train, num_boost_round=2, From c41771585212b2535673b96f3024633224a9e97d Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 15 Jun 2020 21:35:40 +0000 Subject: [PATCH 072/119] Big lint cleanup - more to come. 
--- include/LightGBM/application.h | 2 +- include/LightGBM/cuda/cuda_utils.h | 14 +- include/LightGBM/cuda/vector_cudahost.h | 71 ++- include/LightGBM/tree_learner.h | 2 +- include/LightGBM/utils/common.h | 2 - src/application/application.cpp | 11 +- src/boosting/gbdt.cpp | 20 +- src/boosting/gbdt.h | 10 +- src/c_api.cpp | 13 +- src/io/config.cpp | 2 +- src/io/dataset.cpp | 8 +- src/io/dense_bin.hpp | 18 +- src/treelearner/cuda_kernel_launcher.cu | 4 +- src/treelearner/cuda_kernel_launcher.h | 50 +- src/treelearner/cuda_tree_learner.cpp | 257 +++++----- src/treelearner/cuda_tree_learner.h | 484 +++++++++--------- .../data_parallel_tree_learner.cpp | 6 +- .../feature_parallel_tree_learner.cpp | 6 +- src/treelearner/gpu_tree_learner.cpp | 2 +- src/treelearner/gpu_tree_learner.h | 2 +- .../kernels/histogram_16_64_256.cu | 32 +- .../kernels/histogram_16_64_256.hu | 4 +- src/treelearner/parallel_tree_learner.h | 6 +- src/treelearner/serial_tree_learner.cpp | 8 +- src/treelearner/serial_tree_learner.h | 10 +- src/treelearner/tree_learner.cpp | 4 +- .../voting_parallel_tree_learner.cpp | 6 +- 27 files changed, 513 insertions(+), 541 deletions(-) diff --git a/include/LightGBM/application.h b/include/LightGBM/application.h index 911dedd7d94..7ce8956a555 100644 --- a/include/LightGBM/application.h +++ b/include/LightGBM/application.h @@ -38,7 +38,7 @@ class Application { // LGBM_CUDA /*! \brief call to get configuration */ - Config GetConfig() {return config_ ;} ; + Config GetConfig() {return config_ ;} private: /*! \brief Load parameters from command line and config file*/ diff --git a/include/LightGBM/cuda/cuda_utils.h b/include/LightGBM/cuda/cuda_utils.h index e57d3746a21..7ff7b28c8f1 100644 --- a/include/LightGBM/cuda/cuda_utils.h +++ b/include/LightGBM/cuda/cuda_utils.h @@ -5,7 +5,7 @@ #ifndef LGBM_CUDA_UTILS_H #define LGBM_CUDA_UTILS_H -//LGBM_CUDA +// LGBM_CUDA #ifdef USE_CUDA @@ -14,13 +14,11 @@ #include #define CUDASUCCESS_OR_FATAL(ans) { gpuAssert((ans), __FILE__, __LINE__); } -inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) - { - LightGBM::Log::Fatal("CUDA_RUNTIME: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } +inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) { + if (code != cudaSuccess) { + LightGBM::Log::Fatal("CUDA_RUNTIME: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } } #endif /* USE_CUDA */ diff --git a/include/LightGBM/cuda/vector_cudahost.h b/include/LightGBM/cuda/vector_cudahost.h index 61d6e464970..b964fa4ad1f 100644 --- a/include/LightGBM/cuda/vector_cudahost.h +++ b/include/LightGBM/cuda/vector_cudahost.h @@ -9,7 +9,7 @@ #include #include -//LGBM_CUDA +// LGBM_CUDA namespace LightGBM { @@ -22,60 +22,55 @@ namespace LightGBM { #define use_cuda_learner 2 class LGBM_config_ { - public: - static int current_device; // Default: lgbm_device_cpu - static int current_learner; // Default: use_cpu_learner + public: + static int current_device; // Default: lgbm_device_cpu + static int current_learner; // Default: use_cpu_learner }; -} // namespace LightGBM +} // namespace LightGBM template struct CHAllocator { - typedef T value_type; - CHAllocator() {} - template CHAllocator(const CHAllocator& other); - T* allocate(std::size_t n) - { - T* ptr; - if (n == 0) return NULL; - #ifdef USE_CUDA - if (LightGBM::LGBM_config_::current_device == lgbm_device_cuda){ - cudaError_t ret= cudaHostAlloc(&ptr, n*sizeof(T), 
cudaHostAllocPortable); - if (ret != cudaSuccess){ -fprintf(stderr, " TROUBLE: defaulting to malloc in CHAllocator!!!\n"); fflush(stderr); - ptr = (T*) malloc(n*sizeof(T)); - } - } - else{ - ptr = (T*) malloc(n*sizeof(T)); + typedef T value_type; + CHAllocator() {} + template CHAllocator(const CHAllocator& other); + T* allocate(std::size_t n) { + T* ptr; + if (n == 0) return NULL; + #ifdef USE_CUDA + if (LightGBM::LGBM_config_::current_device == lgbm_device_cuda) { + cudaError_t ret = cudaHostAlloc(&ptr, n*sizeof(T), cudaHostAllocPortable); + if (ret != cudaSuccess) { + fprintf(stderr, " TROUBLE: defaulting to malloc in CHAllocator!!!\n"); fflush(stderr); + ptr = reinterpret_cast(malloc(n*sizeof(T))); + } + } else { + ptr = reinterpret_cast(malloc(n*sizeof(T))); } #else - ptr = (T*) malloc(n*sizeof(T)); + ptr = reinterpret_cast(malloc(n*sizeof(T))); #endif - return ptr; - } + return ptr; + } - void deallocate(T* p, std::size_t n) - { + void deallocate(T* p, std::size_t n) { (void)n; // UNUSED - if (p==NULL) return; + if (p == NULL) return; #ifdef USE_CUDA - if (LightGBM::LGBM_config_::current_device == lgbm_device_cuda){ - cudaPointerAttributes attributes; - cudaPointerGetAttributes (&attributes, p); - if ((attributes.type == cudaMemoryTypeHost) && (attributes.devicePointer != NULL)){ - cudaFreeHost(p); - } - } - else{ + if (LightGBM::LGBM_config_::current_device == lgbm_device_cuda) { + cudaPointerAttributes attributes; + cudaPointerGetAttributes (&attributes, p); + if ((attributes.type == cudaMemoryTypeHost) && (attributes.devicePointer != NULL)) { + cudaFreeHost(p); + } + } else { free(p); } #else - free(p); + free(p); #endif } - }; template bool operator==(const CHAllocator&, const CHAllocator&); diff --git a/include/LightGBM/tree_learner.h b/include/LightGBM/tree_learner.h index 6c549a5ed71..2ea30ac63b2 100644 --- a/include/LightGBM/tree_learner.h +++ b/include/LightGBM/tree_learner.h @@ -57,7 +57,7 @@ class TreeLearner { * \param is_constant_hessian True if all hessians share the same value * \return A trained tree */ - virtual Tree* Train(const score_t* gradients, const score_t* hessians, bool is_constant_hessian, Json& forced_split_json) = 0; + virtual Tree* Train(const score_t* gradients, const score_t* hessians, bool is_constant_hessian, const Json& forced_split_json) = 0; /*! * \brief use an existing tree to fit the new gradients and hessians. 
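The CHAllocator template introduced in include/LightGBM/cuda/vector_cudahost.h above is a drop-in allocator for std::vector: when LGBM_config_::current_device selects the CUDA device it obtains pinned (page-locked) host memory via cudaHostAlloc(), and it falls back to plain malloc() otherwise, which is what allows the gradient and hessian buffers to be streamed to the GPU with cudaMemcpyAsync(). The following is a minimal usage sketch, not part of the patches; it assumes a USE_CUDA build with the patched headers on the include path and linking against lib_lightgbm (which defines the LGBM_config_ statics), and the buffer name and size are illustrative only.

    // Minimal sketch: std::vector backed by pinned host memory via CHAllocator.
    #include <cstddef>
    #include <vector>
    #include <LightGBM/cuda/vector_cudahost.h>  // CHAllocator, LGBM_config_, lgbm_device_cuda

    int main() {
      // Route allocations through cudaHostAlloc(); this static is defined inside lib_lightgbm.
      LightGBM::LGBM_config_::current_device = lgbm_device_cuda;

      const std::size_t num_data = 1024;  // illustrative size
      // Pinned host buffer; deallocate() later releases it with cudaFreeHost().
      std::vector<float, CHAllocator<float>> gradients(num_data, 0.0f);

      // gradients.data() can now be handed to cudaMemcpyAsync() for asynchronous host-to-device copies.
      return 0;
    }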
diff --git a/include/LightGBM/utils/common.h b/include/LightGBM/utils/common.h index 0e26ee84faa..bdc769e5222 100644 --- a/include/LightGBM/utils/common.h +++ b/include/LightGBM/utils/common.h @@ -30,8 +30,6 @@ #include #pragma intrinsic(_BitScanReverse) #endif -#include -#include #if defined(_MSC_VER) #include diff --git a/src/application/application.cpp b/src/application/application.cpp index 1b9eabf8a12..5c61b323654 100644 --- a/src/application/application.cpp +++ b/src/application/application.cpp @@ -43,17 +43,16 @@ Application::Application(int argc, char** argv) { Log::Fatal("No training/prediction data, application quit"); } -//LGBM_CUDA +// LGBM_CUDA #ifdef USE_CUDA - if (config_.device_type == std::string("cuda")){ - LightGBM::LGBM_config_::current_device=lgbm_device_cuda; + if (config_.device_type == std::string("cuda")) { + LightGBM::LGBM_config_::current_device = lgbm_device_cuda; config_.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ - if (config_.bagging_fraction == 1.0){config_.bagging_fraction = 0.8;} - if (config_.bagging_freq == 0) {config_.bagging_freq = 1;} + if (config_.bagging_fraction == 1.0) { config_.bagging_fraction = 0.8; } + if (config_.bagging_freq == 0) { config_.bagging_freq = 1; } } #endif - } Application::~Application() { diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 546da0c0cbe..d102f6aedc1 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -18,8 +18,8 @@ namespace LightGBM { #ifdef USE_CUDA -int LGBM_config_::current_device=lgbm_device_cpu; -int LGBM_config_::current_learner=use_cpu_learner; +int LGBM_config_::current_device = lgbm_device_cpu; +int LGBM_config_::current_learner = use_cpu_learner; #endif GBDT::GBDT() @@ -66,8 +66,8 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective // LGBM_CUDA #ifdef USE_CUDA if (config_->device_type == std::string("cuda")) { - // LGBM_config_::current_device=lgbm_device_cuda; moved to application.cpp - LGBM_config_::current_learner=use_cuda_learner; + // LGBM_config_::current_device = lgbm_device_cuda; moved to application.cpp + LGBM_config_::current_learner = use_cuda_learner; } #endif @@ -260,7 +260,7 @@ void GBDT::Bagging(int iter) { // set bagging data to tree learner if (!is_use_subset_) { tree_learner_->SetBaggingData(nullptr, bag_data_indices_.data(), bag_data_cnt_); - } else { // LGBM_CUDA + } else { // LGBM_CUDA // NEW get subset bool resized= tmp_subset_->ReSize(bag_data_cnt_); @@ -284,7 +284,7 @@ void GBDT::Train(int snapshot_freq, const std::string& model_output_path) { Common::FunctionTimer fun_timer("GBDT::Train", global_timer); bool is_finished = false; - //LGBM_CUDA + // LGBM_CUDA auto start_time = std::chrono::steady_clock::now(); for (int iter = 0; iter < config_->num_iterations && !is_finished; ++iter) { @@ -437,8 +437,8 @@ bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { #pragma omp parallel for schedule(static) // LGBM_CUDA for (int i = 0; i < bag_data_cnt_; ++i) { - tmp_grad[i] = grad[bag_data_indices_[i]]; // LGBM_CUDA - tmp_hess[i] = hess[bag_data_indices_[i]]; // LGBM_CUDA + tmp_grad[i] = grad[bag_data_indices_[i]]; // LGBM_CUDA + tmp_hess[i] = hess[bag_data_indices_[i]]; // LGBM_CUDA } } @@ -509,7 +509,7 @@ bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { - if (config_->device_type == std::string("cuda")){ //LGBM_CUDA + if (config_->device_type 
== std::string("cuda")) { // LGBM_CUDA return TrainOneIterCUDA(gradients, hessians); } @@ -965,7 +965,7 @@ void GBDT::ResetBaggingConfig(const Config* config, bool is_change_dataset) { } } else { bag_data_cnt_ = num_data_; - if (config_->device_type == std::string("cuda")){ // LGBM_CUDA + if (config_->device_type == std::string("cuda")) { // LGBM_CUDA if (tmp_subset_ == nullptr){ tmp_subset_.reset(new Dataset(bag_data_cnt_)); tmp_subset_->CopyFeatureMapperFrom(train_data_); diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index 02476f810a8..99bf64a6fb0 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -25,7 +25,7 @@ #include "score_updater.hpp" #ifdef USE_CUDA -#include //LGBM_CUDA +#include // LGBM_CUDA #endif namespace LightGBM { @@ -478,11 +478,11 @@ class GBDT : public GBDTBase { #ifdef USE_CUDA /*! \brief First order derivative of training data */ - std::vector> gradients_; // LGBM_CUDA - std::vector> tmp_gradients_; // LGBM_CUDA + std::vector> gradients_; // LGBM_CUDA + std::vector> tmp_gradients_; // LGBM_CUDA /*! \brief Second order derivative of training data */ - std::vector> hessians_; // LGBM_CUDA - std::vector> tmp_hessians_; // LGBM_CUDA + std::vector> hessians_; // LGBM_CUDA + std::vector> tmp_hessians_; // LGBM_CUDA #else /*! \brief First order derivative of training data */ std::vector> gradients_; diff --git a/src/c_api.cpp b/src/c_api.cpp index 38957a13fc2..0ce92342fb6 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -43,16 +43,15 @@ inline int LGBM_APIHandleException(const std::string& ex) { return -1; } -//LGBM_CUDA -inline void AdditionalConfig(Config *config) -{ +// LGBM_CUDA +inline void AdditionalConfig(Config *config) { #ifdef USE_CUDA - if (config->device_type == std::string("cuda")){ - LightGBM::LGBM_config_::current_device=lgbm_device_cuda; + if (config->device_type == std::string("cuda")) { + LightGBM::LGBM_config_::current_device = lgbm_device_cuda; config->is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ - if (config->bagging_fraction == 1.0){config->bagging_fraction = 0.8;} - if (config->bagging_freq == 0) {config->bagging_freq = 1;} + if (config->bagging_fraction == 1.0) { config->bagging_fraction = 0.8; } + if (config->bagging_freq == 0) { config->bagging_freq = 1; } } #else (void)(config); // UNUSED diff --git a/src/io/config.cpp b/src/io/config.cpp index 18c0562a676..ed643204c91 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -126,7 +126,7 @@ void GetDeviceType(const std::unordered_map& params, s *device_type = "cpu"; } else if (value == std::string("gpu")) { *device_type = "gpu"; - } else if (value == std::string("cuda")) { // LGBM_CUDA + } else if (value == std::string("cuda")) { // LGBM_CUDA *device_type = "cuda"; } else { Log::Fatal("Unknown device type %s", value.c_str()); diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 3d4e29be1fe..df18ef6f838 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -345,16 +345,16 @@ void Dataset::Construct(std::vector>* bin_mappers, } auto features_in_group = NoGroup(used_features); -//LGBM_CUDA +// LGBM_CUDA #ifdef USE_CUDA - if (io_config.device_type == std::string("cuda")){ - LightGBM::LGBM_config_::current_device=lgbm_device_cuda; + if (io_config.device_type == std::string("cuda")) { + LightGBM::LGBM_config_::current_device = lgbm_device_cuda; } #endif std::vector group_is_multi_val(used_features.size(), 0); if (io_config.enable_bundle && !used_features.empty()) { - bool lgbm_is_gpu_used = io_config.device_type == 
std::string("gpu") || io_config.device_type == std::string("cuda"); // LGBM_CUDA + bool lgbm_is_gpu_used = io_config.device_type == std::string("gpu") || io_config.device_type == std::string("cuda"); // LGBM_CUDA features_in_group = FastFeatureBundling(*bin_mappers, sample_non_zero_indices, sample_values, diff --git a/src/io/dense_bin.hpp b/src/io/dense_bin.hpp index 99feadf9f7f..89f29a99bdc 100644 --- a/src/io/dense_bin.hpp +++ b/src/io/dense_bin.hpp @@ -13,10 +13,10 @@ #include #ifdef USE_CUDA -#include // LGBM_CUDA +#include // LGBM_CUDA #endif -#include // LGBM_CUDA +#include // LGBM_CUDA namespace LightGBM { @@ -368,7 +368,7 @@ class DenseBin : public Bin { data_size_t num_data() const override { return num_data_; } - // LGBM_CUDA + // LGBM_CUDA void* get_data() override { return data_.data(); } void FinishLoad() override { @@ -464,16 +464,16 @@ class DenseBin : public Bin { DenseBin* Clone() override; private: - data_size_t num_data_; + data_size_t num_data_; #ifdef USE_CUDA - std::vector> data_; // LGBM_CUDA + std::vector> data_; // LGBM_CUDA #else - std::vector> data_; + std::vector> data_; #endif - std::vector buf_; + std::vector buf_; - DenseBin(const DenseBin& other) - : num_data_(other.num_data_), data_(other.data_) {} + DenseBin(const DenseBin& other) + : num_data_(other.num_data_), data_(other.data_) {} }; template diff --git a/src/treelearner/cuda_kernel_launcher.cu b/src/treelearner/cuda_kernel_launcher.cu index dad8b6c563b..6e3149dae06 100644 --- a/src/treelearner/cuda_kernel_launcher.cu +++ b/src/treelearner/cuda_kernel_launcher.cu @@ -8,7 +8,7 @@ using namespace LightGBM; void cuda_histogram( - int histogram_size, + int histogram_size, data_size_t leaf_num_data, data_size_t num_data, bool use_all_features, @@ -25,7 +25,7 @@ score_t arg6_const, char* arg7, volatile int* arg8, - void* arg9, + void* arg9, size_t exp_workgroups_per_feature) { if (histogram_size == 16) { diff --git a/src/treelearner/cuda_kernel_launcher.h b/src/treelearner/cuda_kernel_launcher.h index 1241a9cafb9..efe8e4b0d4a 100644 --- a/src/treelearner/cuda_kernel_launcher.h +++ b/src/treelearner/cuda_kernel_launcher.h @@ -1,9 +1,13 @@ +/*! + * Copyright (c) 2020 IBM Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + */ #ifndef LGBM_KERNEL_LAUNCHER #define LGBM_KERNEL_LAUNCHER #ifdef USE_CUDA // what should I include?? 
-#include "kernels/histogram_16_64_256.hu" // kernel, acc_type, data_size_t, uchar, score_t +#include "kernels/histogram_16_64_256.hu" // kernel, acc_type, data_size_t, uchar, score_t #include struct ThreadData { @@ -19,9 +23,9 @@ struct ThreadData { cudaStream_t stream; uint8_t* device_features; uint8_t* device_feature_masks; - //data_size_t num_data; + // data_size_t num_data; data_size_t* device_data_indices; - //data_size_t leaf_num_data; + // data_size_t leaf_num_data; score_t* device_gradients; score_t* device_hessians; score_t hessians_const; @@ -41,26 +45,26 @@ struct ThreadData { void cuda_histogram( - int histogram_size, - data_size_t leaf_num_data, - data_size_t num_data, - bool use_all_features, - bool is_constant_hessian, - int num_workgroups, - cudaStream_t stream, - uint8_t* arg0, - uint8_t* arg1, - data_size_t arg2, - data_size_t* arg3, - data_size_t arg4, - score_t* arg5, - score_t* arg6, - score_t arg6_const, - char* arg7, - volatile int* arg8, - void* arg9, - size_t exp_workgroups_per_feature); + int histogram_size, + data_size_t leaf_num_data, + data_size_t num_data, + bool use_all_features, + bool is_constant_hessian, + int num_workgroups, + cudaStream_t stream, + uint8_t* arg0, + uint8_t* arg1, + data_size_t arg2, + data_size_t* arg3, + data_size_t arg4, + score_t* arg5, + score_t* arg6, + score_t arg6_const, + char* arg7, + volatile int* arg8, + void* arg9, + size_t exp_workgroups_per_feature); -#endif //USE_CUDA +#endif // USE_CUDA #endif // LGBM_KERNEL_LAUNCHER diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index f45319ae818..0588ad14fe0 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -1,4 +1,13 @@ +/*! + * Copyright (c) 2020 IBM Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. 
+ */ #ifdef USE_CUDA +#include +#include + +#include + #include "cuda_tree_learner.h" #include "../io/dense_bin.hpp" @@ -6,20 +15,15 @@ #include #include -#include -#include - #include -#define cudaMemcpy_DEBUG 0 // 1: DEBUG cudaMemcpy -#define ResetTrainingData_DEBUG 0 // 1: Debug ResetTrainingData - -#include +#define cudaMemcpy_DEBUG 0 // 1: DEBUG cudaMemcpy +#define ResetTrainingData_DEBUG 0 // 1: Debug ResetTrainingData #define GPU_DEBUG 0 static void *launch_cuda_histogram(void *thread_data) { - ThreadData td = *(ThreadData*)thread_data; + ThreadData td = *(reinterpret_cast(thread_data)); int device_id = td.device_id; CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); @@ -54,7 +58,7 @@ CUDATreeLearner::CUDATreeLearner(const Config* config) :SerialTreeLearner(config) { use_bagging_ = false; nthreads_ = 0; - if(config->gpu_use_dp && USE_DP_FLOAT) Log::Info("LightGBM-CUDA using CUDA trainer with DP float!!"); + if (config->gpu_use_dp && USE_DP_FLOAT) Log::Info("LightGBM-CUDA using CUDA trainer with DP float!!"); else Log::Info("LightGBM-CUDA using CUDA trainer with SP float!!"); } @@ -75,7 +79,7 @@ void CUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, is_use_subset_ = is_use_subset; // Initialize GPU buffers and kernels & LGBM_CUDA: get device info - InitGPU(config_->num_gpu); // LGBM_CUDA + InitGPU(config_->num_gpu); // LGBM_CUDA } // some functions used for debugging the GPU histogram construction @@ -104,7 +108,7 @@ int CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int d int i; int retval = 0; printf("Comparing Histograms, feature_id = %d, size = %d\n", feature_id, (int) size); - if (dp_flag) { // double precision + if (dp_flag) { // double precision double af, bf; long long int ai, bi; for (i = 0; i < (int) size; ++i) { @@ -121,8 +125,7 @@ int CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int d printf("i = %5d, h1.hess %lld, h2.hess %lld\n", i, ai, bi); ++retval; } - } - else { + } else { af = GET_HESS(h1, i); bf = GET_HESS(h2, i); if (((std::fabs(af - bf))/af) >= 1e-6) { @@ -131,8 +134,7 @@ int CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int d } } } - } - else { // single precision + } else { // single precision float af, bf; int ai, bi; for (i = 0; i < (int) size; ++i) { @@ -149,8 +151,7 @@ int CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int d printf("i = %5d, h1.hess %d, h2.hess %d\n", i, ai, bi); ++retval; } - } - else { + } else { af = GET_HESS(h1, i); bf = GET_HESS(h2, i); if (((std::fabs(af - bf))/af) >= 1e-5) { @@ -199,7 +200,7 @@ void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featu std::vector num_gpu_workgroups; ThreadData *thread_data = (ThreadData*)malloc(sizeof(ThreadData) * num_gpu_); - for(int device_id = 0; device_id < num_gpu_; ++device_id) { + for (int device_id = 0; device_id < num_gpu_; ++device_id) { int num_gpu_feature_groups = num_gpu_feature_groups_[device_id]; int num_workgroups = (1 << exp_workgroups_per_feature) * num_gpu_feature_groups; num_gpu_workgroups.push_back(num_workgroups); @@ -213,8 +214,8 @@ void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featu num_workgroups, exp_workgroups_per_feature); } - for(int device_id = 0; device_id < num_gpu_; ++device_id) { - if (pthread_create(cpu_threads_[device_id], NULL, launch_cuda_histogram, (void *)(&thread_data[device_id]))){ + for (int device_id = 0; device_id < num_gpu_; ++device_id) { + if 
(pthread_create(cpu_threads_[device_id], NULL, launch_cuda_histogram, (void *)(&thread_data[device_id]))) { fprintf(stderr, "Error in creating threads. Exiting\n"); exit(0); } @@ -222,14 +223,14 @@ void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featu /* Wait for the threads to finish */ - for(int device_id = 0; device_id < num_gpu_; ++device_id) { - if (pthread_join(*(cpu_threads_[device_id]), NULL)){ + for (int device_id = 0; device_id < num_gpu_; ++device_id) { + if (pthread_join(*(cpu_threads_[device_id]), NULL)) { fprintf(stderr, "Error in joining threads. Exiting\n"); exit(0); } } - for(int device_id = 0; device_id < num_gpu_; ++device_id) { + for (int device_id = 0; device_id < num_gpu_; ++device_id) { // copy the results asynchronously. Size depends on if double precision is used @@ -247,7 +248,7 @@ void CUDATreeLearner::WaitAndGetHistograms(FeatureHistogram* leaf_histogram_arra HistType* hist_outputs = (HistType*) host_histogram_outputs_; #pragma omp parallel for schedule(static, num_gpu_) - for(int device_id = 0; device_id < num_gpu_; ++device_id) { + for (int device_id = 0; device_id < num_gpu_; ++device_id) { // auto start_time = std::chrono::steady_clock::now(); @@ -257,7 +258,7 @@ void CUDATreeLearner::WaitAndGetHistograms(FeatureHistogram* leaf_histogram_arra } #pragma omp parallel for schedule(static) - for(int i = 0; i < num_dense_feature_groups_; ++i) { + for (int i = 0; i < num_dense_feature_groups_; ++i) { if (!feature_masks_[i]) { continue; } @@ -286,13 +287,11 @@ void CUDATreeLearner::CountDenseFeatureGroups() { if (!num_dense_feature_groups_) { Log::Warning("GPU acceleration is disabled because no non-trival dense features can be found"); } - } // LGBM_CUDA void CUDATreeLearner::prevAllocateGPUMemory() { - // how many feature-group tuples we have // leave some safe margin for prefetching // 256 work-items per workgroup. Each work-item prefetches one tuple for that feature @@ -317,7 +316,7 @@ void CUDATreeLearner::prevAllocateGPUMemory() { int offset = 0; - for(int i = 0; i < num_gpu_; ++i) { + for (int i = 0; i < num_gpu_; ++i) { offset_gpu_feature_groups_.at(i) = offset; num_gpu_feature_groups_.at(i) = (i < remain_features)? num_features_per_gpu + 1 : num_features_per_gpu; offset += num_gpu_feature_groups_.at(i); @@ -329,7 +328,7 @@ void CUDATreeLearner::prevAllocateGPUMemory() { cudaPointerAttributes attributes; cudaPointerGetAttributes (&attributes, feature_masks_.data()); - if ((attributes.type == cudaMemoryTypeHost) && (attributes.devicePointer != NULL)){ + if ((attributes.type == cudaMemoryTypeHost) && (attributes.devicePointer != NULL)) { CUDASUCCESS_OR_FATAL(cudaHostUnregister(feature_masks_.data())); } } @@ -343,7 +342,7 @@ void CUDATreeLearner::prevAllocateGPUMemory() { memset(ptr_pinned_feature_masks_, 0, num_dense_feature_groups_); // histogram bin entry size depends on the precision (single/double) - hist_bin_entry_sz_ = 2 * (config_->gpu_use_dp ? sizeof(hist_t) : sizeof(gpu_hist_t)); // two elements in this "size" + hist_bin_entry_sz_ = 2 * (config_->gpu_use_dp ? 
sizeof(hist_t) : sizeof(gpu_hist_t)); // two elements in this "size" // host_size histogram outputs // host_histogram_outputs_ = malloc(num_dense_feature_groups_ * device_bin_size_ * hist_bin_entry_sz_); @@ -360,7 +359,7 @@ void CUDATreeLearner::AllocateGPUMemory() { #pragma omp parallel for schedule(static, num_gpu_) - for(int device_id = 0; device_id < num_gpu_; ++device_id) { + for (int device_id = 0; device_id < num_gpu_; ++device_id) { // do nothing it there is no gpu feature int num_gpu_feature_groups = num_gpu_feature_groups_[device_id]; if (num_gpu_feature_groups) { @@ -377,15 +376,15 @@ void CUDATreeLearner::AllocateGPUMemory() { // allocate space for gradients and hessians on device // we will copy gradients and hessians in after ordered_gradients_ and ordered_hessians_ are constructed - if (device_gradients_[device_id] != NULL){ + if (device_gradients_[device_id] != NULL) { CUDASUCCESS_OR_FATAL(cudaFree(device_gradients_[device_id])); } - if (device_hessians_[device_id] != NULL){ + if (device_hessians_[device_id] != NULL) { CUDASUCCESS_OR_FATAL(cudaFree(device_hessians_[device_id])); } - if (device_feature_masks_[device_id] != NULL){ + if (device_feature_masks_[device_id] != NULL) { CUDASUCCESS_OR_FATAL(cudaFree(device_feature_masks_[device_id])); } @@ -396,7 +395,7 @@ void CUDATreeLearner::AllocateGPUMemory() { // copy indices to the device - if (device_data_indices_[device_id] != NULL){ + if (device_data_indices_[device_id] != NULL) { CUDASUCCESS_OR_FATAL(cudaFree(device_data_indices_[device_id])); } @@ -427,7 +426,6 @@ void CUDATreeLearner::AllocateGPUMemory() { CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_histogram_outputs_[device_id]), (size_t) num_gpu_feature_groups * device_bin_size_ * hist_bin_entry_sz_)); } } - } void CUDATreeLearner::ResetGPUMemory() { @@ -435,14 +433,13 @@ void CUDATreeLearner::ResetGPUMemory() { // clear sparse/dense maps dense_feature_group_map_.clear(); sparse_feature_group_map_.clear(); - } // LGBM_CUDA void CUDATreeLearner::copyDenseFeature() { - if (num_feature_groups_ == 0){ - LGBM_config_::current_learner=use_cpu_learner; + if (num_feature_groups_ == 0) { + LGBM_config_::current_learner = use_cpu_learner; return; } @@ -462,29 +459,27 @@ void CUDATreeLearner::copyDenseFeature() { dense_feature_group_map_.push_back(i); auto sizes_in_byte = train_data_->FeatureGroupSizesInByte(i); void* tmp_data = train_data_->FeatureGroupData(i); - Log::Debug("Started copying dense features from CPU to GPU - 2"); + Log::Debug("Started copying dense features from CPU to GPU - 2"); CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(&device_features[copied_feature * num_data_], tmp_data, sizes_in_byte, cudaMemcpyHostToDevice, stream_[device_id])); - Log::Debug("Started copying dense features from CPU to GPU - 3"); + Log::Debug("Started copying dense features from CPU to GPU - 3"); copied_feature++; // reset device info - if(copied_feature == (size_t) num_gpu_feature_groups_[device_id]) { + if (copied_feature == (size_t) num_gpu_feature_groups_[device_id]) { CUDASUCCESS_OR_FATAL(cudaEventRecord(features_future_[device_id], stream_[device_id])); device_id += 1; copied_feature = 0; - if(device_id < num_gpu_) { + if (device_id < num_gpu_) { device_features = device_features_[device_id]; CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); } } - } - else { + } else { sparse_feature_group_map_.push_back(i); } } // data transfer time // LGBM_CUDA: async copy, so it is not the real data transfer time // std::chrono::duration end_time = std::chrono::steady_clock::now() - start_time; - } @@ 
-513,35 +508,31 @@ void CUDATreeLearner::InitGPU(int num_gpu) { #endif if (max_num_bin_ <= 16) { - device_bin_size_ = 16; //LGBM_CUDA + device_bin_size_ = 16; // LGBM_CUDA histogram_size_ = 16; - dword_features_ = 1; // LGBM_CUDA - } - else if (max_num_bin_ <= 64) { - device_bin_size_ = 64; //LGBM_CUDA + dword_features_ = 1; // LGBM_CUDA + } else if (max_num_bin_ <= 64) { + device_bin_size_ = 64; // LGBM_CUDA histogram_size_ = 64; - dword_features_ = 1; // LGBM_CUDA - } - else if ( max_num_bin_ <= 256) { + dword_features_ = 1; // LGBM_CUDA + } else if ( max_num_bin_ <= 256) { Log::Debug("device_bin_size_ = 256"); device_bin_size_ = 256; histogram_size_ = 256; - dword_features_ = 1; // LGBM_CUDA - } - else { + dword_features_ = 1; // LGBM_CUDA + } else { Log::Fatal("bin size %d cannot run on GPU", max_num_bin_); } - if(max_num_bin_ == 65) { + if (max_num_bin_ == 65) { Log::Warning("Setting max_bin to 63 is sugguested for best performance"); } - if(max_num_bin_ == 17) { + if (max_num_bin_ == 17) { Log::Warning("Setting max_bin to 15 is sugguested for best performance"); } // LGBM_CUDA: get num_dense_feature_groups_ CountDenseFeatureGroups(); - if (num_gpu > num_dense_feature_groups_) num_gpu = num_dense_feature_groups_; // LGBM_CUDA: initialize GPU @@ -552,7 +543,7 @@ void CUDATreeLearner::InitGPU(int num_gpu) { // LGBM_CUDA: set cpu threads cpu_threads_ = (pthread_t **)malloc(sizeof(pthread_t *)*num_gpu_); - for(int device_id = 0; device_id < num_gpu_; ++device_id) { + for (int device_id = 0; device_id < num_gpu_; ++device_id) { cpu_threads_[device_id] = (pthread_t *)malloc(sizeof(pthread_t)); } @@ -582,7 +573,7 @@ void CUDATreeLearner::InitGPU(int num_gpu) { kernel_input_wait_time_.resize(num_gpu_, std::chrono::milliseconds(0)); //kernel_output_wait_time_.resize(num_gpu_, std::chrono::milliseconds(0)); - for(int i = 0; i < num_gpu_; ++i) { + for (int i = 0; i < num_gpu_; ++i) { CUDASUCCESS_OR_FATAL(cudaSetDevice(i)); CUDASUCCESS_OR_FATAL(cudaStreamCreate(&(stream_[i]))); CUDASUCCESS_OR_FATAL(cudaEventCreate(&(hessians_future_[i]))); @@ -603,13 +594,12 @@ void CUDATreeLearner::InitGPU(int num_gpu) { if (!is_use_subset_) { Log::Debug("copyDenseFeature at the initialization\n"); - copyDenseFeature(); // LGBM_CUDA + copyDenseFeature(); // LGBM_CUDA } - } Tree* CUDATreeLearner::Train(const score_t* gradients, const score_t *hessians, - bool is_constant_hessian, Json& forced_split_json) { + bool is_constant_hessian, const Json& forced_split_json) { // check if we need to recompile the GPU kernel (is_constant_hessian changed) // this should rarely occur @@ -692,47 +682,47 @@ void CUDATreeLearner::BeforeTrain() { // Copy initial full hessians and gradients to GPU. // We start copying as early as possible, instead of at ConstructHistogram(). 
- if ((hessians_ != NULL) && (gradients_ != NULL)){ - if (!use_bagging_ && num_dense_feature_groups_) { + if ((hessians_ != NULL) && (gradients_ != NULL)) { + if (!use_bagging_ && num_dense_feature_groups_) { - Log::Debug("CudaTreeLearner::BeforeTrain() No baggings, dense_feature_groups_=%d", num_dense_feature_groups_); + Log::Debug("CudaTreeLearner::BeforeTrain() No baggings, dense_feature_groups_=%d", num_dense_feature_groups_); - for(int device_id = 0; device_id < num_gpu_; ++device_id) { - if (!is_constant_hessian_) { - Log::Debug("CUDATreeLearner::BeforeTrain(): Starting hessians_ -> device_hessians_"); + for (int device_id = 0; device_id < num_gpu_; ++device_id) { + if (!is_constant_hessian_) { + Log::Debug("CUDATreeLearner::BeforeTrain(): Starting hessians_ -> device_hessians_"); - #if cudaMemcpy_DEBUG == 1 // LGBM_CUDA - auto start_device_hessians_time = std::chrono::steady_clock::now(); - #endif + #if cudaMemcpy_DEBUG == 1 // LGBM_CUDA + auto start_device_hessians_time = std::chrono::steady_clock::now(); + #endif - //const data_size_t* indices = data_partition_->indices(); - //data_size_t cnt = data_partition_->leaf_count(0); + // const data_size_t* indices = data_partition_->indices(); + // data_size_t cnt = data_partition_->leaf_count(0); - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_hessians_[device_id], hessians_, num_data_*sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_hessians_[device_id], hessians_, num_data_*sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); - CUDASUCCESS_OR_FATAL(cudaEventRecord(hessians_future_[device_id], stream_[device_id])); + CUDASUCCESS_OR_FATAL(cudaEventRecord(hessians_future_[device_id], stream_[device_id])); - #if cudaMemcpy_DEBUG == 1 // LGBM_CUDA - device_hessians_time = std::chrono::steady_clock::now() - start_device_hessians_time; - #endif + #if cudaMemcpy_DEBUG == 1 // LGBM_CUDA + device_hessians_time = std::chrono::steady_clock::now() - start_device_hessians_time; + #endif - Log::Debug("queued copy of device_hessians_"); - } + Log::Debug("queued copy of device_hessians_"); + } - #if cudaMemcpy_DEBUG == 1 // LGBM_CUDA - auto start_device_gradients_time = std::chrono::steady_clock::now(); - #endif + #if cudaMemcpy_DEBUG == 1 // LGBM_CUDA + auto start_device_gradients_time = std::chrono::steady_clock::now(); + #endif - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_gradients_[device_id], gradients_, num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); - CUDASUCCESS_OR_FATAL(cudaEventRecord(gradients_future_[device_id], stream_[device_id])); + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_gradients_[device_id], gradients_, num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); + CUDASUCCESS_OR_FATAL(cudaEventRecord(gradients_future_[device_id], stream_[device_id])); - #if cudaMemcpy_DEBUG == 1 // LGBM_CUDA - device_gradients_time = std::chrono::steady_clock::now() - start_device_gradients_time; - #endif + #if cudaMemcpy_DEBUG == 1 // LGBM_CUDA + device_gradients_time = std::chrono::steady_clock::now() - start_device_gradients_time; + #endif - Log::Debug("CUDATreeLearner::BeforeTrain: issued gradients_ -> device_gradients_"); - } - } + Log::Debug("CUDATreeLearner::BeforeTrain: issued gradients_ -> device_gradients_"); + } + } } #if 0 @@ -740,33 +730,33 @@ void CUDATreeLearner::BeforeTrain() { #endif // use bagging - if ((hessians_ != NULL) && (gradients_ != NULL)){ - if (data_partition_->leaf_count(0) != num_data_ && 
num_dense_feature_groups_) { + if ((hessians_ != NULL) && (gradients_ != NULL)) { + if (data_partition_->leaf_count(0) != num_data_ && num_dense_feature_groups_) { - // On GPU, we start copying indices, gradients and hessians now, instead at ConstructHistogram() - // copy used gradients and hessians to ordered buffer + // On GPU, we start copying indices, gradients and hessians now, instead at ConstructHistogram() + // copy used gradients and hessians to ordered buffer - const data_size_t* indices = data_partition_->indices(); - data_size_t cnt = data_partition_->leaf_count(0); + const data_size_t* indices = data_partition_->indices(); + data_size_t cnt = data_partition_->leaf_count(0); - // transfer the indices to GPU - for(int device_id = 0; device_id < num_gpu_; ++device_id) { + // transfer the indices to GPU + for (int device_id = 0; device_id < num_gpu_; ++device_id) { - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_data_indices_[device_id], indices, cnt * sizeof(*indices), cudaMemcpyHostToDevice, stream_[device_id])); - CUDASUCCESS_OR_FATAL(cudaEventRecord(indices_future_[device_id], stream_[device_id])); + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_data_indices_[device_id], indices, cnt * sizeof(*indices), cudaMemcpyHostToDevice, stream_[device_id])); + CUDASUCCESS_OR_FATAL(cudaEventRecord(indices_future_[device_id], stream_[device_id])); - if (!is_constant_hessian_) { + if (!is_constant_hessian_) { - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_hessians_[device_id], (void *) &(hessians_[0]), num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); - CUDASUCCESS_OR_FATAL(cudaEventRecord(hessians_future_[device_id], stream_[device_id])); + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_hessians_[device_id], (void *) &(hessians_[0]), num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); + CUDASUCCESS_OR_FATAL(cudaEventRecord(hessians_future_[device_id], stream_[device_id])); - } + } - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_gradients_[device_id], (void *) &(gradients_[0]), num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); - CUDASUCCESS_OR_FATAL(cudaEventRecord(gradients_future_[device_id], stream_[device_id])); + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_gradients_[device_id], (void *) &(gradients_[0]), num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); + CUDASUCCESS_OR_FATAL(cudaEventRecord(gradients_future_[device_id], stream_[device_id])); + } } } - } } bool CUDATreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) { @@ -798,7 +788,7 @@ bool CUDATreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int r #if GPU_DEBUG >= 2 #endif - for(int device_id = 0; device_id < num_gpu_; ++device_id) { + for (int device_id = 0; device_id < num_gpu_; ++device_id) { CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_data_indices_[device_id], &indices[begin], (end-begin) * sizeof(data_size_t), cudaMemcpyHostToDevice, stream_[device_id])); CUDASUCCESS_OR_FATAL(cudaEventRecord(indices_future_[device_id], stream_[device_id])); } @@ -813,12 +803,10 @@ bool CUDATreeLearner::ConstructGPUHistogramsAsync( const std::vector& is_feature_used, const data_size_t* data_indices, data_size_t num_data) { - if (num_data <= 0) { return false; } - // do nothing if no features can be processed on GPU if (!num_dense_feature_groups_) { Log::Debug("no dense feature groups, returning"); @@ -828,7 +816,7 @@ bool CUDATreeLearner::ConstructGPUHistogramsAsync( // copy data indices if it is not null if 
(data_indices != nullptr && num_data != num_data_) { - for(int device_id = 0; device_id < num_gpu_; ++device_id) { + for (int device_id = 0; device_id < num_gpu_; ++device_id) { CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_data_indices_[device_id], data_indices, num_data * sizeof(data_size_t), cudaMemcpyHostToDevice, stream_[device_id])); CUDASUCCESS_OR_FATAL(cudaEventRecord(indices_future_[device_id], stream_[device_id])); @@ -841,9 +829,9 @@ bool CUDATreeLearner::ConstructGPUHistogramsAsync( #pragma omp parallel for schedule(static,1024) if (num_features_ >= 2048) for (int i = 0; i < num_features_; ++i) { - if(is_feature_used[i]) { - int feature_group = train_data_->Feature2Group(i); // LGBM_CUDA - is_feature_group_used[feature_group] = (train_data_->FeatureGroupNumBin(feature_group)<=16)? 2 : 1; // LGBM_CUDA + if (is_feature_used[i]) { + int feature_group = train_data_->Feature2Group(i); // LGBM_CUDA + is_feature_group_used[feature_group] = (train_data_->FeatureGroupNumBin(feature_group)<=16)? 2 : 1; // LGBM_CUDA } } @@ -855,8 +843,7 @@ bool CUDATreeLearner::ConstructGPUHistogramsAsync( //feature_masks_[i] = 1; feature_masks_[i] = is_feature_group_used[dense_feature_group_map_[i]]; ++used_dense_feature_groups; - } - else { + } else { feature_masks_[i] = 0; } } @@ -872,11 +859,11 @@ bool CUDATreeLearner::ConstructGPUHistogramsAsync( // LGBM_CUDA We now copy even if all features are used. - #pragma omp parallel for schedule(static, num_gpu_) - for(int device_id = 0; device_id < num_gpu_; ++device_id) { - int offset = offset_gpu_feature_groups_[device_id]; - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_feature_masks_[device_id], ptr_pinned_feature_masks_ + offset, num_gpu_feature_groups_[device_id] , cudaMemcpyHostToDevice, stream_[device_id])); - } + #pragma omp parallel for schedule(static, num_gpu_) + for (int device_id = 0; device_id < num_gpu_; ++device_id) { + int offset = offset_gpu_feature_groups_[device_id]; + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_feature_masks_[device_id], ptr_pinned_feature_masks_ + offset, num_gpu_feature_groups_[device_id] , cudaMemcpyHostToDevice, stream_[device_id])); + } // All data have been prepared, now run the GPU kernel @@ -902,8 +889,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ if (train_data_->IsMultiGroup(train_data_->Feature2Group(feature_index))) { is_sparse_feature_used[feature_index] = 1; num_sparse_features++; - } - else { + } else { is_dense_feature_used[feature_index] = 1; num_dense_features++; } @@ -916,7 +902,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ int exp_workgroups_per_feature = GetNumWorkgroupsPerFeature(smaller_leaf_splits_->num_data_in_leaf()); // if the workgroup per feature is 1 (2^0), return as the work is too small for a GPU - if (exp_workgroups_per_feature == 0){ + if (exp_workgroups_per_feature == 0) { return SerialTreeLearner::ConstructHistograms(is_feature_used, use_subtract); } @@ -926,7 +912,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ // then construct sparse features on CPU // We set data_indices to null to avoid rebuilding ordered gradients/hessians - if (num_sparse_features > 0){ + if (num_sparse_features > 0) { train_data_->ConstructHistograms(is_sparse_feature_used, smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(), gradients_, hessians_, @@ -940,8 +926,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ if (config_->gpu_use_dp) { // use double precision 
WaitAndGetHistograms(smaller_leaf_histogram_array_); - } - else { + } else { // use single precision WaitAndGetHistograms(smaller_leaf_histogram_array_); } @@ -1011,13 +996,12 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ retval = CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index, config_->gpu_use_dp, is_constant_hessian_); if (num_data == num_data_) { printf("CompareHistograms reports %d errors\n", retval); - } - else { + } else { printf("CompareHistograms reports %d errors\n", retval); } std::copy(gpu_histogram, gpu_histogram + size * 2, current_histogram); delete [] gpu_histogram; - //break; // LGBM_CUDA: see only first feature info + //break; // LGBM_CUDA: see only first feature info } printf("End Comparing Histogram between GPU and CPU\n"); fflush(stderr); @@ -1037,7 +1021,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ // then construct sparse features on CPU // We set data_indices to null to avoid rebuilding ordered gradients/hessians - if (num_sparse_features > 0){ + if (num_sparse_features > 0) { //train_data_->ConstructHistograms(is_sparse_feature_used, // nullptr, larger_leaf_splits_->num_data_in_leaf(), // larger_leaf_splits_->leaf_index(), @@ -1058,8 +1042,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ if (config_->gpu_use_dp) { // use double precision WaitAndGetHistograms(larger_leaf_histogram_array_); - } - else { + } else { // use single precision WaitAndGetHistograms(larger_leaf_histogram_array_); } diff --git a/src/treelearner/cuda_tree_learner.h b/src/treelearner/cuda_tree_learner.h index 7b256345c82..384ec57f66a 100644 --- a/src/treelearner/cuda_tree_learner.h +++ b/src/treelearner/cuda_tree_learner.h @@ -2,6 +2,12 @@ #ifndef LGBM_TREELEARNER_CUDA_TREE_LEARNER_H_ #define LGBM_TREELEARNET_CUDA_TREE_LEARNER_H_ +#include +#include +#include +#include +#include + #include #include #include @@ -13,12 +19,6 @@ #include "split_info.hpp" #include "leaf_splits.hpp" -#include -#include -#include -#include -#include - #ifdef USE_CUDA #include @@ -34,261 +34,257 @@ namespace LightGBM { * \brief CUDA-based parallel learning algorithm. 
*/ class CUDATreeLearner: public SerialTreeLearner { -public: - explicit CUDATreeLearner(const Config* tree_config); - ~CUDATreeLearner(); - // LGBM_CUDA: is_use_subset is used by CUDA only - void Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) override; - void ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) override; - Tree* Train(const score_t* gradients, const score_t *hessians, - bool is_constant_hessian, Json& forced_split_json) override; + public: + explicit CUDATreeLearner(const Config* tree_config); + ~CUDATreeLearner(); + // LGBM_CUDA: is_use_subset is used by CUDA only + void Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) override; + void ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) override; + Tree* Train(const score_t* gradients, const score_t *hessians, + bool is_constant_hessian, const Json& forced_split_json) override; - void SetBaggingData(const Dataset* subset, const data_size_t* used_indices, data_size_t num_data) override { - SerialTreeLearner::SetBaggingData(subset, used_indices, num_data); - // determine if we are using bagging before we construct the data partition - // thus we can start data movement to GPU earlier - if (subset == nullptr && used_indices != nullptr) { - if (num_data != num_data_) { - use_bagging_ = true; - return; + void SetBaggingData(const Dataset* subset, const data_size_t* used_indices, data_size_t num_data) override { + SerialTreeLearner::SetBaggingData(subset, used_indices, num_data); + // determine if we are using bagging before we construct the data partition + // thus we can start data movement to GPU earlier + if (subset == nullptr && used_indices != nullptr) { + if (num_data != num_data_) { + use_bagging_ = true; + return; + } } + use_bagging_ = false; } - use_bagging_ = false; - } -protected: - void BeforeTrain() override; - bool BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) override; - void FindBestSplits() override; - void Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) override; - void ConstructHistograms(const std::vector& is_feature_used, bool use_subtract) override; -private: - /*! \brief 4-byte feature tuple used by GPU kernels */ - //struct Feature4 { - // uint8_t s[4]; - //}; + protected: + void BeforeTrain() override; + bool BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) override; + void FindBestSplits() override; + void Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) override; + void ConstructHistograms(const std::vector& is_feature_used, bool use_subtract) override; + private: + /*! \brief 4-byte feature tuple used by GPU kernels */ + //struct Feature4 { + // uint8_t s[4]; + //}; - typedef float gpu_hist_t; + typedef float gpu_hist_t; - /*! - * \brief Find the best number of workgroups processing one feature for maximizing efficiency - * \param leaf_num_data The number of data examples on the current leaf being processed - * \return Log2 of the best number for workgroups per feature, in range 0...kMaxLogWorkgroupsPerFeature - */ - int GetNumWorkgroupsPerFeature(data_size_t leaf_num_data); + /*! 
+ * \brief Find the best number of workgroups processing one feature for maximizing efficiency + * \param leaf_num_data The number of data examples on the current leaf being processed + * \return Log2 of the best number for workgroups per feature, in range 0...kMaxLogWorkgroupsPerFeature + */ + int GetNumWorkgroupsPerFeature(data_size_t leaf_num_data); - /*! - * \brief Initialize GPU device - * \LGBM_CUDA: param num_gpu: number of maximum gpus - */ - void InitGPU(int num_gpu); + /*! + * \brief Initialize GPU device + * \LGBM_CUDA: param num_gpu: number of maximum gpus + */ + void InitGPU(int num_gpu); - /*! - * \brief Allocate memory for GPU computation // LGBM_CUDA: alloc only - */ - void CountDenseFeatureGroups(); // compute num_dense_feature_group - void prevAllocateGPUMemory(); // compute CPU-side param calculation & Pin HostMemory - void AllocateGPUMemory(); + /*! + * \brief Allocate memory for GPU computation // LGBM_CUDA: alloc only + */ + void CountDenseFeatureGroups(); // compute num_dense_feature_group + void prevAllocateGPUMemory(); // compute CPU-side param calculation & Pin HostMemory + void AllocateGPUMemory(); - /*! - * \ LGBM_CUDA: ResetGPUMemory - */ - void ResetGPUMemory(); + /*! + * \ LGBM_CUDA: ResetGPUMemory + */ + void ResetGPUMemory(); - /*! - * \ LGBM_CUDA: copy dense feature from CPU to GPU - */ - void copyDenseFeature(); + /*! + * \ LGBM_CUDA: copy dense feature from CPU to GPU + */ + void copyDenseFeature(); - - /*! - * \brief Compute GPU feature histogram for the current leaf. - * Indices, gradients and hessians have been copied to the device. - * \param leaf_num_data Number of data on current leaf - * \param use_all_features Set to true to not use feature masks, with a faster kernel - */ - void GPUHistogram(data_size_t leaf_num_data, bool use_all_features); + /*! + * \brief Compute GPU feature histogram for the current leaf. + * Indices, gradients and hessians have been copied to the device. 
+ * \param leaf_num_data Number of data on current leaf + * \param use_all_features Set to true to not use feature masks, with a faster kernel + */ + void GPUHistogram(data_size_t leaf_num_data, bool use_all_features); - void SetThreadData(ThreadData* thread_data, int device_id, int histogram_size, - int leaf_num_data, bool use_all_features, - int num_workgroups, int exp_workgroups_per_feature) { - ThreadData* td = &thread_data[device_id]; - td->device_id = device_id; - td->histogram_size = histogram_size; - td->leaf_num_data = leaf_num_data; - td->num_data = num_data_; - td->use_all_features = use_all_features; - td->is_constant_hessian = is_constant_hessian_; - td->num_workgroups = num_workgroups; - td->stream = stream_[device_id]; - td->device_features = device_features_[device_id]; - td->device_feature_masks = reinterpret_cast(device_feature_masks_[device_id]); - td->device_data_indices = reinterpret_cast(device_data_indices_[device_id]); - td->device_gradients = device_gradients_[device_id]; - td->device_hessians = device_hessians_[device_id]; - td->hessians_const = hessians_[0]; - td->device_subhistograms = device_subhistograms_[device_id]; - td->sync_counters = sync_counters_[device_id]; - td->device_histogram_outputs= device_histogram_outputs_[device_id]; - td->exp_workgroups_per_feature = exp_workgroups_per_feature; + void SetThreadData(ThreadData* thread_data, int device_id, int histogram_size, + int leaf_num_data, bool use_all_features, + int num_workgroups, int exp_workgroups_per_feature) { + ThreadData* td = &thread_data[device_id]; + td->device_id = device_id; + td->histogram_size = histogram_size; + td->leaf_num_data = leaf_num_data; + td->num_data = num_data_; + td->use_all_features = use_all_features; + td->is_constant_hessian = is_constant_hessian_; + td->num_workgroups = num_workgroups; + td->stream = stream_[device_id]; + td->device_features = device_features_[device_id]; + td->device_feature_masks = reinterpret_cast(device_feature_masks_[device_id]); + td->device_data_indices = reinterpret_cast(device_data_indices_[device_id]); + td->device_gradients = device_gradients_[device_id]; + td->device_hessians = device_hessians_[device_id]; + td->hessians_const = hessians_[0]; + td->device_subhistograms = device_subhistograms_[device_id]; + td->sync_counters = sync_counters_[device_id]; + td->device_histogram_outputs = device_histogram_outputs_[device_id]; + td->exp_workgroups_per_feature = exp_workgroups_per_feature; - td->kernel_start = &(kernel_start_[device_id]); - td->kernel_wait_obj = &(kernel_wait_obj_[device_id]); - td->kernel_input_wait_time = &(kernel_input_wait_time_[device_id]); + td->kernel_start = &(kernel_start_[device_id]); + td->kernel_wait_obj = &(kernel_wait_obj_[device_id]); + td->kernel_input_wait_time = &(kernel_input_wait_time_[device_id]); - size_t output_size = num_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_; - size_t host_output_offset = offset_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_; - td->output_size = output_size; - td->host_histogram_output = (char*)host_histogram_outputs_ + host_output_offset; - td->histograms_wait_obj = &(histograms_wait_obj_[device_id]); - } + size_t output_size = num_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_; + size_t host_output_offset = offset_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_; + td->output_size = output_size; + 
td->host_histogram_output = (char*)host_histogram_outputs_ + host_output_offset; + td->histograms_wait_obj = &(histograms_wait_obj_[device_id]); + } + // LGBM_CUDA: thread work + // typedef void * (*THREADFUNCPTR)(void *); + // void* launch_gpu_kernel(void *td); - // LGBM_CUDA: thread work - //typedef void * (*THREADFUNCPTR)(void *); - //void* launch_gpu_kernel(void *td); - - /*! - * \brief Wait for GPU kernel execution and read histogram - * \param histograms Destination of histogram results from GPU. - */ - template - void WaitAndGetHistograms(FeatureHistogram* leaf_histogram_array); - - /*! - * \brief Construct GPU histogram asynchronously. - * Interface is similar to Dataset::ConstructHistograms(). - * \param is_feature_used A predicate vector for enabling each feature - * \param data_indices Array of data example IDs to be included in histogram, will be copied to GPU. - * Set to nullptr to skip copy to GPU. - * \param num_data Number of data examples to be included in histogram - * \param gradients Array of gradients for all examples. - * \param hessians Array of hessians for all examples. - * \param ordered_gradients Ordered gradients will be generated and copied to GPU when gradients is not nullptr, - * Set gradients to nullptr to skip copy to GPU. - * \param ordered_hessians Ordered hessians will be generated and copied to GPU when hessians is not nullptr, - * Set hessians to nullptr to skip copy to GPU. - * \return true if GPU kernel is launched, false if GPU is not used - */ - // LGBM_CUDA v5.2 - bool ConstructGPUHistogramsAsync( - const std::vector& is_feature_used, - const data_size_t* data_indices, data_size_t num_data); + /*! + * \brief Wait for GPU kernel execution and read histogram + * \param histograms Destination of histogram results from GPU. + */ + template + void WaitAndGetHistograms(FeatureHistogram* leaf_histogram_array); + /*! + * \brief Construct GPU histogram asynchronously. + * Interface is similar to Dataset::ConstructHistograms(). + * \param is_feature_used A predicate vector for enabling each feature + * \param data_indices Array of data example IDs to be included in histogram, will be copied to GPU. + * Set to nullptr to skip copy to GPU. + * \param num_data Number of data examples to be included in histogram + * \param gradients Array of gradients for all examples. + * \param hessians Array of hessians for all examples. + * \param ordered_gradients Ordered gradients will be generated and copied to GPU when gradients is not nullptr, + * Set gradients to nullptr to skip copy to GPU. + * \param ordered_hessians Ordered hessians will be generated and copied to GPU when hessians is not nullptr, + * Set hessians to nullptr to skip copy to GPU. + * \return true if GPU kernel is launched, false if GPU is not used + */ + // LGBM_CUDA v5.2 + bool ConstructGPUHistogramsAsync( + const std::vector& is_feature_used, + const data_size_t* data_indices, data_size_t num_data); - /*! brief Log2 of max number of workgroups per feature*/ - const int kMaxLogWorkgroupsPerFeature = 10; // 2^10 - /*! brief Max total number of workgroups with preallocated workspace. - * If we use more than this number of workgroups, we have to reallocate subhistograms */ - //int preallocd_max_num_wg_ = 1024; - std::vector preallocd_max_num_wg_; + /*! brief Log2 of max number of workgroups per feature*/ + const int kMaxLogWorkgroupsPerFeature = 10; // 2^10 + /*! brief Max total number of workgroups with preallocated workspace. 
+ * If we use more than this number of workgroups, we have to reallocate subhistograms */ + // int preallocd_max_num_wg_ = 1024; + std::vector preallocd_max_num_wg_; - /*! \brief True if bagging is used */ - bool use_bagging_; + /*! \brief True if bagging is used */ + bool use_bagging_; - /*! \brief GPU device object */ - //int* dev_; - /*! \brief GPU command queue object */ - std::vector stream_; + /*! \brief GPU device object */ + // int* dev_; + /*! \brief GPU command queue object */ + std::vector stream_; - /*! \brief total number of feature-groups */ - int num_feature_groups_; - /*! \brief total number of dense feature-groups, which will be processed on GPU */ - int num_dense_feature_groups_; - std::vector num_gpu_feature_groups_; // LGBM_CUDA - std::vector offset_gpu_feature_groups_; // LGBM_CUDA - /*! \brief On GPU we read one DWORD (4-byte) of features of one example once. - * With bin size > 16, there are 4 features per DWORD. - * With bin size <=16, there are 8 features per DWORD. - * */ - int dword_features_; - /*! \brief total number of dense feature-group tuples on GPU. - * Each feature tuple is 4-byte (4 features if each feature takes a byte) */ - //int num_dense_feature4_; - /*! \brief Max number of bins of training data, used to determine - * which GPU kernel to use */ - int max_num_bin_; - /*! \brief Used GPU kernel bin size (64, 256) */ - int histogram_size_; - int device_bin_size_; - /*! \brief Size of histogram bin entry, depending if single or double precision is used */ - size_t hist_bin_entry_sz_; - /*! \brief Indices of all dense feature-groups */ - std::vector dense_feature_group_map_; - /*! \brief Indices of all sparse feature-groups */ - std::vector sparse_feature_group_map_; - /*! \brief Multipliers of all dense feature-groups, used for redistributing bins */ - //std::vector device_bin_mults_; - /*! \brief GPU memory object holding the training data */ - //uint8_t *device_features_; - std::vector device_features_; - /*! \brief GPU memory object holding the ordered gradient */ - //score_t *device_gradients_; - std::vector device_gradients_; - /*! \brief Pointer to pinned memory of ordered gradient */ - void * ptr_pinned_gradients_ = nullptr; - /*! \brief GPU memory object holding the ordered hessian */ - //score_t *device_hessians_; - std::vector device_hessians_; - /*! \brief Pointer to pinned memory of ordered hessian */ - void * ptr_pinned_hessians_ = nullptr; - /*! \brief A vector of feature mask. 1 = feature used, 0 = feature not used */ -// std::vector> feature_masks_; - std::vector feature_masks_; - /*! \brief GPU memory object holding the feature masks */ - //void *device_feature_masks_; - std::vector device_feature_masks_; - /*! \brief Pointer to pinned memory of feature masks */ - char* ptr_pinned_feature_masks_ = nullptr; - /*! \brief GPU memory object holding indices of the leaf being processed */ - //data_size_t *device_data_indices_; - std::vector device_data_indices_; - /*! \brief GPU memory object holding counters for workgroup coordination */ - //int *sync_counters_; - std::vector sync_counters_; - /*! \brief GPU memory object holding temporary sub-histograms per workgroup */ - //char *device_subhistograms_; - std::vector device_subhistograms_; - /*! \brief Host memory object for histogram output (GPU will write to Host memory directly) */ - // FIXME: is this cuda mapped - //void *device_histogram_outputs_; - std::vector device_histogram_outputs_; - /*! \brief Host memory pointer for histogram outputs */ - void *host_histogram_outputs_; - /*! 
\LGBM_CUDA: CUDA waitlist object for waiting for data transfer before kernel execution */ - //cudaEvent_t kernel_wait_obj_; - std::vector kernel_wait_obj_; - /*! \LGBM_CUDA: CUDA waitlist object for reading output histograms after kernel execution */ - //cudaEvent_t histograms_wait_obj_; - std::vector histograms_wait_obj_; - /*! \LGBM_CUDA: CUDA Asynchronous waiting object for copying indices */ - //cudaEvent_t indices_future_; - std::vector indices_future_; - /*! \LGBM_CUDA: Asynchronous waiting object for copying gradients */ - //cudaEvent_t gradients_future_; - std::vector gradients_future_; - /*! \LGBM_CUDA: Asynchronous waiting object for copying hessians */ - //cudaEvent_t hessians_future_; - std::vector hessians_future_; - // LGBM_CUDA:\brief Asynchronous waiting object for copying dense features - //cudaEvent_t features_future_; - std::vector features_future_; + /*! \brief total number of feature-groups */ + int num_feature_groups_; + /*! \brief total number of dense feature-groups, which will be processed on GPU */ + int num_dense_feature_groups_; + std::vector num_gpu_feature_groups_; // LGBM_CUDA + std::vector offset_gpu_feature_groups_; // LGBM_CUDA + /*! \brief On GPU we read one DWORD (4-byte) of features of one example once. + * With bin size > 16, there are 4 features per DWORD. + * With bin size <=16, there are 8 features per DWORD. + */ + int dword_features_; + /*! \brief total number of dense feature-group tuples on GPU. + * Each feature tuple is 4-byte (4 features if each feature takes a byte) */ + // int num_dense_feature4_; + /*! \brief Max number of bins of training data, used to determine + * which GPU kernel to use */ + int max_num_bin_; + /*! \brief Used GPU kernel bin size (64, 256) */ + int histogram_size_; + int device_bin_size_; + /*! \brief Size of histogram bin entry, depending if single or double precision is used */ + size_t hist_bin_entry_sz_; + /*! \brief Indices of all dense feature-groups */ + std::vector dense_feature_group_map_; + /*! \brief Indices of all sparse feature-groups */ + std::vector sparse_feature_group_map_; + /*! \brief Multipliers of all dense feature-groups, used for redistributing bins */ + //std::vector device_bin_mults_; + /*! \brief GPU memory object holding the training data */ + //uint8_t *device_features_; + std::vector device_features_; + /*! \brief GPU memory object holding the ordered gradient */ + //score_t *device_gradients_; + std::vector device_gradients_; + /*! \brief Pointer to pinned memory of ordered gradient */ + void * ptr_pinned_gradients_ = nullptr; + /*! \brief GPU memory object holding the ordered hessian */ + // score_t *device_hessians_; + std::vector device_hessians_; + /*! \brief Pointer to pinned memory of ordered hessian */ + void * ptr_pinned_hessians_ = nullptr; + /*! \brief A vector of feature mask. 1 = feature used, 0 = feature not used */ + // std::vector> feature_masks_; + std::vector feature_masks_; + /*! \brief GPU memory object holding the feature masks */ + //void *device_feature_masks_; + std::vector device_feature_masks_; + /*! \brief Pointer to pinned memory of feature masks */ + char* ptr_pinned_feature_masks_ = nullptr; + /*! \brief GPU memory object holding indices of the leaf being processed */ + // data_size_t *device_data_indices_; + std::vector device_data_indices_; + /*! \brief GPU memory object holding counters for workgroup coordination */ + // int *sync_counters_; + std::vector sync_counters_; + /*! 
\brief GPU memory object holding temporary sub-histograms per workgroup */ + // char *device_subhistograms_; + std::vector device_subhistograms_; + /*! \brief Host memory object for histogram output (GPU will write to Host memory directly) */ + // FIXME: is this cuda mapped + // void *device_histogram_outputs_; + std::vector device_histogram_outputs_; + /*! \brief Host memory pointer for histogram outputs */ + void *host_histogram_outputs_; + /*! \LGBM_CUDA: CUDA waitlist object for waiting for data transfer before kernel execution */ + // cudaEvent_t kernel_wait_obj_; + std::vector kernel_wait_obj_; + /*! \LGBM_CUDA: CUDA waitlist object for reading output histograms after kernel execution */ + // cudaEvent_t histograms_wait_obj_; + std::vector histograms_wait_obj_; + /*! \LGBM_CUDA: CUDA Asynchronous waiting object for copying indices */ + // cudaEvent_t indices_future_; + std::vector indices_future_; + /*! \LGBM_CUDA: Asynchronous waiting object for copying gradients */ + // cudaEvent_t gradients_future_; + std::vector gradients_future_; + /*! \LGBM_CUDA: Asynchronous waiting object for copying hessians */ + // cudaEvent_t hessians_future_; + std::vector hessians_future_; + // LGBM_CUDA:\brief Asynchronous waiting object for copying dense features + // cudaEvent_t features_future_; + std::vector features_future_; - // LGBM_CUDA: use subset of training data for bagging - bool is_use_subset_; + // LGBM_CUDA: use subset of training data for bagging + bool is_use_subset_; - // LGBM_CUDA: host-side buffer for converting feature data into featre4 data - //std::vector host_vecs_; - int nthreads_; // number of Feature4* vector on host4_vecs_ - //cudaEvent_t kernel_start_; // event for kernel start - std::vector kernel_start_; - std::vector kernel_time_; // measure histogram kernel time - std::vector> kernel_input_wait_time_; - int num_gpu_; - int allocated_num_data_; // allocated data instances - pthread_t **cpu_threads_; // pthread, 1 cpu thread / gpu + // LGBM_CUDA: host-side buffer for converting feature data into featre4 data + // std::vector host_vecs_; + int nthreads_; // number of Feature4* vector on host4_vecs_ + // cudaEvent_t kernel_start_; // event for kernel start + std::vector kernel_start_; + std::vector kernel_time_; // measure histogram kernel time + std::vector> kernel_input_wait_time_; + int num_gpu_; + int allocated_num_data_; // allocated data instances + pthread_t **cpu_threads_; // pthread, 1 cpu thread / gpu }; - } // namespace LightGBM #else // USE_CUDA @@ -297,15 +293,15 @@ class CUDATreeLearner: public SerialTreeLearner { namespace LightGBM { class CUDATreeLearner: public SerialTreeLearner { -public: - #pragma warning(disable : 4702) - explicit CUDATreeLearner(const Config* tree_config) : SerialTreeLearner(tree_config) { - Log::Fatal("CUDA Tree Learner was not enabled in this build.\n" - "Please recompile with CMake option -DUSE_CUDA=1"); - } + public: + #pragma warning(disable : 4702) + explicit CUDATreeLearner(const Config* tree_config) : SerialTreeLearner(tree_config) { + Log::Fatal("CUDA Tree Learner was not enabled in this build.\n" + "Please recompile with CMake option -DUSE_CUDA=1"); + } }; } -#endif //USE_CUDA +#endif // USE_CUDA #endif // LGBM_TREELEARNER_CUDA_TREE_LEARNER_H_ diff --git a/src/treelearner/data_parallel_tree_learner.cpp b/src/treelearner/data_parallel_tree_learner.cpp index 0624bb96249..31425c77cd3 100644 --- a/src/treelearner/data_parallel_tree_learner.cpp +++ b/src/treelearner/data_parallel_tree_learner.cpp @@ -20,9 +20,9 @@ 
DataParallelTreeLearner::~DataParallelTreeLearner() { } template -void DataParallelTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) { //LGBM_CUDA +void DataParallelTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) { // LGBM_CUDA // initialize SerialTreeLearner - TREELEARNER_T::Init(train_data, is_constant_hessian, is_use_subset); //LGBM_CUDA + TREELEARNER_T::Init(train_data, is_constant_hessian, is_use_subset); // LGBM_CUDA // Get local rank and global machine size rank_ = Network::rank(); num_machines_ = Network::num_machines(); @@ -256,7 +256,7 @@ void DataParallelTreeLearner::Split(Tree* tree, int best_Leaf, in } // instantiate template classes, otherwise linker cannot find the code -template class DataParallelTreeLearner; // LGBM_CUDA +template class DataParallelTreeLearner; // LGBM_CUDA template class DataParallelTreeLearner; template class DataParallelTreeLearner; diff --git a/src/treelearner/feature_parallel_tree_learner.cpp b/src/treelearner/feature_parallel_tree_learner.cpp index 5cf660ab9c9..5a820328ddb 100644 --- a/src/treelearner/feature_parallel_tree_learner.cpp +++ b/src/treelearner/feature_parallel_tree_learner.cpp @@ -19,9 +19,9 @@ template FeatureParallelTreeLearner::~FeatureParallelTreeLearner() { } -template //LGBM_CUDA +template // LGBM_CUDA void FeatureParallelTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) { - TREELEARNER_T::Init(train_data, is_constant_hessian, is_use_subset); //LGBM_CUDA + TREELEARNER_T::Init(train_data, is_constant_hessian, is_use_subset); // LGBM_CUDA rank_ = Network::rank(); num_machines_ = Network::num_machines(); @@ -77,7 +77,7 @@ void FeatureParallelTreeLearner::FindBestSplitsFromHistograms( } // instantiate template classes, otherwise linker cannot find the code -template class FeatureParallelTreeLearner; // LGBM_CUDA +template class FeatureParallelTreeLearner; // LGBM_CUDA template class FeatureParallelTreeLearner; template class FeatureParallelTreeLearner; } // namespace LightGBM diff --git a/src/treelearner/gpu_tree_learner.cpp b/src/treelearner/gpu_tree_learner.cpp index fad02e1c044..7fb2a340a65 100644 --- a/src/treelearner/gpu_tree_learner.cpp +++ b/src/treelearner/gpu_tree_learner.cpp @@ -735,7 +735,7 @@ void GPUTreeLearner::InitGPU(int platform_id, int device_id) { } Tree* GPUTreeLearner::Train(const score_t* gradients, const score_t *hessians, - bool is_constant_hessian, Json& forced_split_json) { + bool is_constant_hessian, const Json& forced_split_json) { return SerialTreeLearner::Train(gradients, hessians, is_constant_hessian, forced_split_json); } diff --git a/src/treelearner/gpu_tree_learner.h b/src/treelearner/gpu_tree_learner.h index 598e8d40ac9..2ed29bcd1f7 100644 --- a/src/treelearner/gpu_tree_learner.h +++ b/src/treelearner/gpu_tree_learner.h @@ -49,7 +49,7 @@ class GPUTreeLearner: public SerialTreeLearner { void ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) override; void ResetIsConstantHessian(bool is_constant_hessian) override; Tree* Train(const score_t* gradients, const score_t *hessians, - bool is_constant_hessian, Json& forced_split_json) override; + bool is_constant_hessian, const Json& forced_split_json) override; void SetBaggingData(const Dataset* subset, const data_size_t* used_indices, data_size_t num_data) override { SerialTreeLearner::SetBaggingData(subset, used_indices, num_data); diff --git 
a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index a0780f913c9..7831159160b 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -1,8 +1,8 @@ -/* +/*! * ibmGBT: IBM CUDA Accelerated LightGBM * * IBM Confidential - * (C) Copyright IBM Corp. 2019. All Rights Reserved. + * Copyright (c) 2019 IBM Corporation. All rights reserved. * * The source code for this program is not published or otherwise * divested of its trade secrets, irrespective of what has been @@ -15,7 +15,7 @@ #include "histogram_16_64_256.hu" #include "stdio.h" -#define PRINT(b,t,fmt,...) \ +#define PRINT(b, t, fmt, ...) \ if (b == gtid && t == ltid) { \ printf(fmt, __VA_ARGS__); \ } @@ -132,7 +132,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, __shared__ float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)]; const uint gtid = blockIdx.x * blockDim.x + threadIdx.x; const ushort ltid = threadIdx.x; - const ushort lsize = NUM_BINS; // get_local_size(0); + const ushort lsize = NUM_BINS; // get_local_size(0); const ushort group_id = blockIdx.x; // local memory per workgroup is 3 KB @@ -185,7 +185,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, if (!feature_mask) { return; } else { - feature_mask = feature_mask - 1; // LGBM_CUDA: feature_mask is used for get feature (1: 4bit feature, 0: 8bit feature) + feature_mask = feature_mask - 1; // LGBM_CUDA: feature_mask is used for get feature (1: 4bit feature, 0: 8bit feature) } // STAGE 1: read feature data, and gradient and hessian @@ -291,7 +291,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #if CONST_HESSIAN == 1 // make a final reduction gh_hist[ltid * 2] += gh_hist[ltid * 2 + 1]; - gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // LGBM_CUDA: counter move to this position + gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // LGBM_CUDA: counter move to this position __syncthreads(); #endif @@ -358,7 +358,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // only 1 work group, no need to increase counter // the reduction will become a simple copy if (1) { - uint old_val; // dummy + uint old_val; // dummy #endif // locate our feature's block in output memory uint output_offset = (feature_id << power_feature_workgroups); @@ -486,7 +486,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, __shared__ float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)]; const uint gtid = blockIdx.x * blockDim.x + threadIdx.x; const ushort ltid = threadIdx.x; - const ushort lsize = NUM_BINS; // get_local_size(0); + const ushort lsize = NUM_BINS; // get_local_size(0); const ushort group_id = blockIdx.x; // local memory per workgroup is 3 KB @@ -539,7 +539,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, if (!feature_mask) { return; } else { - feature_mask = feature_mask - 1; // LGBM_CUDA: feature_mask is used for get feature (1: 4bit feature, 0: 8bit feature) + feature_mask = feature_mask - 1; // LGBM_CUDA: feature_mask is used for get feature (1: 4bit feature, 0: 8bit feature) } // STAGE 1: read feature data, and gradient and hessian @@ -645,7 +645,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #if CONST_HESSIAN == 1 // make a final reduction gh_hist[ltid * 2] += gh_hist[ltid * 2 + 1]; - gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // LGBM_CUDA: counter move to this position + gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // LGBM_CUDA: counter move to 
this position __syncthreads(); #endif @@ -712,7 +712,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // only 1 work group, no need to increase counter // the reduction will become a simple copy if (1) { - uint old_val; // dummy + uint old_val; // dummy #endif // locate our feature's block in output memory uint output_offset = (feature_id << power_feature_workgroups); @@ -841,7 +841,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, __shared__ float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)]; const uint gtid = blockIdx.x * blockDim.x + threadIdx.x; const ushort ltid = threadIdx.x; - const ushort lsize = NUM_BINS; // get_local_size(0); + const ushort lsize = NUM_BINS; // get_local_size(0); const ushort group_id = blockIdx.x; // local memory per workgroup is 3 KB @@ -893,7 +893,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, if (!feature_mask) { return; } else { - feature_mask = feature_mask - 1; // LGBM_CUDA: feature_mask is used for get feature (1: 4bit feature, 0: 8bit feature) + feature_mask = feature_mask - 1; // LGBM_CUDA: feature_mask is used for get feature (1: 4bit feature, 0: 8bit feature) } // STAGE 1: read feature data, and gradient and hessian @@ -905,7 +905,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, ushort bin; feature = feature_data[ind >> feature_mask]; - if (feature_mask) { + if (feature_mask) { feature = (feature >> ((ind & 1) << 2)) & 0xf; } bin = feature; @@ -997,7 +997,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #if CONST_HESSIAN == 1 // make a final reduction gh_hist[ltid * 2] += gh_hist[ltid * 2 + 1]; - gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // LGBM_CUDA: counter move to this position + gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // LGBM_CUDA: counter move to this position __syncthreads(); #endif @@ -1064,7 +1064,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // only 1 work group, no need to increase counter // the reduction will become a simple copy if (1) { - uint old_val; // dummy + uint old_val; // dummy #endif // locate our feature's block in output memory uint output_offset = (feature_id << power_feature_workgroups); diff --git a/src/treelearner/kernels/histogram_16_64_256.hu b/src/treelearner/kernels/histogram_16_64_256.hu index 1a875588cc4..4dfcb9f7895 100644 --- a/src/treelearner/kernels/histogram_16_64_256.hu +++ b/src/treelearner/kernels/histogram_16_64_256.hu @@ -1,8 +1,8 @@ -/* +/*! * ibmGBT: IBM CUDA Accelerated LightGBM * * IBM Confidential - * (C) Copyright IBM Corp. 2019. All Rights Reserved. + * Copyright (c) 2019 IBM Corporation. All rights reserved. 
* * The source code for this program is not published or otherwise * divested of its trade secrets, irrespective of what has been diff --git a/src/treelearner/parallel_tree_learner.h b/src/treelearner/parallel_tree_learner.h index 2fdf542d421..222955a3c94 100644 --- a/src/treelearner/parallel_tree_learner.h +++ b/src/treelearner/parallel_tree_learner.h @@ -28,7 +28,7 @@ class FeatureParallelTreeLearner: public TREELEARNER_T { public: explicit FeatureParallelTreeLearner(const Config* config); ~FeatureParallelTreeLearner(); - void Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) override; // LGBM_CUDA + void Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) override; // LGBM_CUDA protected: void BeforeTrain() override; @@ -55,7 +55,7 @@ class DataParallelTreeLearner: public TREELEARNER_T { public: explicit DataParallelTreeLearner(const Config* config); ~DataParallelTreeLearner(); - void Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) override; // LGBM_CUDA + void Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) override; // LGBM_CUDA void ResetConfig(const Config* config) override; protected: @@ -109,7 +109,7 @@ class VotingParallelTreeLearner: public TREELEARNER_T { public: explicit VotingParallelTreeLearner(const Config* config); ~VotingParallelTreeLearner() { } - void Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) override; //LGBM_CUDA + void Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) override; // LGBM_CUDA void ResetConfig(const Config* config) override; protected: diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index e5b6626a6bd..3fdcbca23a0 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -25,9 +25,9 @@ SerialTreeLearner::SerialTreeLearner(const Config* config) SerialTreeLearner::~SerialTreeLearner() { } -//LGBM_CUDA +// LGBM_CUDA void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) { - (void)is_use_subset; // UNUSED + (void)is_use_subset; // UNUSED train_data_ = train_data; num_data_ = train_data_->num_data(); num_features_ = train_data_->num_features(); @@ -150,7 +150,7 @@ void SerialTreeLearner::ResetConfig(const Config* config) { constraints_.reset(LeafConstraintsBase::Create(config_, config_->num_leaves)); } -Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians, bool is_constant_hessian, Json& forced_split_json) { +Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians, bool is_constant_hessian, const Json& forced_split_json) { Common::FunctionTimer fun_timer("SerialTreeLearner::Train", global_timer); gradients_ = gradients; hessians_ = hessians; @@ -441,7 +441,7 @@ void SerialTreeLearner::FindBestSplitsFromHistograms( } } -int32_t SerialTreeLearner::ForceSplits(Tree* tree, Json& forced_split_json, int* left_leaf, +int32_t SerialTreeLearner::ForceSplits(Tree* tree, const Json& forced_split_json, int* left_leaf, int* right_leaf, int *cur_depth, bool *aborted_last_force_split) { (void)aborted_last_force_split; diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 367c262192c..23fc75659ad 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -80,7 +80,7 @@ class SerialTreeLearner: public TreeLearner { } Tree* Train(const 
score_t* gradients, const score_t *hessians, bool is_constant_hessian, - Json& forced_split_json) override; + const Json& forced_split_json) override; Tree* FitByExistingTree(const Tree* old_tree, const score_t* gradients, const score_t* hessians) const override; @@ -163,7 +163,7 @@ class SerialTreeLearner: public TreeLearner { bool update_cnt); /* Force splits with forced_split_json dict and then return num splits forced.*/ - virtual int32_t ForceSplits(Tree* tree, Json& forced_split_json, int* left_leaf, + virtual int32_t ForceSplits(Tree* tree, const Json& forced_split_json, int* left_leaf, int* right_leaf, int* cur_depth, bool *aborted_last_force_split); @@ -209,11 +209,11 @@ class SerialTreeLearner: public TreeLearner { std::vector> ordered_gradients_; /*! \brief hessians of current iteration, ordered for cache optimized, aligned to 4K page */ std::vector> ordered_hessians_; -#elif USE_CUDA //LGBM_CUDA +#elif USE_CUDA // LGBM_CUDA /*! \brief gradients of current iteration, ordered for cache optimized */ - std::vector> ordered_gradients_; + std::vector> ordered_gradients_; /*! \brief hessians of current iteration, ordered for cache optimized */ - std::vector> ordered_hessians_; + std::vector> ordered_hessians_; #else /*! \brief gradients of current iteration, ordered for cache optimized */ std::vector> ordered_gradients_; diff --git a/src/treelearner/tree_learner.cpp b/src/treelearner/tree_learner.cpp index df7231e91df..d47b469f950 100644 --- a/src/treelearner/tree_learner.cpp +++ b/src/treelearner/tree_learner.cpp @@ -5,7 +5,7 @@ #include #include "gpu_tree_learner.h" -#include "cuda_tree_learner.h" // LGBM_CUDA +#include "cuda_tree_learner.h" // LGBM_CUDA #include "parallel_tree_learner.h" #include "serial_tree_learner.h" @@ -32,7 +32,7 @@ TreeLearner* TreeLearner::CreateTreeLearner(const std::string& learner_type, con } else if (learner_type == std::string("voting")) { return new VotingParallelTreeLearner(config); } - } else if (device_type == std::string("cuda")) { // LGBM_CUDA + } else if (device_type == std::string("cuda")) { // LGBM_CUDA if (learner_type == std::string("serial")) { return new CUDATreeLearner(config); } else if (learner_type == std::string("feature")) { diff --git a/src/treelearner/voting_parallel_tree_learner.cpp b/src/treelearner/voting_parallel_tree_learner.cpp index 58f5b88d6b0..4b120975c26 100644 --- a/src/treelearner/voting_parallel_tree_learner.cpp +++ b/src/treelearner/voting_parallel_tree_learner.cpp @@ -19,8 +19,8 @@ VotingParallelTreeLearner::VotingParallelTreeLearner(const Config } template -void VotingParallelTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) { // LGBM_CUDA - TREELEARNER_T::Init(train_data, is_constant_hessian, is_use_subset); // LGBM_CUDA +void VotingParallelTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) { // LGBM_CUDA + TREELEARNER_T::Init(train_data, is_constant_hessian, is_use_subset); // LGBM_CUDA rank_ = Network::rank(); num_machines_ = Network::num_machines(); @@ -454,7 +454,7 @@ void VotingParallelTreeLearner::Split(Tree* tree, int best_Leaf, } // instantiate template classes, otherwise linker cannot find the code -template class VotingParallelTreeLearner; // LGBM_CUDA +template class VotingParallelTreeLearner; // LGBM_CUDA template class VotingParallelTreeLearner; template class VotingParallelTreeLearner; } // namespace LightGBM From bdcbeaa03563e330eb3935df2fc255134629bddc Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 15 Jun 2020 
23:18:58 +0000 Subject: [PATCH 073/119] Another large lint cleanup - more to come. --- include/LightGBM/cuda/cuda_utils.h | 2 +- include/LightGBM/cuda/vector_cudahost.h | 16 +- src/boosting/gbdt.cpp | 51 ++--- src/boosting/gbdt.h | 4 +- src/c_api.cpp | 2 +- src/io/config_auto.cpp | 3 +- src/io/dataset.cpp | 24 +-- src/io/dense_bin.hpp | 12 +- src/treelearner/cuda_kernel_launcher.cu | 185 +++++++++--------- src/treelearner/cuda_tree_learner.cpp | 64 +++--- src/treelearner/cuda_tree_learner.h | 36 ++-- .../feature_parallel_tree_learner.cpp | 2 +- src/treelearner/serial_tree_learner.cpp | 9 +- src/treelearner/serial_tree_learner.h | 2 +- 14 files changed, 195 insertions(+), 217 deletions(-) diff --git a/include/LightGBM/cuda/cuda_utils.h b/include/LightGBM/cuda/cuda_utils.h index 7ff7b28c8f1..2fb45384f0c 100644 --- a/include/LightGBM/cuda/cuda_utils.h +++ b/include/LightGBM/cuda/cuda_utils.h @@ -14,7 +14,7 @@ #include #define CUDASUCCESS_OR_FATAL(ans) { gpuAssert((ans), __FILE__, __LINE__); } -inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) { +inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) { if (code != cudaSuccess) { LightGBM::Log::Fatal("CUDA_RUNTIME: %s %s %d\n", cudaGetErrorString(code), file, line); if (abort) exit(code); diff --git a/include/LightGBM/cuda/vector_cudahost.h b/include/LightGBM/cuda/vector_cudahost.h index b964fa4ad1f..41a27c349bd 100644 --- a/include/LightGBM/cuda/vector_cudahost.h +++ b/include/LightGBM/cuda/vector_cudahost.h @@ -22,9 +22,9 @@ namespace LightGBM { #define use_cuda_learner 2 class LGBM_config_ { - public: - static int current_device; // Default: lgbm_device_cpu - static int current_learner; // Default: use_cpu_learner + public: + static int current_device; // Default: lgbm_device_cpu + static int current_learner; // Default: use_cpu_learner }; } // namespace LightGBM @@ -48,9 +48,9 @@ struct CHAllocator { } else { ptr = reinterpret_cast(malloc(n*sizeof(T))); } - #else + #else ptr = reinterpret_cast(malloc(n*sizeof(T))); - #endif + #endif return ptr; } @@ -60,17 +60,17 @@ struct CHAllocator { #ifdef USE_CUDA if (LightGBM::LGBM_config_::current_device == lgbm_device_cuda) { cudaPointerAttributes attributes; - cudaPointerGetAttributes (&attributes, p); + cudaPointerGetAttributes(&attributes, p); if ((attributes.type == cudaMemoryTypeHost) && (attributes.devicePointer != NULL)) { cudaFreeHost(p); } - } else { + } else { free(p); } #else free(p); #endif - } + } }; template bool operator==(const CHAllocator&, const CHAllocator&); diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index d102f6aedc1..baeacc6beb9 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -128,8 +128,7 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective if (config_->device_type == std::string("cuda")) { if (is_use_subset_) { tree_learner_->Init(tmp_subset_.get(), is_constant_hessian_, is_use_subset_); - } - else { + } else { tree_learner_->Init(train_data_, is_constant_hessian_, is_use_subset_); } } else { @@ -262,7 +261,7 @@ void GBDT::Bagging(int iter) { tree_learner_->SetBaggingData(nullptr, bag_data_indices_.data(), bag_data_cnt_); } else { // LGBM_CUDA // NEW get subset - bool resized= tmp_subset_->ReSize(bag_data_cnt_); + bool resized = tmp_subset_->ReSize(bag_data_cnt_); if (resized && (config_->device_type == std::string("cuda"))) { size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; @@ -284,7 +283,7 @@ void GBDT::Train(int 
snapshot_freq, const std::string& model_output_path) { Common::FunctionTimer fun_timer("GBDT::Train", global_timer); bool is_finished = false; - // LGBM_CUDA + // LGBM_CUDA auto start_time = std::chrono::steady_clock::now(); for (int iter = 0; iter < config_->num_iterations && !is_finished; ++iter) { @@ -379,13 +378,11 @@ double GBDT::BoostFromAverage(int class_id, bool update_scorer) { // LGBM_CUDA bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { + // LGBM_CUDA invoke bagging during the first iteration + if (config_->device_type == std::string("cuda") && (iter_ == 0)) { + // auto start_time = std::chrono::steady_clock::now(); - // LGBM_CUDA invoke bagging during the first iteration - if (config_->device_type == std::string("cuda") && (iter_ == 0)) { - -// auto start_time = std::chrono::steady_clock::now(); - - Bagging(iter_); + Bagging(iter_); } std::vector init_scores(num_tree_per_iteration_, 0.0); @@ -397,7 +394,7 @@ bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { } // LGBM_CUDA -// auto start_time = std::chrono::steady_clock::now(); + // auto start_time = std::chrono::steady_clock::now(); Boosting(); @@ -406,11 +403,10 @@ bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { } // LGBM_CUDA bagging logic - // Bagging(iter_); + // Bagging(iter_); bool should_continue = false; for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { - // LGBM_CUDA // auto start_time = std::chrono::steady_clock::now(); @@ -418,7 +414,6 @@ bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { std::unique_ptr new_tree(new Tree(2)); if (class_need_train_[cur_tree_id] && train_data_->num_features() > 0) { - auto grad = gradients + offset; auto hess = hessians + offset; @@ -434,8 +429,7 @@ bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { // need to copy gradients for bagging subset. 
if (is_use_subset_ && bag_data_cnt_ < num_data_) { - - #pragma omp parallel for schedule(static) // LGBM_CUDA + #pragma omp parallel for schedule(static) // LGBM_CUDA for (int i = 0; i < bag_data_cnt_; ++i) { tmp_grad[i] = grad[bag_data_indices_[i]]; // LGBM_CUDA tmp_hess[i] = hess[bag_data_indices_[i]]; // LGBM_CUDA @@ -482,11 +476,10 @@ bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { int iter_next = iter_ + 1; if (iter_next < config_->num_iterations) { -// auto start_time = std::chrono::steady_clock::now(); + // auto start_time = std::chrono::steady_clock::now(); // bagging logic Bagging(iter_next); - } } // add model @@ -508,7 +501,6 @@ bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { } bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { - if (config_->device_type == std::string("cuda")) { // LGBM_CUDA return TrainOneIterCUDA(gradients, hessians); } @@ -966,17 +958,16 @@ void GBDT::ResetBaggingConfig(const Config* config, bool is_change_dataset) { } else { bag_data_cnt_ = num_data_; if (config_->device_type == std::string("cuda")) { // LGBM_CUDA - if (tmp_subset_ == nullptr){ - tmp_subset_.reset(new Dataset(bag_data_cnt_)); - tmp_subset_->CopyFeatureMapperFrom(train_data_); - size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; - tmp_gradients_.resize(total_size); - tmp_hessians_.resize(total_size); - is_use_subset_ = false; - bag_data_indices_.clear(); - } - } - else { + if (tmp_subset_ == nullptr){ + tmp_subset_.reset(new Dataset(bag_data_cnt_)); + tmp_subset_->CopyFeatureMapperFrom(train_data_); + size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; + tmp_gradients_.resize(total_size); + tmp_hessians_.resize(total_size); + is_use_subset_ = false; + bag_data_indices_.clear(); + } + } else { bag_data_indices_.clear(); bagging_runner_.ReSize(0); is_use_subset_ = false; diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index 99bf64a6fb0..d22b6687766 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -478,8 +478,8 @@ class GBDT : public GBDTBase { #ifdef USE_CUDA /*! \brief First order derivative of training data */ - std::vector> gradients_; // LGBM_CUDA - std::vector> tmp_gradients_; // LGBM_CUDA + std::vector> gradients_; // LGBM_CUDA + std::vector> tmp_gradients_; // LGBM_CUDA /*! 
\brief Second order derivative of training data */ std::vector> hessians_; // LGBM_CUDA std::vector> tmp_hessians_; // LGBM_CUDA diff --git a/src/c_api.cpp b/src/c_api.cpp index 0ce92342fb6..6cdebc34aed 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -132,7 +132,7 @@ class Booster { #ifdef USE_CUDA // Only use CUDA when the data is large (2048 == 256 bins each with at least 8 elements) - if (train_data->num_data() < 2048){ + if (train_data->num_data() < 2048) { config_.device_type = std::string("cpu"); } #endif diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index b0cd57deb69..ba9c07cb547 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -618,9 +618,8 @@ void Config::GetMembersFromString(const std::unordered_map 0); + CHECK_GT(num_gpu, 0); #endif - } std::string Config::SaveMembersToString() const { diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index df18ef6f838..416d66695a3 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -239,14 +239,14 @@ std::vector> FindGroups( } std::vector> FastFeatureBundling(const std::vector>& bin_mappers, - int** sample_indices, - double** sample_values, + int** sample_indices, + double** sample_values, const int* num_per_col, - int num_sample_col, + int num_sample_col, data_size_t total_sample_cnt, - const std::vector& used_features, + const std::vector& used_features, data_size_t num_data, - bool is_sparse, + bool is_sparse, std::vector* multi_val_group, bool is_use_gpu) { Common::FunctionTimer fun_timer("Dataset::FastFeatureBundling", global_timer); @@ -355,15 +355,15 @@ void Dataset::Construct(std::vector>* bin_mappers, std::vector group_is_multi_val(used_features.size(), 0); if (io_config.enable_bundle && !used_features.empty()) { bool lgbm_is_gpu_used = io_config.device_type == std::string("gpu") || io_config.device_type == std::string("cuda"); // LGBM_CUDA - features_in_group = FastFeatureBundling(*bin_mappers, - sample_non_zero_indices, - sample_values, + features_in_group = FastFeatureBundling(*bin_mappers, + sample_non_zero_indices, + sample_values, num_per_col, - num_sample_col, + num_sample_col, static_cast(total_sample_cnt), - used_features, - num_data_, - io_config.is_enable_sparse, + used_features, + num_data_, + io_config.is_enable_sparse, &group_is_multi_val, lgbm_is_gpu_used); } diff --git a/src/io/dense_bin.hpp b/src/io/dense_bin.hpp index 89f29a99bdc..0eb37043842 100644 --- a/src/io/dense_bin.hpp +++ b/src/io/dense_bin.hpp @@ -464,16 +464,16 @@ class DenseBin : public Bin { DenseBin* Clone() override; private: - data_size_t num_data_; + data_size_t num_data_; #ifdef USE_CUDA - std::vector> data_; // LGBM_CUDA + std::vector> data_; // LGBM_CUDA #else - std::vector> data_; + std::vector> data_; #endif - std::vector buf_; + std::vector buf_; - DenseBin(const DenseBin& other) - : num_data_(other.num_data_), data_(other.data_) {} + DenseBin(const DenseBin& other) + : num_data_(other.num_data_), data_(other.data_) {} }; template diff --git a/src/treelearner/cuda_kernel_launcher.cu b/src/treelearner/cuda_kernel_launcher.cu index 6e3149dae06..f8c3effeb6c 100644 --- a/src/treelearner/cuda_kernel_launcher.cu +++ b/src/treelearner/cuda_kernel_launcher.cu @@ -1,13 +1,17 @@ - #ifdef USE_CUDA - - #include "cuda_kernel_launcher.h" - #include - #include - #include - - using namespace LightGBM; - - void cuda_histogram( +/*! + * Copyright (c) 2020 IBM Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. 
+ */ +#ifdef USE_CUDA + +#include "cuda_kernel_launcher.h" +#include +#include +#include + +using namespace LightGBM; + +void cuda_histogram( int histogram_size, data_size_t leaf_num_data, data_size_t num_data, @@ -28,147 +32,136 @@ void* arg9, size_t exp_workgroups_per_feature) { - if (histogram_size == 16) { - if (leaf_num_data == num_data) { - if (use_all_features) { - if (!is_constant_hessian) - histogram16<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, + if (histogram_size == 16) { + if (leaf_num_data == num_data) { + if (use_all_features) { + if (!is_constant_hessian) + histogram16<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram16<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, + else + histogram16<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } - else { - if (!is_constant_hessian) - histogram16_fulldata<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, + } else { + if (!is_constant_hessian) + histogram16_fulldata<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram16_fulldata<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, + else + histogram16_fulldata<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } } - else { - if (use_all_features) { - // seems all features is always enabled, so this should be the same as fulldata - if (!is_constant_hessian) - histogram16<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, + } else { + if (use_all_features) { + // seems all features is always enabled, so this should be the same as fulldata + if (!is_constant_hessian) + histogram16<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram16<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, + else + histogram16<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } - else { - if (!is_constant_hessian) - histogram16<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, + } else { + if (!is_constant_hessian) + histogram16<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else + else histogram16<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } } - } - else if (histogram_size == 64) { - if (leaf_num_data == num_data) { - if (use_all_features) { - if (!is_constant_hessian) - histogram64<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, + } + } else if (histogram_size == 64) { + if (leaf_num_data == num_data) { + if (use_all_features) { + if (!is_constant_hessian) + histogram64<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - 
histogram64<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, + else + histogram64<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } - else { - if (!is_constant_hessian) - histogram64_fulldata<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, + } else { + if (!is_constant_hessian) + histogram64_fulldata<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram64_fulldata<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, + else + histogram64_fulldata<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } } - else { - if (use_all_features) { - // seems all features is always enabled, so this should be the same as fulldata - if (!is_constant_hessian) - histogram64<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, + } else { + if (use_all_features) { + // seems all features is always enabled, so this should be the same as fulldata + if (!is_constant_hessian) + histogram64<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram64<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, + else + histogram64<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } - else { - if (!is_constant_hessian) - histogram64<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, + } else { + if (!is_constant_hessian) + histogram64<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram64<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, + else + histogram64<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } } - } - else { - if (leaf_num_data == num_data) { - if (use_all_features) { - if (!is_constant_hessian) - histogram256<<>>( arg0, arg1, arg2, + } + } else { + if (leaf_num_data == num_data) { + if (use_all_features) { + if (!is_constant_hessian) + histogram256<<>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram256<<>>( arg0, arg1, arg2, + else + histogram256<<>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } - else { - if (!is_constant_hessian) - histogram256_fulldata<<>>( arg0, arg1, arg2, + } else { + if (!is_constant_hessian) + histogram256_fulldata<<>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram256_fulldata<<>>( arg0, arg1, arg2, + else + histogram256_fulldata<<>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } } - else { - if (use_all_features) { - // seems all features is always enabled, so this should be the same as fulldata - if (!is_constant_hessian) 
- histogram256<<>>( arg0, arg1, arg2, + } else { + if (use_all_features) { + // seems all features is always enabled, so this should be the same as fulldata + if (!is_constant_hessian) + histogram256<<>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram256<<>>( arg0, arg1, arg2, + else + histogram256<<>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } - else { - if (!is_constant_hessian) - histogram256<<>>( arg0, arg1, arg2, + } else { + if (!is_constant_hessian) + histogram256<<>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram256<<>>( arg0, arg1, arg2, + else + histogram256<<>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } } - } + } + } } #endif // USE_CUDA diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 0588ad14fe0..b61cddf4d3e 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -3,12 +3,13 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. */ #ifdef USE_CUDA +#include "cuda_tree_learner.h" + #include #include #include -#include "cuda_tree_learner.h" #include "../io/dense_bin.hpp" #include @@ -58,8 +59,11 @@ CUDATreeLearner::CUDATreeLearner(const Config* config) :SerialTreeLearner(config) { use_bagging_ = false; nthreads_ = 0; - if (config->gpu_use_dp && USE_DP_FLOAT) Log::Info("LightGBM-CUDA using CUDA trainer with DP float!!"); - else Log::Info("LightGBM-CUDA using CUDA trainer with SP float!!"); + if (config->gpu_use_dp && USE_DP_FLOAT) { + Log::Info("LightGBM-CUDA using CUDA trainer with DP float!!"); + } else { + Log::Info("LightGBM-CUDA using CUDA trainer with SP float!!"); + } } CUDATreeLearner::~CUDATreeLearner() { @@ -67,13 +71,11 @@ CUDATreeLearner::~CUDATreeLearner() { void CUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) { - // initialize SerialTreeLearner SerialTreeLearner::Init(train_data, is_constant_hessian, is_use_subset); // some additional variables needed for GPU trainer num_feature_groups_ = train_data_->num_feature_groups(); - // LGBM_CUDA: use subset of training data for bagging is_use_subset_ = is_use_subset; @@ -107,11 +109,11 @@ union Float_t { int CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int dp_flag, int const_flag) { int i; int retval = 0; - printf("Comparing Histograms, feature_id = %d, size = %d\n", feature_id, (int) size); + printf("Comparing Histograms, feature_id = %d, size = %d\n", feature_id, static_cast(size)); if (dp_flag) { // double precision double af, bf; - long long int ai, bi; - for (i = 0; i < (int) size; ++i) { + int64 ai, bi; + for (i = 0; i < static_cast(size); ++i) { af = GET_GRAD(h1, i); bf = GET_GRAD(h2, i); if ((((std::fabs(af - bf))/af) >= 1e-6) && ((std::fabs(af - bf)) >= 1e-6)) { @@ -119,8 +121,8 @@ int CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int d ++retval; } if (const_flag) { - ai = GET_HESS(((long long int *) h1), i); - bi = GET_HESS(((long long int *) h2), i); + ai = GET_HESS(((int64 *) h1), i); + bi = GET_HESS(((int64 *) h2), i); if (ai != bi) { printf("i = %5d, h1.hess %lld, h2.hess %lld\n", i, ai, bi); ++retval; 
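Both CompareHistograms hunks here keep the same tolerance rule: a bin's gradient sum is only flagged when the absolute difference and the relative difference each reach 1e-6. A self-contained sketch of that check, assuming plain double buffers and a hypothetical CountHistogramMismatches helper (this is not the patched function itself):

// compare_sketch.cc -- illustrative only; names and buffer layout are assumptions.
#include <cmath>
#include <cstdio>

// Count bins whose gradient sums differ by at least 1e-6 both absolutely and
// relatively, mirroring the double test used by CompareHistograms above.
int CountHistogramMismatches(const double* gpu, const double* cpu, int num_bins) {
  int mismatches = 0;
  for (int i = 0; i < num_bins; ++i) {
    const double diff = std::fabs(gpu[i] - cpu[i]);
    if (diff >= 1e-6 && diff / std::fabs(cpu[i]) >= 1e-6) {
      std::printf("bin %d: gpu=%g cpu=%g\n", i, gpu[i], cpu[i]);
      ++mismatches;
    }
  }
  return mismatches;
}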
@@ -137,7 +139,7 @@ int CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int d } else { // single precision float af, bf; int ai, bi; - for (i = 0; i < (int) size; ++i) { + for (i = 0; i < static_cast(size); ++i) { af = GET_GRAD(h1, i); bf = GET_GRAD(h2, i); if ((((std::fabs(af - bf))/af) >= 1e-6) && ((std::fabs(af - bf)) >= 1e-6)) { @@ -167,7 +169,6 @@ int CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int d #endif int CUDATreeLearner::GetNumWorkgroupsPerFeature(data_size_t leaf_num_data) { - // we roughly want 256 workgroups per device, and we have num_dense_feature4_ feature tuples. // also guarantee that there are at least 2K examples per workgroup @@ -177,9 +178,9 @@ int CUDATreeLearner::GetNumWorkgroupsPerFeature(data_size_t leaf_num_data) { double t = leaf_num_data / 1024.0; Log::Debug("We can have at most %d workgroups per feature4 for efficiency reasons" - "Best workgroup size per feature for full utilization is %d\n", (int)ceil(t), (1 << exp_workgroups_per_feature)); + "Best workgroup size per feature for full utilization is %d\n", static_cast(ceil(t)), (1 << exp_workgroups_per_feature)); - exp_workgroups_per_feature = std::min(exp_workgroups_per_feature, (int)ceil(log((double)t)/log(2.0))); + exp_workgroups_per_feature = std::min(exp_workgroups_per_feature, static_cast(ceil(log((double)t)/log(2.0)))); if (exp_workgroups_per_feature < 0) exp_workgroups_per_feature = 0; if (exp_workgroups_per_feature > kMaxLogWorkgroupsPerFeature) @@ -189,7 +190,6 @@ int CUDATreeLearner::GetNumWorkgroupsPerFeature(data_size_t leaf_num_data) { } void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_features) { - // we have already copied ordered gradients, ordered hessians and indices to GPU // decide the best number of workgroups working on one feature4 tuple // set work group size based on feature size @@ -198,7 +198,7 @@ void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featu int exp_workgroups_per_feature = GetNumWorkgroupsPerFeature(leaf_num_data); std::vector num_gpu_workgroups; - ThreadData *thread_data = (ThreadData*)malloc(sizeof(ThreadData) * num_gpu_); + ThreadData *thread_data = reinterpret_cast(malloc(sizeof(ThreadData) * num_gpu_)); for (int device_id = 0; device_id < num_gpu_; ++device_id) { int num_gpu_feature_groups = num_gpu_feature_groups_[device_id]; @@ -209,13 +209,13 @@ void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featu CUDASUCCESS_OR_FATAL(cudaFree(device_subhistograms_[device_id])); CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_subhistograms_[device_id]), (size_t) num_workgroups * dword_features_ * device_bin_size_ * (3 * hist_bin_entry_sz_ / 2))); } - //set thread_data + // set thread_data SetThreadData(thread_data, device_id, histogram_size_, leaf_num_data, use_all_features, num_workgroups, exp_workgroups_per_feature); } - + for (int device_id = 0; device_id < num_gpu_; ++device_id) { - if (pthread_create(cpu_threads_[device_id], NULL, launch_cuda_histogram, (void *)(&thread_data[device_id]))) { + if (pthread_create(cpu_threads_[device_id], NULL, launch_cuda_histogram, reinterpret_cast(&thread_data[device_id]))) { fprintf(stderr, "Error in creating threads. Exiting\n"); exit(0); } @@ -231,13 +231,12 @@ void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featu } for (int device_id = 0; device_id < num_gpu_; ++device_id) { - // copy the results asynchronously. 
Size depends on if double precision is used size_t output_size = num_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_; size_t host_output_offset = offset_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_; - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync((char*)host_histogram_outputs_ + host_output_offset, device_histogram_outputs_[device_id], output_size, cudaMemcpyDeviceToHost, stream_[device_id])); + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(reinterpret_cast(host_histogram_outputs_) + host_output_offset, device_histogram_outputs_[device_id], output_size, cudaMemcpyDeviceToHost, stream_[device_id])); CUDASUCCESS_OR_FATAL(cudaEventRecord(histograms_wait_obj_[device_id], stream_[device_id])); } } @@ -245,12 +244,11 @@ void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featu template void CUDATreeLearner::WaitAndGetHistograms(FeatureHistogram* leaf_histogram_array) { - HistType* hist_outputs = (HistType*) host_histogram_outputs_; + HistType* hist_outputs = reinterpret_cast(host_histogram_outputs_); #pragma omp parallel for schedule(static, num_gpu_) for (int device_id = 0; device_id < num_gpu_; ++device_id) { - -// auto start_time = std::chrono::steady_clock::now(); + // auto start_time = std::chrono::steady_clock::now(); // when the output is ready, the computation is done CUDASUCCESS_OR_FATAL(cudaEventSynchronize(histograms_wait_obj_[device_id])); @@ -263,9 +261,9 @@ void CUDATreeLearner::WaitAndGetHistograms(FeatureHistogram* leaf_histogram_arra continue; } int dense_group_index = dense_feature_group_map_[i]; - //auto old_histogram_array = histograms + train_data_->GroupBinBoundary(dense_group_index) * 2; + // auto old_histogram_array = histograms + train_data_->GroupBinBoundary(dense_group_index) * 2; auto old_histogram_array = leaf_histogram_array[dense_group_index].RawData() - kHistOffset; - int bin_size = train_data_->FeatureGroupNumBin(dense_group_index); + int bin_size = train_data_->FeatureGroupNumBin(dense_group_index); for (int j = 0; j < bin_size; ++j) { GET_GRAD(old_histogram_array, j) = GET_GRAD(hist_outputs, i * device_bin_size_+ j); @@ -276,7 +274,6 @@ void CUDATreeLearner::WaitAndGetHistograms(FeatureHistogram* leaf_histogram_arra // LGBM_CUDA void CUDATreeLearner::CountDenseFeatureGroups() { - num_dense_feature_groups_ = 0; for (int i = 0; i < num_feature_groups_; ++i) { @@ -291,7 +288,6 @@ void CUDATreeLearner::CountDenseFeatureGroups() { // LGBM_CUDA void CUDATreeLearner::prevAllocateGPUMemory() { - // how many feature-group tuples we have // leave some safe margin for prefetching // 256 work-items per workgroup. 
Each work-item prefetches one tuple for that feature @@ -325,12 +321,12 @@ void CUDATreeLearner::prevAllocateGPUMemory() { #if 0 // allocate feature mask, for disabling some feature-groups' histogram calculation if (feature_masks_.data() != NULL) { - cudaPointerAttributes attributes; - cudaPointerGetAttributes (&attributes, feature_masks_.data()); - - if ((attributes.type == cudaMemoryTypeHost) && (attributes.devicePointer != NULL)) { - CUDASUCCESS_OR_FATAL(cudaHostUnregister(feature_masks_.data())); - } + cudaPointerAttributes attributes; + cudaPointerGetAttributes(&attributes, feature_masks_.data()); + + if ((attributes.type == cudaMemoryTypeHost) && (attributes.devicePointer != NULL)) { + CUDASUCCESS_OR_FATAL(cudaHostUnregister(feature_masks_.data())); + } } #endif diff --git a/src/treelearner/cuda_tree_learner.h b/src/treelearner/cuda_tree_learner.h index 384ec57f66a..0375239049d 100644 --- a/src/treelearner/cuda_tree_learner.h +++ b/src/treelearner/cuda_tree_learner.h @@ -7,6 +7,9 @@ #include #include #include +#ifdef USE_CUDA +#include +#endif #include #include @@ -20,10 +23,8 @@ #include "leaf_splits.hpp" #ifdef USE_CUDA - #include -#include "cuda_kernel_launcher.h" // LGBM_CUDA -#include +#include "cuda_kernel_launcher.h" // LGBM_CUDA using namespace json11; @@ -34,7 +35,7 @@ namespace LightGBM { * \brief CUDA-based parallel learning algorithm. */ class CUDATreeLearner: public SerialTreeLearner { - public: + public: explicit CUDATreeLearner(const Config* tree_config); ~CUDATreeLearner(); // LGBM_CUDA: is_use_subset is used by CUDA only @@ -53,21 +54,20 @@ class CUDATreeLearner: public SerialTreeLearner { return; } } - use_bagging_ = false; + use_bagging_ = false; } - protected: + protected: void BeforeTrain() override; bool BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) override; void FindBestSplits() override; void Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) override; void ConstructHistograms(const std::vector& is_feature_used, bool use_subtract) override; - private: + private: /*! \brief 4-byte feature tuple used by GPU kernels */ - //struct Feature4 { + // struct Feature4 { // uint8_t s[4]; - //}; - + // }; typedef float gpu_hist_t; /*! @@ -109,7 +109,7 @@ class CUDATreeLearner: public SerialTreeLearner { void GPUHistogram(data_size_t leaf_num_data, bool use_all_features); void SetThreadData(ThreadData* thread_data, int device_id, int histogram_size, - int leaf_num_data, bool use_all_features, + int leaf_num_data, bool use_all_features, int num_workgroups, int exp_workgroups_per_feature) { ThreadData* td = &thread_data[device_id]; td->device_id = device_id; @@ -171,7 +171,7 @@ class CUDATreeLearner: public SerialTreeLearner { // LGBM_CUDA v5.2 bool ConstructGPUHistogramsAsync( const std::vector& is_feature_used, - const data_size_t* data_indices, data_size_t num_data); + const data_size_t* data_indices, data_size_t num_data); /*! brief Log2 of max number of workgroups per feature*/ const int kMaxLogWorkgroupsPerFeature = 10; // 2^10 @@ -215,12 +215,12 @@ class CUDATreeLearner: public SerialTreeLearner { /*! \brief Indices of all sparse feature-groups */ std::vector sparse_feature_group_map_; /*! \brief Multipliers of all dense feature-groups, used for redistributing bins */ - //std::vector device_bin_mults_; + // std::vector device_bin_mults_; /*! \brief GPU memory object holding the training data */ - //uint8_t *device_features_; + // uint8_t *device_features_; std::vector device_features_; /*! 
\brief GPU memory object holding the ordered gradient */ - //score_t *device_gradients_; + // score_t *device_gradients_; std::vector device_gradients_; /*! \brief Pointer to pinned memory of ordered gradient */ void * ptr_pinned_gradients_ = nullptr; @@ -230,10 +230,10 @@ class CUDATreeLearner: public SerialTreeLearner { /*! \brief Pointer to pinned memory of ordered hessian */ void * ptr_pinned_hessians_ = nullptr; /*! \brief A vector of feature mask. 1 = feature used, 0 = feature not used */ - // std::vector> feature_masks_; + // std::vector> feature_masks_; std::vector feature_masks_; /*! \brief GPU memory object holding the feature masks */ - //void *device_feature_masks_; + // void *device_feature_masks_; std::vector device_feature_masks_; /*! \brief Pointer to pinned memory of feature masks */ char* ptr_pinned_feature_masks_ = nullptr; @@ -293,7 +293,7 @@ class CUDATreeLearner: public SerialTreeLearner { namespace LightGBM { class CUDATreeLearner: public SerialTreeLearner { - public: + public: #pragma warning(disable : 4702) explicit CUDATreeLearner(const Config* tree_config) : SerialTreeLearner(tree_config) { Log::Fatal("CUDA Tree Learner was not enabled in this build.\n" diff --git a/src/treelearner/feature_parallel_tree_learner.cpp b/src/treelearner/feature_parallel_tree_learner.cpp index 5a820328ddb..3dde7f0f39b 100644 --- a/src/treelearner/feature_parallel_tree_learner.cpp +++ b/src/treelearner/feature_parallel_tree_learner.cpp @@ -19,7 +19,7 @@ template FeatureParallelTreeLearner::~FeatureParallelTreeLearner() { } -template // LGBM_CUDA +template // LGBM_CUDA void FeatureParallelTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) { TREELEARNER_T::Init(train_data, is_constant_hessian, is_use_subset); // LGBM_CUDA rank_ = Network::rank(); diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 3fdcbca23a0..96882732a92 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -336,11 +336,10 @@ void SerialTreeLearner::FindBestSplits(const Tree* tree) { bool use_subtract = parent_leaf_histogram_array_ != nullptr; #ifdef USE_CUDA - if (LGBM_config_::current_learner == use_cpu_learner){ - SerialTreeLearner::ConstructHistograms(is_feature_used, use_subtract); - } - else{ - ConstructHistograms(is_feature_used, use_subtract); + if (LGBM_config_::current_learner == use_cpu_learner) { + SerialTreeLearner::ConstructHistograms(is_feature_used, use_subtract); + } else { + ConstructHistograms(is_feature_used, use_subtract); } #else ConstructHistograms(is_feature_used, use_subtract); diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 23fc75659ad..668b54592e7 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -209,7 +209,7 @@ class SerialTreeLearner: public TreeLearner { std::vector> ordered_gradients_; /*! \brief hessians of current iteration, ordered for cache optimized, aligned to 4K page */ std::vector> ordered_hessians_; -#elif USE_CUDA // LGBM_CUDA +#elif USE_CUDA // LGBM_CUDA /*! \brief gradients of current iteration, ordered for cache optimized */ std::vector> ordered_gradients_; /*! \brief hessians of current iteration, ordered for cache optimized */ From 930436c279caf5589c3c4ccbfebfac265c93344e Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Tue, 16 Jun 2020 13:54:24 +0000 Subject: [PATCH 074/119] Even more lint cleanup. 
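The launcher reindented below dispatches on bin count and data coverage: it picks the 16-, 64- or 256-bin kernel family, the _fulldata variant when leaf_num_data equals num_data, and the constant-hessian overload when hessians are uniform. The 16- and 64-bin kernels are launched as 16*num_workgroups blocks of 16 threads and 4*num_workgroups blocks of 64 threads, so each configuration keeps 256 threads per workgroup. A reduced sketch of that launch geometry, with a placeholder kernel standing in for the real histogram kernels (toy_histogram_kernel and launch_histogram are hypothetical names):

// dispatch_sketch.cu -- illustrative only; not the cuda_histogram() in this patch.
#include <cuda_runtime.h>

__global__ void toy_histogram_kernel(int num_bins) {
  // The real kernels accumulate per-feature gradient/hessian/count bins here.
  (void)num_bins;
}

void launch_histogram(int histogram_size, int num_workgroups, cudaStream_t stream) {
  // 16 bins -> 16*num_workgroups blocks of 16 threads, 64 bins -> 4*num_workgroups
  // blocks of 64, 256 bins -> num_workgroups blocks of 256: always 256 threads per
  // workgroup, only the block shape changes with the bin count.
  const int threads_per_block = histogram_size;              // 16, 64 or 256
  const int blocks = (256 / histogram_size) * num_workgroups;
  toy_histogram_kernel<<<blocks, threads_per_block, 0, stream>>>(histogram_size);
}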
--- src/boosting/gbdt.cpp | 9 +- src/io/dense_bin.hpp | 2 +- src/treelearner/cuda_kernel_launcher.cu | 109 +- src/treelearner/cuda_kernel_launcher.h | 12 +- src/treelearner/cuda_tree_learner.cpp | 143 +- src/treelearner/cuda_tree_learner.h | 28 +- .../kernels/histogram_16_64_256.cu | 1544 ++++++++--------- 7 files changed, 908 insertions(+), 939 deletions(-) diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index baeacc6beb9..b4c14a40e78 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -475,11 +475,10 @@ bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { // LGBM_CUDA: moved for overlapping data copy w/ other operations int iter_next = iter_ + 1; if (iter_next < config_->num_iterations) { + // auto start_time = std::chrono::steady_clock::now(); - // auto start_time = std::chrono::steady_clock::now(); - - // bagging logic - Bagging(iter_next); + // bagging logic + Bagging(iter_next); } } // add model @@ -958,7 +957,7 @@ void GBDT::ResetBaggingConfig(const Config* config, bool is_change_dataset) { } else { bag_data_cnt_ = num_data_; if (config_->device_type == std::string("cuda")) { // LGBM_CUDA - if (tmp_subset_ == nullptr){ + if (tmp_subset_ == nullptr) { tmp_subset_.reset(new Dataset(bag_data_cnt_)); tmp_subset_->CopyFeatureMapperFrom(train_data_); size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; diff --git a/src/io/dense_bin.hpp b/src/io/dense_bin.hpp index 0eb37043842..f0405bc318e 100644 --- a/src/io/dense_bin.hpp +++ b/src/io/dense_bin.hpp @@ -466,7 +466,7 @@ class DenseBin : public Bin { private: data_size_t num_data_; #ifdef USE_CUDA - std::vector> data_; // LGBM_CUDA + std::vector> data_; // LGBM_CUDA #else std::vector> data_; #endif diff --git a/src/treelearner/cuda_kernel_launcher.cu b/src/treelearner/cuda_kernel_launcher.cu index f8c3effeb6c..8b243200878 100644 --- a/src/treelearner/cuda_kernel_launcher.cu +++ b/src/treelearner/cuda_kernel_launcher.cu @@ -5,11 +5,9 @@ #ifdef USE_CUDA #include "cuda_kernel_launcher.h" -#include -#include #include - -using namespace LightGBM; +#include +#include void cuda_histogram( int histogram_size, @@ -31,46 +29,45 @@ void cuda_histogram( volatile int* arg8, void* arg9, size_t exp_workgroups_per_feature) { - if (histogram_size == 16) { if (leaf_num_data == num_data) { if (use_all_features) { - if (!is_constant_hessian) - histogram16<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, + if (!is_constant_hessian) + histogram16<<<16*num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram16<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, + else + histogram16<<<16*num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } else { - if (!is_constant_hessian) - histogram16_fulldata<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, + } else { + if (!is_constant_hessian) + histogram16_fulldata<<<16*num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram16_fulldata<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, + else + histogram16_fulldata<<<16*num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), 
exp_workgroups_per_feature); } } else { if (use_all_features) { // seems all features is always enabled, so this should be the same as fulldata - if (!is_constant_hessian) - histogram16<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, + if (!is_constant_hessian) + histogram16<<<16*num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram16<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, + else + histogram16<<<16*num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } else { - if (!is_constant_hessian) - histogram16<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, + if (!is_constant_hessian) + histogram16<<<16*num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram16<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, + else + histogram16<<<16*num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } @@ -78,21 +75,21 @@ void cuda_histogram( } else if (histogram_size == 64) { if (leaf_num_data == num_data) { if (use_all_features) { - if (!is_constant_hessian) - histogram64<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, + if (!is_constant_hessian) + histogram64<<<4*num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram64<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, + else + histogram64<<<4*num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } else { - if (!is_constant_hessian) - histogram64_fulldata<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, + } else { + if (!is_constant_hessian) + histogram64_fulldata<<<4*num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram64_fulldata<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, + else + histogram64_fulldata<<<4*num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } @@ -100,20 +97,20 @@ void cuda_histogram( if (use_all_features) { // seems all features is always enabled, so this should be the same as fulldata if (!is_constant_hessian) - histogram64<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, + histogram64<<<4*num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram64<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, + else + histogram64<<<4*num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } else { - if (!is_constant_hessian) - histogram64<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, + if (!is_constant_hessian) + histogram64<<<4*num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2, reinterpret_cast(arg3), 
arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram64<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, + else + histogram64<<<4*num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } @@ -121,21 +118,21 @@ void cuda_histogram( } else { if (leaf_num_data == num_data) { if (use_all_features) { - if (!is_constant_hessian) - histogram256<<>>( arg0, arg1, arg2, + if (!is_constant_hessian) + histogram256<<>>(arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram256<<>>( arg0, arg1, arg2, + else + histogram256<<>>(arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } else { - if (!is_constant_hessian) - histogram256_fulldata<<>>( arg0, arg1, arg2, + } else { + if (!is_constant_hessian) + histogram256_fulldata<<>>(arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram256_fulldata<<>>( arg0, arg1, arg2, + else + histogram256_fulldata<<>>(arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } @@ -143,25 +140,25 @@ void cuda_histogram( if (use_all_features) { // seems all features is always enabled, so this should be the same as fulldata if (!is_constant_hessian) - histogram256<<>>( arg0, arg1, arg2, + histogram256<<>>(arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram256<<>>( arg0, arg1, arg2, + else + histogram256<<>>(arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } else { - if (!is_constant_hessian) - histogram256<<>>( arg0, arg1, arg2, + if (!is_constant_hessian) + histogram256<<>>(arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram256<<>>( arg0, arg1, arg2, + else + histogram256<<>>(arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } } } } - -#endif // USE_CUDA + +#endif // USE_CUDA diff --git a/src/treelearner/cuda_kernel_launcher.h b/src/treelearner/cuda_kernel_launcher.h index efe8e4b0d4a..57c5f1bfc26 100644 --- a/src/treelearner/cuda_kernel_launcher.h +++ b/src/treelearner/cuda_kernel_launcher.h @@ -7,8 +7,8 @@ #ifdef USE_CUDA // what should I include?? 
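// ThreadData (declared below) bundles everything one host thread needs to run the
// histogram kernel on a single GPU: the device id and stream, the leaf/row counts,
// the device-side feature, gradient, hessian and output buffers, the sync counters,
// the timing/wait events, and exp_workgroups_per_feature. GPUHistogram() in
// cuda_tree_learner.cpp fills one ThreadData per device and spawns one pthread per
// GPU, each of which hands its ThreadData to cuda_histogram() on that device's stream.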
-#include "kernels/histogram_16_64_256.hu" // kernel, acc_type, data_size_t, uchar, score_t #include +#include "kernels/histogram_16_64_256.hu" // kernel, acc_type, data_size_t, uchar, score_t struct ThreadData { // device id @@ -46,10 +46,10 @@ struct ThreadData { void cuda_histogram( int histogram_size, - data_size_t leaf_num_data, + data_size_t leaf_num_data, data_size_t num_data, - bool use_all_features, - bool is_constant_hessian, + bool use_all_features, + bool is_constant_hessian, int num_workgroups, cudaStream_t stream, uint8_t* arg0, @@ -66,5 +66,5 @@ void cuda_histogram( size_t exp_workgroups_per_feature); -#endif // USE_CUDA -#endif // LGBM_KERNEL_LAUNCHER +#endif // USE_CUDA +#endif // LGBM_KERNEL_LAUNCHER diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index b61cddf4d3e..8d59e98f67f 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -5,19 +5,19 @@ #ifdef USE_CUDA #include "cuda_tree_learner.h" -#include -#include - -#include - -#include "../io/dense_bin.hpp" - #include #include #include +#include + #include +#include +#include + +#include "../io/dense_bin.hpp" + #define cudaMemcpy_DEBUG 0 // 1: DEBUG cudaMemcpy #define ResetTrainingData_DEBUG 0 // 1: Debug ResetTrainingData @@ -76,9 +76,9 @@ void CUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, // some additional variables needed for GPU trainer num_feature_groups_ = train_data_->num_feature_groups(); - + // LGBM_CUDA: use subset of training data for bagging - is_use_subset_ = is_use_subset; + is_use_subset_ = is_use_subset; // Initialize GPU buffers and kernels & LGBM_CUDA: get device info InitGPU(config_->num_gpu); // LGBM_CUDA @@ -121,8 +121,8 @@ int CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int d ++retval; } if (const_flag) { - ai = GET_HESS(((int64 *) h1), i); - bi = GET_HESS(((int64 *) h2), i); + ai = GET_HESS((reinterpret_cast(h1), i); + bi = GET_HESS((reinterpret_cast(h2), i); if (ai != bi) { printf("i = %5d, h1.hess %lld, h2.hess %lld\n", i, ai, bi); ++retval; @@ -174,13 +174,13 @@ int CUDATreeLearner::GetNumWorkgroupsPerFeature(data_size_t leaf_num_data) { double x = 256.0 / num_dense_feature_groups_; - int exp_workgroups_per_feature = (int)ceil(log2(x)); + int exp_workgroups_per_feature = static_cast(ceil(log2(x))); double t = leaf_num_data / 1024.0; Log::Debug("We can have at most %d workgroups per feature4 for efficiency reasons" "Best workgroup size per feature for full utilization is %d\n", static_cast(ceil(t)), (1 << exp_workgroups_per_feature)); - exp_workgroups_per_feature = std::min(exp_workgroups_per_feature, static_cast(ceil(log((double)t)/log(2.0)))); + exp_workgroups_per_feature = std::min(exp_workgroups_per_feature, static_cast(ceil(log(static_cast(t))/log(2.0)))); if (exp_workgroups_per_feature < 0) exp_workgroups_per_feature = 0; if (exp_workgroups_per_feature > kMaxLogWorkgroupsPerFeature) @@ -324,7 +324,7 @@ void CUDATreeLearner::prevAllocateGPUMemory() { cudaPointerAttributes attributes; cudaPointerGetAttributes(&attributes, feature_masks_.data()); - if ((attributes.type == cudaMemoryTypeHost) && (attributes.devicePointer != NULL)) { + if ((attributes.type == cudaMemoryTypeHost) && (attributes.devicePointer != NULL)) { CUDASUCCESS_OR_FATAL(cudaHostUnregister(feature_masks_.data())); } } @@ -343,7 +343,7 @@ void CUDATreeLearner::prevAllocateGPUMemory() { // host_size histogram outputs // host_histogram_outputs_ = malloc(num_dense_feature_groups_ * 
device_bin_size_ * hist_bin_entry_sz_); - CUDASUCCESS_OR_FATAL(cudaHostAlloc( (void **)&host_histogram_outputs_, (size_t)(num_dense_feature_groups_ * device_bin_size_ * hist_bin_entry_sz_),cudaHostAllocPortable)); + CUDASUCCESS_OR_FATAL(cudaHostAlloc(reinterpret_cast(&host_histogram_outputs_), (size_t)(num_dense_feature_groups_ * device_bin_size_ * hist_bin_entry_sz_), cudaHostAllocPortable)); // LGBM_CUDA nthreads_ = std::min(omp_get_max_threads(), num_dense_feature_groups_ / dword_features_); @@ -352,7 +352,6 @@ void CUDATreeLearner::prevAllocateGPUMemory() { // LGBM_CUDA: allocate GPU memory for each GPU void CUDATreeLearner::AllocateGPUMemory() { - #pragma omp parallel for schedule(static, num_gpu_) for (int device_id = 0; device_id < num_gpu_; ++device_id) { @@ -392,7 +391,7 @@ void CUDATreeLearner::AllocateGPUMemory() { // copy indices to the device if (device_data_indices_[device_id] != NULL) { - CUDASUCCESS_OR_FATAL(cudaFree(device_data_indices_[device_id])); + CUDASUCCESS_OR_FATAL(cudaFree(device_data_indices_[device_id])); } CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_data_indices_[device_id]), (size_t) allocated_num_data_ * sizeof(data_size_t))); @@ -404,17 +403,15 @@ void CUDATreeLearner::AllocateGPUMemory() { // each work group generates a sub-histogram of dword_features_ features. if (!device_subhistograms_[device_id]) { - - // only initialize once here, as this will not need to change when ResetTrainingData() is called + // only initialize once here, as this will not need to change when ResetTrainingData() is called CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_subhistograms_[device_id]), (size_t) preallocd_max_num_wg_[device_id] * dword_features_ * device_bin_size_ * (3 * hist_bin_entry_sz_ / 2))); Log::Debug("created device_subhistograms_: %p", device_subhistograms_[device_id]); - } // create atomic counters for inter-group coordination CUDASUCCESS_OR_FATAL(cudaFree(sync_counters_[device_id])); - CUDASUCCESS_OR_FATAL(cudaMalloc(&(sync_counters_[device_id]), (size_t) num_gpu_feature_groups * sizeof(int))); + CUDASUCCESS_OR_FATAL(cudaMalloc(&(sync_counters_[device_id]), (size_t) num_gpu_feature_groups * sizeof(int))); CUDASUCCESS_OR_FATAL(cudaMemsetAsync(sync_counters_[device_id], 0, num_gpu_feature_groups * sizeof(int), stream_[device_id])); // The output buffer is allocated to host directly, to overlap compute and data transfer @@ -425,7 +422,6 @@ void CUDATreeLearner::AllocateGPUMemory() { } void CUDATreeLearner::ResetGPUMemory() { - // clear sparse/dense maps dense_feature_group_map_.clear(); sparse_feature_group_map_.clear(); @@ -433,17 +429,16 @@ void CUDATreeLearner::ResetGPUMemory() { // LGBM_CUDA void CUDATreeLearner::copyDenseFeature() { - if (num_feature_groups_ == 0) { - LGBM_config_::current_learner = use_cpu_learner; - return; + LGBM_config_::current_learner = use_cpu_learner; + return; } // auto start_time = std::chrono::steady_clock::now(); Log::Debug("Started copying dense features from CPU to GPU"); // find the dense feature-groups and group then into Feature4 data structure (several feature-groups packed into 4 bytes) size_t copied_feature = 0; - // set device info + // set device info int device_id = 0; uint8_t* device_features = device_features_[device_id]; CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); @@ -466,7 +461,7 @@ void CUDATreeLearner::copyDenseFeature() { copied_feature = 0; if (device_id < num_gpu_) { device_features = device_features_[device_id]; - CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); + CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); } } 
} else { @@ -481,8 +476,7 @@ void CUDATreeLearner::copyDenseFeature() { // LGBM_CUDA: InitGPU w/ num_gpu -void CUDATreeLearner::InitGPU(int num_gpu) { - +void CUDATreeLearner::InitGPU(int num_gpu) { // Get the max bin size, used for selecting best GPU kernel max_num_bin_ = 0; @@ -511,7 +505,7 @@ void CUDATreeLearner::InitGPU(int num_gpu) { device_bin_size_ = 64; // LGBM_CUDA histogram_size_ = 64; dword_features_ = 1; // LGBM_CUDA - } else if ( max_num_bin_ <= 256) { + } else if (max_num_bin_ <= 256) { Log::Debug("device_bin_size_ = 256"); device_bin_size_ = 256; histogram_size_ = 256; @@ -530,7 +524,7 @@ void CUDATreeLearner::InitGPU(int num_gpu) { CountDenseFeatureGroups(); if (num_gpu > num_dense_feature_groups_) num_gpu = num_dense_feature_groups_; - + // LGBM_CUDA: initialize GPU int gpu_count; @@ -538,9 +532,9 @@ void CUDATreeLearner::InitGPU(int num_gpu) { num_gpu_ = (gpu_count < num_gpu)? gpu_count : num_gpu; // LGBM_CUDA: set cpu threads - cpu_threads_ = (pthread_t **)malloc(sizeof(pthread_t *)*num_gpu_); + cpu_threads_ = reinterpret_cast(malloc(sizeof(pthread_t *)*num_gpu_)); for (int device_id = 0; device_id < num_gpu_; ++device_id) { - cpu_threads_[device_id] = (pthread_t *)malloc(sizeof(pthread_t)); + cpu_threads_[device_id] = reinterpret_cast(malloc(sizeof(pthread_t))); } // LGBM_CUDA: resize device memory pointers @@ -552,7 +546,7 @@ void CUDATreeLearner::InitGPU(int num_gpu) { sync_counters_.resize(num_gpu_); device_subhistograms_.resize(num_gpu_); device_histogram_outputs_.resize(num_gpu_); - + // LGBM_CUDA: create stream & events to handle multiple GPUs preallocd_max_num_wg_.resize(num_gpu_, 1024); stream_.resize(num_gpu_); @@ -567,7 +561,7 @@ void CUDATreeLearner::InitGPU(int num_gpu) { // for debuging kernel_time_.resize(num_gpu_, 0); kernel_input_wait_time_.resize(num_gpu_, std::chrono::milliseconds(0)); - //kernel_output_wait_time_.resize(num_gpu_, std::chrono::milliseconds(0)); + // kernel_output_wait_time_.resize(num_gpu_, std::chrono::milliseconds(0)); for (int i = 0; i < num_gpu_; ++i) { CUDASUCCESS_OR_FATAL(cudaSetDevice(i)); @@ -596,7 +590,6 @@ void CUDATreeLearner::InitGPU(int num_gpu) { Tree* CUDATreeLearner::Train(const score_t* gradients, const score_t *hessians, bool is_constant_hessian, const Json& forced_split_json) { - // check if we need to recompile the GPU kernel (is_constant_hessian changed) // this should rarely occur @@ -611,21 +604,20 @@ Tree* CUDATreeLearner::Train(const score_t* gradients, const score_t *hessians, } void CUDATreeLearner::ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) { - // LGBM_CUDA: check data size - data_size_t old_allocated_num_data = allocated_num_data_; + data_size_t old_allocated_num_data = allocated_num_data_; SerialTreeLearner::ResetTrainingDataInner(train_data, is_constant_hessian, reset_multi_val_bin); - #if ResetTrainingData_DEBUG == 1 // LGBM_CUDA - serial_time = std::chrono::steady_clock::now() - start_serial_time; + #if ResetTrainingData_DEBUG == 1 // LGBM_CUDA + serial_time = std::chrono::steady_clock::now() - start_serial_time; #endif num_feature_groups_ = train_data_->num_feature_groups(); // GPU memory has to been reallocated because data may have been changed - #if ResetTrainingData_DEBUG == 1 // LGBM_CUDA + #if ResetTrainingData_DEBUG == 1 // LGBM_CUDA auto start_alloc_gpu_time = std::chrono::steady_clock::now(); #endif @@ -642,29 +634,28 @@ void CUDATreeLearner::ResetTrainingDataInner(const Dataset* train_data, bool is_ copyDenseFeature(); - #if 
ResetTrainingData_DEBUG == 1 // LGBM_CUDA + #if ResetTrainingData_DEBUG == 1 // LGBM_CUDA alloc_gpu_time = std::chrono::steady_clock::now() - start_alloc_gpu_time; #endif // setup GPU kernel arguments after we allocating all the buffers - #if ResetTrainingData_DEBUG == 1 // LGBM_CUDA + #if ResetTrainingData_DEBUG == 1 // LGBM_CUDA auto start_set_arg_time = std::chrono::steady_clock::now(); #endif - #if ResetTrainingData_DEBUG == 1 // LGBM_CUDA + #if ResetTrainingData_DEBUG == 1 // LGBM_CUDA set_arg_time = std::chrono::steady_clock::now() - start_set_arg_time; reset_training_data_time = std::chrono::steady_clock::now() - start_reset_training_data_time; Log::Info("reset_training_data_time: %f secs.", reset_training_data_time.count() * 1e-3); Log::Info("serial_time: %f secs.", serial_time.count() * 1e-3); Log::Info("alloc_gpu_time: %f secs.", alloc_gpu_time.count() * 1e-3); - Log::Info("set_arg_time: %f secs.", set_arg_time.count() * 1e-3); + Log::Info("set_arg_time: %f secs.", set_arg_time.count() * 1e-3); #endif } void CUDATreeLearner::BeforeTrain() { - - #if cudaMemcpy_DEBUG == 1 // LGBM_CUDA + #if cudaMemcpy_DEBUG == 1 // LGBM_CUDA std::chrono::duration device_hessians_time = std::chrono::milliseconds(0); std::chrono::duration device_gradients_time = std::chrono::milliseconds(0); #endif @@ -674,13 +665,12 @@ void CUDATreeLearner::BeforeTrain() { #if GPU_DEBUG >= 2 printf("CUDATreeLearner::BeforeTrain() Copying initial full gradients and hessians to device\n"); #endif - + // Copy initial full hessians and gradients to GPU. // We start copying as early as possible, instead of at ConstructHistogram(). if ((hessians_ != NULL) && (gradients_ != NULL)) { if (!use_bagging_ && num_dense_feature_groups_) { - Log::Debug("CudaTreeLearner::BeforeTrain() No baggings, dense_feature_groups_=%d", num_dense_feature_groups_); for (int device_id = 0; device_id < num_gpu_; ++device_id) { @@ -728,7 +718,6 @@ void CUDATreeLearner::BeforeTrain() { // use bagging if ((hessians_ != NULL) && (gradients_ != NULL)) { if (data_partition_->leaf_count(0) != num_data_ && num_dense_feature_groups_) { - // On GPU, we start copying indices, gradients and hessians now, instead at ConstructHistogram() // copy used gradients and hessians to ordered buffer @@ -737,18 +726,15 @@ void CUDATreeLearner::BeforeTrain() { // transfer the indices to GPU for (int device_id = 0; device_id < num_gpu_; ++device_id) { - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_data_indices_[device_id], indices, cnt * sizeof(*indices), cudaMemcpyHostToDevice, stream_[device_id])); CUDASUCCESS_OR_FATAL(cudaEventRecord(indices_future_[device_id], stream_[device_id])); if (!is_constant_hessian_) { - - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_hessians_[device_id], (void *) &(hessians_[0]), num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_hessians_[device_id], const_cast(reinterpret_cast(&(hessians_[0]))), num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); CUDASUCCESS_OR_FATAL(cudaEventRecord(hessians_future_[device_id], stream_[device_id])); - } - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_gradients_[device_id], (void *) &(gradients_[0]), num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_gradients_[device_id], const_cast(reinterpret_cast(&(gradients_[0]))), num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); 
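        // Each cudaMemcpyAsync in this loop is paired with a cudaEventRecord on the
        // same stream (indices_future_, hessians_future_, and gradients_future_ just
        // below). The events mark "data is resident on this device", so the histogram
        // launch path can wait on them later while the host keeps queueing copies for
        // the remaining GPUs; this is what lets the transfers started in BeforeTrain()
        // overlap with the rest of the iteration instead of blocking here.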
CUDASUCCESS_OR_FATAL(cudaEventRecord(gradients_future_[device_id], stream_[device_id])); } } @@ -756,7 +742,6 @@ void CUDATreeLearner::BeforeTrain() { } bool CUDATreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) { - int smaller_leaf; data_size_t num_data_in_left_child = GetGlobalDataCountInLeaf(left_leaf); @@ -798,7 +783,6 @@ bool CUDATreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int r bool CUDATreeLearner::ConstructGPUHistogramsAsync( const std::vector& is_feature_used, const data_size_t* data_indices, data_size_t num_data) { - if (num_data <= 0) { return false; } @@ -808,35 +792,32 @@ bool CUDATreeLearner::ConstructGPUHistogramsAsync( Log::Debug("no dense feature groups, returning"); return false; } - + // copy data indices if it is not null if (data_indices != nullptr && num_data != num_data_) { - for (int device_id = 0; device_id < num_gpu_; ++device_id) { - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_data_indices_[device_id], data_indices, num_data * sizeof(data_size_t), cudaMemcpyHostToDevice, stream_[device_id])); CUDASUCCESS_OR_FATAL(cudaEventRecord(indices_future_[device_id], stream_[device_id])); - } } // converted indices in is_feature_used to feature-group indices std::vector is_feature_group_used(num_feature_groups_, 0); - #pragma omp parallel for schedule(static,1024) if (num_features_ >= 2048) + #pragma omp parallel for schedule(static, 1024) if (num_features_ >= 2048) for (int i = 0; i < num_features_; ++i) { - if (is_feature_used[i]) { + if (is_feature_used[i]) { int feature_group = train_data_->Feature2Group(i); // LGBM_CUDA - is_feature_group_used[feature_group] = (train_data_->FeatureGroupNumBin(feature_group)<=16)? 2 : 1; // LGBM_CUDA + is_feature_group_used[feature_group] = (train_data_->FeatureGroupNumBin(feature_group) <= 16) ? 
2 : 1; // LGBM_CUDA } } // construct the feature masks for dense feature-groups int used_dense_feature_groups = 0; - #pragma omp parallel for schedule(static,1024) reduction(+:used_dense_feature_groups) if (num_dense_feature_groups_ >= 2048) + #pragma omp parallel for schedule(static, 1024) reduction(+:used_dense_feature_groups) if (num_dense_feature_groups_ >= 2048) for (int i = 0; i < num_dense_feature_groups_; ++i) { if (is_feature_group_used[dense_feature_group_map_[i]]) { - //feature_masks_[i] = 1; + // feature_masks_[i] = 1; feature_masks_[i] = is_feature_group_used[dense_feature_group_map_[i]]; ++used_dense_feature_groups; } else { @@ -869,17 +850,15 @@ bool CUDATreeLearner::ConstructGPUHistogramsAsync( } void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_used, bool use_subtract) { - // LGBM_CUDA -// auto start_time = std::chrono::steady_clock::now(); + // auto start_time = std::chrono::steady_clock::now(); std::vector is_sparse_feature_used(num_features_, 0); std::vector is_dense_feature_used(num_features_, 0); - int num_dense_features=0, num_sparse_features=0; + int num_dense_features = 0, num_sparse_features = 0; #pragma omp parallel for schedule(static) for (int feature_index = 0; feature_index < num_features_; ++feature_index) { - if (!col_sampler_.is_feature_used_bytree()[feature_index]) continue; if (!is_feature_used[feature_index]) continue; if (train_data_->IsMultiGroup(train_data_->Feature2Group(feature_index))) { @@ -892,7 +871,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ } // construct smaller leaf - hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - kHistOffset; + hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - kHistOffset; // Check workgroups per feature4 tuple.. 
int exp_workgroups_per_feature = GetNumWorkgroupsPerFeature(smaller_leaf_splits_->num_data_in_leaf()); @@ -931,7 +910,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ // Compare GPU histogram with CPU histogram, useful for debuggin GPU code problem // #define GPU_DEBUG_COMPARE #ifdef GPU_DEBUG_COMPARE - printf("Start Comparing_Histogram between GPU and CPU, num_dense_feature_groups_ = %d\n",num_dense_feature_groups_); + printf("Start Comparing_Histogram between GPU and CPU, num_dense_feature_groups_ = %d\n", num_dense_feature_groups_); bool compare = true; for (int i = 0; i < num_dense_feature_groups_; ++i) { if (!feature_masks_[i]) @@ -948,8 +927,8 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ if (train_data_->FeatureGroupBin(dense_feature_group_index) == nullptr) { continue; } - if ( num_data == num_data_ ) { - if ( is_constant_hessian_ ) { + if (num_data == num_data_) { + if (is_constant_hessian_) { printf("ConstructHistogram(): num_data == num_data_ is_constant_hessian_\n"); train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( 0, @@ -965,7 +944,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ current_histogram); } } else { - if ( is_constant_hessian_ ) { + if (is_constant_hessian_) { printf("ConstructHistogram(): is_constant_hessian_\n"); train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( smaller_leaf_splits_->data_indices(), @@ -973,7 +952,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ num_data, gradients_, current_histogram); - } else { + } else { printf("ConstructHistogram(): 4, num_data = %d, num_data_ = %d\n", num_data, num_data_); train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( smaller_leaf_splits_->data_indices(), @@ -984,7 +963,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ } } int retval; - if ( (num_data != num_data_) && compare ) { + if ((num_data != num_data_) && compare) { retval = CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index, config_->gpu_use_dp, is_constant_hessian_); printf("CompareHistograms reports %d errors\n", retval); compare = false; @@ -997,7 +976,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ } std::copy(gpu_histogram, gpu_histogram + size * 2, current_histogram); delete [] gpu_histogram; - //break; // LGBM_CUDA: see only first feature info + // break; // LGBM_CUDA: see only first feature info } printf("End Comparing Histogram between GPU and CPU\n"); fflush(stderr); @@ -1006,7 +985,6 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ #endif if (larger_leaf_histogram_array_ != nullptr && !use_subtract) { - // construct larger leaf hist_t* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - kHistOffset; @@ -1018,7 +996,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ // We set data_indices to null to avoid rebuilding ordered gradients/hessians if (num_sparse_features > 0) { - //train_data_->ConstructHistograms(is_sparse_feature_used, + // train_data_->ConstructHistograms(is_sparse_feature_used, // nullptr, larger_leaf_splits_->num_data_in_leaf(), // larger_leaf_splits_->leaf_index(), // ordered_bins_, gradients_, hessians_, @@ -1047,7 +1025,6 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ } void CUDATreeLearner::FindBestSplits() { - SerialTreeLearner::FindBestSplits(); 
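  // The GPU_DEBUG >= 3 block below dumps the raw histogram of every splittable
  // feature for the smaller leaf (and, when present, the larger leaf) right after
  // the split search, which is useful when chasing mismatches reported by
  // CompareHistograms().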
#if GPU_DEBUG >= 3 @@ -1058,7 +1035,7 @@ void CUDATreeLearner::FindBestSplits() { smaller_leaf_histogram_array_[feature_index].set_is_splittable(false); continue; } - size_t bin_size = train_data_->FeatureNumBin(feature_index) + 1; + size_t bin_size = train_data_->FeatureNumBin(feature_index) + 1; printf("CUDATreeLearner::FindBestSplits() Feature %d bin_size=%zd smaller leaf:\n", feature_index, bin_size); PrintHistograms(smaller_leaf_histogram_array_[feature_index].RawData() - kHistOffset, bin_size); if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->leaf_index() < 0) { continue; } @@ -1093,4 +1070,4 @@ void CUDATreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* righ } // namespace LightGBM #undef cudaMemcpy_DEBUG -#endif // USE_CUDA +#endif // USE_CUDA diff --git a/src/treelearner/cuda_tree_learner.h b/src/treelearner/cuda_tree_learner.h index 0375239049d..cd7413d3a43 100644 --- a/src/treelearner/cuda_tree_learner.h +++ b/src/treelearner/cuda_tree_learner.h @@ -1,3 +1,7 @@ +/*! + * Copyright (c) 2020 IBM Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + */ #pragma once #ifndef LGBM_TREELEARNER_CUDA_TREE_LEARNER_H_ #define LGBM_TREELEARNET_CUDA_TREE_LEARNER_H_ @@ -27,7 +31,7 @@ #include "cuda_kernel_launcher.h" // LGBM_CUDA -using namespace json11; +using json11::Json; namespace LightGBM { @@ -63,6 +67,7 @@ class CUDATreeLearner: public SerialTreeLearner { void FindBestSplits() override; void Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) override; void ConstructHistograms(const std::vector& is_feature_used, bool use_subtract) override; + private: /*! \brief 4-byte feature tuple used by GPU kernels */ // struct Feature4 { @@ -76,7 +81,7 @@ class CUDATreeLearner: public SerialTreeLearner { * \return Log2 of the best number for workgroups per feature, in range 0...kMaxLogWorkgroupsPerFeature */ int GetNumWorkgroupsPerFeature(data_size_t leaf_num_data); - + /*! 
* \brief Initialize GPU device * \LGBM_CUDA: param num_gpu: number of maximum gpus @@ -107,7 +112,7 @@ class CUDATreeLearner: public SerialTreeLearner { * \param use_all_features Set to true to not use feature masks, with a faster kernel */ void GPUHistogram(data_size_t leaf_num_data, bool use_all_features); - + void SetThreadData(ThreadData* thread_data, int device_id, int histogram_size, int leaf_num_data, bool use_all_features, int num_workgroups, int exp_workgroups_per_feature) { @@ -130,7 +135,7 @@ class CUDATreeLearner: public SerialTreeLearner { td->sync_counters = sync_counters_[device_id]; td->device_histogram_outputs = device_histogram_outputs_[device_id]; td->exp_workgroups_per_feature = exp_workgroups_per_feature; - + td->kernel_start = &(kernel_start_[device_id]); td->kernel_wait_obj = &(kernel_wait_obj_[device_id]); td->kernel_input_wait_time = &(kernel_input_wait_time_[device_id]); @@ -138,14 +143,14 @@ class CUDATreeLearner: public SerialTreeLearner { size_t output_size = num_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_; size_t host_output_offset = offset_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_; td->output_size = output_size; - td->host_histogram_output = (char*)host_histogram_outputs_ + host_output_offset; + td->host_histogram_output = reinterpret_cast(host_histogram_outputs_) + host_output_offset; td->histograms_wait_obj = &(histograms_wait_obj_[device_id]); } // LGBM_CUDA: thread work // typedef void * (*THREADFUNCPTR)(void *); // void* launch_gpu_kernel(void *td); - + /*! * \brief Wait for GPU kernel execution and read histogram * \param histograms Destination of histogram results from GPU. @@ -285,13 +290,14 @@ class CUDATreeLearner: public SerialTreeLearner { int allocated_num_data_; // allocated data instances pthread_t **cpu_threads_; // pthread, 1 cpu thread / gpu }; + } // namespace LightGBM -#else // USE_CUDA +#else // USE_CUDA // When GPU support is not compiled in, quit with an error message namespace LightGBM { - + class CUDATreeLearner: public SerialTreeLearner { public: #pragma warning(disable : 4702) @@ -301,7 +307,7 @@ class CUDATreeLearner: public SerialTreeLearner { } }; -} +} // namespace LightGBM -#endif // USE_CUDA -#endif // LGBM_TREELEARNER_CUDA_TREE_LEARNER_H_ +#endif // USE_CUDA +#endif // LGBM_TREELEARNER_CUDA_TREE_LEARNER_H_ diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index 7831159160b..64b2405a592 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -21,8 +21,7 @@ if (b == gtid && t == ltid) { \ } // atomic add for float number in local memory -inline __device__ void atomic_local_add_f(acc_type *addr, const acc_type val) -{ +inline __device__ void atomic_local_add_f(acc_type *addr, const acc_type val) { atomicAdd(addr, static_cast(val)); } @@ -31,13 +30,13 @@ inline __device__ void atomic_local_add_f(acc_type *addr, const acc_type val) #ifdef IGNORE_INDICES #define KERNEL_NAME histogram16_fulldata #else // IGNORE_INDICES -#define KERNEL_NAME histogram16 // seems like ENABLE_ALL_FEATURES is set to 1 in the header if its disabled -//#define KERNEL_NAME histogram16_allfeats -#endif // IGNORE_INDICES -#else // ENABLE_ALL_FEATURES +#define KERNEL_NAME histogram16 // seems like ENABLE_ALL_FEATURES is set to 1 in the header if its disabled +// #define KERNEL_NAME histogram16_allfeats +#endif // IGNORE_INDICES +#else // 
ENABLE_ALL_FEATURES #error "ENABLE_ALL_FEATURES should always be 1" #define KERNEL_NAME histogram16 -#endif // ENABLE_ALL_FEATURES +#endif // ENABLE_ALL_FEATURES #define NUM_BINS 16 #define LOCAL_MEM_SIZE ((sizeof(uint) + 2 * sizeof(acc_type)) * NUM_BINS) @@ -51,10 +50,10 @@ inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ acc_type* __restrict__ local_hist, const size_t power_feature_workgroups) { const ushort ltid = threadIdx.x; - // TODO: try to avoid bank conflict here + // TODO(anyone): try to avoid bank conflict here acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; - uint* __restrict__ local_cnt = (uint *)(local_hist + 2 * NUM_BINS); + uint* __restrict__ local_cnt = reinterpret_cast(local_hist + 2 * NUM_BINS); uint cont_bin; if (power_feature_workgroups != 0) { @@ -74,7 +73,7 @@ inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ } // skip the counters we already have - p += 3 * NUM_BINS; + p += 3 * NUM_BINS; for (i = i + 1; i < num_sub_hist; ++i) { grad_bin += *p; p += NUM_BINS; @@ -94,12 +93,12 @@ inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ } #if USE_CONSTANT_BUF == 1 -__kernel void KERNEL_NAME(__global const uchar* restrict feature_data_base, +__kernel void KERNEL_NAME(__global const uchar* restrict feature_data_base, __constant const uchar* restrict feature_masks __attribute__((max_constant_size(65536))), const data_size_t feature_size, - __constant const data_size_t* restrict data_indices __attribute__((max_constant_size(65536))), - const data_size_t num_data, - __constant const score_t* restrict ordered_gradients __attribute__((max_constant_size(65536))), + __constant const data_size_t* restrict data_indices __attribute__((max_constant_size(65536))), + const data_size_t num_data, + __constant const score_t* restrict ordered_gradients __attribute__((max_constant_size(65536))), #if CONST_HESSIAN == 0 __constant const score_t* restrict ordered_hessians __attribute__((max_constant_size(65536))), #else @@ -110,269 +109,266 @@ __kernel void KERNEL_NAME(__global const uchar* restrict feature_data_base, acc_type* __restrict__ hist_buf_base, const size_t power_feature_workgroups) { #else -__global__ void KERNEL_NAME(const uchar* feature_data_base, +__global__ void KERNEL_NAME(const uchar* feature_data_base, // FIXME: how to handle this __constant const uchar* __restrict__ feature_masks, const data_size_t feature_size, - const data_size_t* data_indices, - const data_size_t num_data, - const score_t* ordered_gradients, + const data_size_t* data_indices, + const data_size_t num_data, + const score_t* ordered_gradients, #if CONST_HESSIAN == 0 const score_t* ordered_hessians, #else const score_t const_hessian, #endif - char* __restrict__ output_buf, + char* __restrict__ output_buf, volatile int * sync_counters, acc_type* __restrict__ hist_buf_base, const size_t power_feature_workgroups) { #endif - // allocate the local memory array aligned with float2, to guarantee correct alignment on NVIDIA platforms - // otherwise a "Misaligned Address" exception may occur - __shared__ float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)]; - const uint gtid = blockIdx.x * blockDim.x + threadIdx.x; - const ushort ltid = threadIdx.x; - const ushort lsize = NUM_BINS; // get_local_size(0); - const ushort group_id = blockIdx.x; - - // local memory per workgroup is 3 KB - // clear local memory - uint *ptr = (uint *) shared_array; - for (int i = ltid; i < LOCAL_MEM_SIZE/sizeof(uint); 
i += lsize) { - ptr[i] = 0; - } - __syncthreads(); - // gradient/hessian histograms - // assume this starts at 32 * 4 = 128-byte boundary // LGBM_CUDA: What does it mean? boundary?? - // total size: 2 * 256 * size_of(float) = 2 KB - // organization: each feature/grad/hessian is at a different bank, - // as indepedent of the feature value as possible - acc_type *gh_hist = (acc_type *)shared_array; - - // counter histogram - // total size: 256 * size_of(uint) = 1 KB - uint *cnt_hist = (uint *)(gh_hist + 2 * NUM_BINS); - - // odd threads (1, 3, ...) compute histograms for hessians first - // even thread (0, 2, ...) compute histograms for gradients first - // etc. - uchar is_hessian_first = ltid & 1; - - ushort feature_id = group_id >> power_feature_workgroups; - - // each 2^POWER_FEATURE_WORKGROUPS workgroups process on one feature (compile-time constant) - // feature_size is the number of examples per feature - const uchar *feature_data = feature_data_base + feature_id * feature_size; - - // size of threads that process this feature4 - const uint subglobal_size = lsize * (1 << power_feature_workgroups); - - // equavalent thread ID in this subgroup for this feature4 - const uint subglobal_tid = gtid - feature_id * subglobal_size; - - - data_size_t ind; - data_size_t ind_next; - #ifdef IGNORE_INDICES - ind = subglobal_tid; - #else - ind = data_indices[subglobal_tid]; - #endif - - // extract feature mask, when a byte is set to 0, that feature is disabled - uchar feature_mask = feature_masks[feature_id]; - // exit if the feature is masked - if (!feature_mask) { - return; - } else { - feature_mask = feature_mask - 1; // LGBM_CUDA: feature_mask is used for get feature (1: 4bit feature, 0: 8bit feature) - } - - // STAGE 1: read feature data, and gradient and hessian - // first half of the threads read feature data from global memory - // We will prefetch data into the "next" variable at the beginning of each iteration - uchar feature; - uchar feature_next; - //uint8_t bin; - ushort bin; - - feature = feature_data[ind >> feature_mask]; - if (feature_mask) { - feature = (feature >> ((ind & 1) << 2)) & 0xf; - } - bin = feature; - acc_type grad_bin = 0.0f, hess_bin = 0.0f; - acc_type *addr_bin; - - // store gradient and hessian - score_t grad, hess; - score_t grad_next, hess_next; - // LGBM_CUDA v5.1 - grad = ordered_gradients[ind]; - #if CONST_HESSIAN == 0 - hess = ordered_hessians[ind]; - #endif - - - // there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4 - for (uint i = subglobal_tid; i < num_data; i += subglobal_size) { - // prefetch the next iteration variables - // we don't need bondary check because we have made the buffer large - int i_next = i + subglobal_size; - #ifdef IGNORE_INDICES - // we need to check to bounds here - ind_next = i_next < num_data ? i_next : i; - #else - ind_next = data_indices[i_next]; - #endif - - // imbGBT v5.1 - grad_next = ordered_gradients[ind_next]; - #if CONST_HESSIAN == 0 - hess_next = ordered_hessians[ind_next]; - #endif - - // STAGE 2: accumulate gradient and hessian - if (bin != feature) { - addr_bin = gh_hist + bin * 2 + is_hessian_first; - #if CONST_HESSIAN == 0 - acc_type acc_bin = is_hessian_first? hess_bin : grad_bin; - atomic_local_add_f(addr_bin, acc_bin); - - addr_bin = addr_bin + 1 - 2 * is_hessian_first; - acc_bin = is_hessian_first? 
grad_bin : hess_bin; - atomic_local_add_f(addr_bin, acc_bin); - - #elif CONST_HESSIAN == 1 - atomic_local_add_f(addr_bin, grad_bin); - #endif - - bin = feature; - grad_bin = grad; - hess_bin = hess; - } - else { - grad_bin += grad; - hess_bin += hess; - } - - // prefetch the next iteration variables - feature_next = feature_data[ind_next >> feature_mask]; - - // STAGE 3: accumulate counter - atomicAdd(cnt_hist + feature, 1); - - // STAGE 4: update next stat - grad = grad_next; - hess = hess_next; - // LGBM_CUDA: v4.2 - if (!feature_mask) { - feature = feature_next; - } else { - feature = (feature_next >> ((ind_next & 1) << 2)) & 0xf; - } - } - - - addr_bin = gh_hist + bin * 2 + is_hessian_first; - #if CONST_HESSIAN == 0 - acc_type acc_bin = is_hessian_first? hess_bin : grad_bin; - atomic_local_add_f(addr_bin, acc_bin); - - addr_bin = addr_bin + 1 - 2 * is_hessian_first; - acc_bin = is_hessian_first? grad_bin : hess_bin; - atomic_local_add_f(addr_bin, acc_bin); - - #elif CONST_HESSIAN == 1 - atomic_local_add_f(addr_bin, grad_bin); - #endif - __syncthreads(); + // allocate the local memory array aligned with float2, to guarantee correct alignment on NVIDIA platforms + // otherwise a "Misaligned Address" exception may occur + __shared__ float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)]; + const uint gtid = blockIdx.x * blockDim.x + threadIdx.x; + const ushort ltid = threadIdx.x; + const ushort lsize = NUM_BINS; // get_local_size(0); + const ushort group_id = blockIdx.x; + + // local memory per workgroup is 3 KB + // clear local memory + uint *ptr = reinterpret_cast(shared_array); + for (int i = ltid; i < LOCAL_MEM_SIZE/sizeof(uint); i += lsize) { + ptr[i] = 0; + } + __syncthreads(); + // gradient/hessian histograms + // assume this starts at 32 * 4 = 128-byte boundary // LGBM_CUDA: What does it mean? boundary?? + // total size: 2 * 256 * size_of(float) = 2 KB + // organization: each feature/grad/hessian is at a different bank, + // as indepedent of the feature value as possible + acc_type *gh_hist = reinterpret_cast(shared_array); + + // counter histogram + // total size: 256 * size_of(uint) = 1 KB + uint *cnt_hist = reinterpret_cast(gh_hist + 2 * NUM_BINS); + + // odd threads (1, 3, ...) compute histograms for hessians first + // even thread (0, 2, ...) compute histograms for gradients first + // etc. 
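
The block of new (+) lines above carves a single __shared__ float2 array into an interleaved gradient/hessian histogram followed by a counter histogram. A standalone sketch of that carving, with acc_type fixed to float and NUM_BINS to 256 for concreteness (the real kernels take both from their compile-time configuration), makes the 2 KB + 1 KB = 3 KB split explicit:

    #include <cstdio>
    #include <cuda_runtime.h>

    // Standalone sketch of the shared-memory carving used by the histogram kernels,
    // with NUM_BINS = 256 and acc_type = float assumed for illustration.
    #define NUM_BINS 256
    typedef float acc_type;
    #define LOCAL_MEM_SIZE ((sizeof(unsigned int) + 2 * sizeof(acc_type)) * NUM_BINS)

    __global__ void layout_demo() {
      // float2 sizing keeps the whole block 8-byte aligned, as in the kernels above
      __shared__ float2 shared_array[LOCAL_MEM_SIZE / sizeof(float2)];
      // entry 2*b holds the gradient sum of bin b, entry 2*b+1 its hessian sum
      acc_type *gh_hist = reinterpret_cast<acc_type *>(shared_array);
      // per-bin sample counters start right after the 2 * NUM_BINS accumulators
      unsigned int *cnt_hist = reinterpret_cast<unsigned int *>(gh_hist + 2 * NUM_BINS);
      if (blockIdx.x == 0 && threadIdx.x == 0) {
        printf("shared bytes per block: %u\n", (unsigned int)LOCAL_MEM_SIZE);        // 3 KB
        printf("counter offset (bytes): %u\n",
               (unsigned int)((char *)cnt_hist - (char *)gh_hist));                  // 2 KB
      }
    }

    int main() {
      layout_demo<<<1, 1>>>();
      cudaDeviceSynchronize();
      return 0;
    }
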
+ uchar is_hessian_first = ltid & 1; + + ushort feature_id = group_id >> power_feature_workgroups; + + // each 2^POWER_FEATURE_WORKGROUPS workgroups process on one feature (compile-time constant) + // feature_size is the number of examples per feature + const uchar *feature_data = feature_data_base + feature_id * feature_size; + + // size of threads that process this feature4 + const uint subglobal_size = lsize * (1 << power_feature_workgroups); + + // equavalent thread ID in this subgroup for this feature4 + const uint subglobal_tid = gtid - feature_id * subglobal_size; + + + data_size_t ind; + data_size_t ind_next; + #ifdef IGNORE_INDICES + ind = subglobal_tid; + #else + ind = data_indices[subglobal_tid]; + #endif + + // extract feature mask, when a byte is set to 0, that feature is disabled + uchar feature_mask = feature_masks[feature_id]; + // exit if the feature is masked + if (!feature_mask) { + return; + } else { + feature_mask = feature_mask - 1; // LGBM_CUDA: feature_mask is used for get feature (1: 4bit feature, 0: 8bit feature) + } - #if CONST_HESSIAN == 1 - // make a final reduction - gh_hist[ltid * 2] += gh_hist[ltid * 2 + 1]; - gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // LGBM_CUDA: counter move to this position - __syncthreads(); - #endif + // STAGE 1: read feature data, and gradient and hessian + // first half of the threads read feature data from global memory + // We will prefetch data into the "next" variable at the beginning of each iteration + uchar feature; + uchar feature_next; + // uint8_t bin; + ushort bin; + + feature = feature_data[ind >> feature_mask]; + if (feature_mask) { + feature = (feature >> ((ind & 1) << 2)) & 0xf; + } + bin = feature; + acc_type grad_bin = 0.0f, hess_bin = 0.0f; + acc_type *addr_bin; + + // store gradient and hessian + score_t grad, hess; + score_t grad_next, hess_next; + // LGBM_CUDA v5.1 + grad = ordered_gradients[ind]; + #if CONST_HESSIAN == 0 + hess = ordered_hessians[ind]; + #endif + + // there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4 + for (uint i = subglobal_tid; i < num_data; i += subglobal_size) { + // prefetch the next iteration variables + // we don't need bondary check because we have made the buffer large + int i_next = i + subglobal_size; + #ifdef IGNORE_INDICES + // we need to check to bounds here + ind_next = i_next < num_data ? i_next : i; + #else + ind_next = data_indices[i_next]; + #endif + + // imbGBT v5.1 + grad_next = ordered_gradients[ind_next]; + #if CONST_HESSIAN == 0 + hess_next = ordered_hessians[ind_next]; + #endif + + // STAGE 2: accumulate gradient and hessian + if (bin != feature) { + addr_bin = gh_hist + bin * 2 + is_hessian_first; + #if CONST_HESSIAN == 0 + acc_type acc_bin = is_hessian_first ? hess_bin : grad_bin; + atomic_local_add_f(addr_bin, acc_bin); + + addr_bin = addr_bin + 1 - 2 * is_hessian_first; + acc_bin = is_hessian_first ? 
grad_bin : hess_bin; + atomic_local_add_f(addr_bin, acc_bin); + + #elif CONST_HESSIAN == 1 + atomic_local_add_f(addr_bin, grad_bin); + #endif + + bin = feature; + grad_bin = grad; + hess_bin = hess; + } else { + grad_bin += grad; + hess_bin += hess; + } + + // prefetch the next iteration variables + feature_next = feature_data[ind_next >> feature_mask]; + + // STAGE 3: accumulate counter + atomicAdd(cnt_hist + feature, 1); + + // STAGE 4: update next stat + grad = grad_next; + hess = hess_next; + // LGBM_CUDA: v4.2 + if (!feature_mask) { + feature = feature_next; + } else { + feature = (feature_next >> ((ind_next & 1) << 2)) & 0xf; + } + } + + + addr_bin = gh_hist + bin * 2 + is_hessian_first; + #if CONST_HESSIAN == 0 + acc_type acc_bin = is_hessian_first ? hess_bin : grad_bin; + atomic_local_add_f(addr_bin, acc_bin); + + addr_bin = addr_bin + 1 - 2 * is_hessian_first; + acc_bin = is_hessian_first ? grad_bin : hess_bin; + atomic_local_add_f(addr_bin, acc_bin); + + #elif CONST_HESSIAN == 1 + atomic_local_add_f(addr_bin, grad_bin); + #endif + __syncthreads(); + + #if CONST_HESSIAN == 1 + // make a final reduction + gh_hist[ltid * 2] += gh_hist[ltid * 2 + 1]; + gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // LGBM_CUDA: counter move to this position + __syncthreads(); + #endif #if POWER_FEATURE_WORKGROUPS != 0 - acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 3 * NUM_BINS; - // write gradients and hessians - acc_type *__restrict__ ptr_f = output; - for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { - // even threads read gradients, odd threads read hessians - // FIXME: 2-way bank conflict - acc_type value = gh_hist[i]; - ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; - } - // write counts - acc_int_type *__restrict__ ptr_i = (acc_int_type *)(output + 2 * NUM_BINS); - for (ushort i = ltid; i < NUM_BINS; i += lsize) { - // FIXME: 2-way bank conflict - uint value = cnt_hist[i]; - ptr_i[i] = value; - } - // FIXME: is this right - __syncthreads(); - __threadfence(); - // To avoid the cost of an extra reducting kernel, we have to deal with some - // gray area in OpenCL. We want the last work group that process this feature to - // make the final reduction, and other threads will just quit. - // This requires that the results written by other workgroups available to the - // last workgroup (memory consistency) - #if NVIDIA == 1 - // this is equavalent to CUDA __threadfence(); - // ensure the writes above goes to main memory and other workgroups can see it - asm volatile("{\n\tmembar.gl;\n\t}\n\t" :::"memory"); - #else - // FIXME: how to do the above on AMD GPUs?? - // GCN ISA says that the all writes will bypass L1 cache (write through), - // however when the last thread is reading sub-histogram data we have to - // make sure that no part of data is modified in local L1 cache of other workgroups. - // Otherwise reading can be a problem (atomic operations to get consistency). - // But in our case, the sub-histogram of this workgroup cannot be in the cache - // of another workgroup, so the following trick will work just fine. - #endif - // Now, we want one workgroup to do the final reduction. - // Other workgroups processing the same feature quit. - // The is done by using an global atomic counter. - // On AMD GPUs ideally this should be done in GDS, - // but currently there is no easy way to access it via OpenCL. 
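
The rewritten accumulation loop above keeps a running (grad_bin, hess_bin) pair in registers and only issues a shared-memory atomic when the prefetched sample lands in a different bin than the current one; it also unpacks two 4-bit bins per byte whenever feature_mask marks the feature group as 4-bit. The serial host-side sketch below condenses that control flow (plain float accumulators and illustrative names stand in for the kernel's acc_type/score_t machinery; it is not code from this patch):

    #include <cstdint>
    #include <vector>

    // Serial sketch of the "flush on bin change" accumulation: consecutive samples
    // that hit the same bin are summed in local variables and written to the
    // histogram only when the bin changes, which is what lets the kernel above get
    // away with far fewer shared-memory atomics.  decode_4bit mirrors the
    // feature_mask trick of packing two 4-bit bins into one byte.
    static inline uint8_t decode_4bit(const uint8_t *packed, int ind) {
      return (packed[ind >> 1] >> ((ind & 1) << 2)) & 0xf;
    }

    void build_histogram_sketch(const uint8_t *packed_bins, const float *grad,
                                const float *hess, int num_data,
                                std::vector<float> *gh_hist /* 2 * num_bins entries */) {
      if (num_data <= 0) return;
      int cur_bin = decode_4bit(packed_bins, 0);
      float grad_acc = 0.0f, hess_acc = 0.0f;
      for (int i = 0; i < num_data; ++i) {
        const int bin = decode_4bit(packed_bins, i);
        if (bin != cur_bin) {                        // flush the finished run
          (*gh_hist)[2 * cur_bin] += grad_acc;
          (*gh_hist)[2 * cur_bin + 1] += hess_acc;
          cur_bin = bin;
          grad_acc = 0.0f;
          hess_acc = 0.0f;
        }
        grad_acc += grad[i];
        hess_acc += hess[i];
      }
      (*gh_hist)[2 * cur_bin] += grad_acc;           // flush the last run
      (*gh_hist)[2 * cur_bin + 1] += hess_acc;
    }
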
- uint * counter_val = cnt_hist; - // backup the old value - uint old_val = *counter_val; - if (ltid == 0) { - // all workgroups processing the same feature add this counter - *counter_val = atomicAdd(const_cast(sync_counters + feature_id), 1); - } - // make sure everyone in this workgroup is here - __syncthreads(); - // everyone in this wrokgroup: if we are the last workgroup, then do reduction! - if (*counter_val == (1 << power_feature_workgroups) - 1) { - if (ltid == 0) { - sync_counters[feature_id] = 0; - } - //} - #else - } - // only 1 work group, no need to increase counter - // the reduction will become a simple copy - if (1) { - uint old_val; // dummy - #endif - // locate our feature's block in output memory - uint output_offset = (feature_id << power_feature_workgroups); - acc_type const * __restrict__ feature_subhists = - (acc_type *)output_buf + output_offset * 3 * NUM_BINS; - // skip reading the data already in local memory - //uint skip_id = feature_id ^ output_offset; - uint skip_id = group_id - output_offset; - // locate output histogram location for this feature4 - acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS; - - - within_kernel_reduction16x4(feature_subhists, skip_id, old_val, 1 << power_feature_workgroups, hist_buf, (acc_type *)shared_array, power_feature_workgroups); - } + acc_type *__restrict__ output = reinterpret_cast(output_buf) + group_id * 3 * NUM_BINS; + // write gradients and hessians + acc_type *__restrict__ ptr_f = output; + for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { + // even threads read gradients, odd threads read hessians + // FIXME: 2-way bank conflict + acc_type value = gh_hist[i]; + ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; + } + // write counts + acc_int_type *__restrict__ ptr_i = reinterpret_cast(output + 2 * NUM_BINS); + for (ushort i = ltid; i < NUM_BINS; i += lsize) { + // FIXME: 2-way bank conflict + uint value = cnt_hist[i]; + ptr_i[i] = value; + } + // FIXME: is this right + __syncthreads(); + __threadfence(); + // To avoid the cost of an extra reducting kernel, we have to deal with some + // gray area in OpenCL. We want the last work group that process this feature to + // make the final reduction, and other threads will just quit. + // This requires that the results written by other workgroups available to the + // last workgroup (memory consistency) + #if NVIDIA == 1 + // this is equavalent to CUDA __threadfence(); + // ensure the writes above goes to main memory and other workgroups can see it + asm volatile("{\n\tmembar.gl;\n\t}\n\t" :::"memory"); + #else + // FIXME: how to do the above on AMD GPUs?? + // GCN ISA says that the all writes will bypass L1 cache (write through), + // however when the last thread is reading sub-histogram data we have to + // make sure that no part of data is modified in local L1 cache of other workgroups. + // Otherwise reading can be a problem (atomic operations to get consistency). + // But in our case, the sub-histogram of this workgroup cannot be in the cache + // of another workgroup, so the following trick will work just fine. + #endif + // Now, we want one workgroup to do the final reduction. + // Other workgroups processing the same feature quit. + // The is done by using an global atomic counter. + // On AMD GPUs ideally this should be done in GDS, + // but currently there is no easy way to access it via OpenCL. 
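
The comment block above describes the termination trick used by all three kernels: each workgroup writes its sub-histogram to global memory, issues a fence, and atomically increments a per-feature counter; the workgroup that observes the value 2^power_feature_workgroups - 1 knows it is the last one, performs the final reduction, and resets the counter, while the others simply return. A minimal standalone CUDA sketch of that pattern (reducing a single float per block rather than a whole histogram; all names are illustrative, not taken from this patch):

    #include <cstdio>
    #include <cuda_runtime.h>

    // Each block publishes a partial result, then the last block to pass the
    // atomic counter combines them -- the "global atomic counter elects the
    // reducer" idea used by the kernels above, applied to one float per block.
    __global__ void last_block_reduces(float *partials, int *sync_counter,
                                       float *result, int num_blocks) {
      if (threadIdx.x == 0) {
        partials[blockIdx.x] = 1.0f;          // this block's partial result
        __threadfence();                      // make it visible to the other blocks
      }
      __shared__ int ticket;
      if (threadIdx.x == 0) {
        ticket = atomicAdd(sync_counter, 1);  // order in which blocks finished
      }
      __syncthreads();
      if (ticket == num_blocks - 1) {         // we are the last block: reduce
        if (threadIdx.x == 0) {
          float sum = 0.0f;
          for (int i = 0; i < num_blocks; ++i) sum += partials[i];
          *result = sum;
          *sync_counter = 0;                  // reset for the next launch
        }
      }
    }

    int main() {
      const int num_blocks = 8;
      float *partials, *result;
      int *counter;
      cudaMalloc(reinterpret_cast<void **>(&partials), num_blocks * sizeof(float));
      cudaMalloc(reinterpret_cast<void **>(&result), sizeof(float));
      cudaMalloc(reinterpret_cast<void **>(&counter), sizeof(int));
      cudaMemset(counter, 0, sizeof(int));
      last_block_reduces<<<num_blocks, 32>>>(partials, counter, result, num_blocks);
      float host_result = 0.0f;
      cudaMemcpy(&host_result, result, sizeof(float), cudaMemcpyDeviceToHost);
      printf("sum of partials = %f\n", host_result);  // expect 8.0
      cudaFree(partials); cudaFree(result); cudaFree(counter);
      return 0;
    }
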
+ uint * counter_val = cnt_hist; + // backup the old value + uint old_val = *counter_val; + if (ltid == 0) { + // all workgroups processing the same feature add this counter + *counter_val = atomicAdd(const_cast(sync_counters + feature_id), 1); + } + // make sure everyone in this workgroup is here + __syncthreads(); + // everyone in this wrokgroup: if we are the last workgroup, then do reduction! + if (*counter_val == (1 << power_feature_workgroups) - 1) { + if (ltid == 0) { + sync_counters[feature_id] = 0; + } + // } +#else + } + // only 1 work group, no need to increase counter + // the reduction will become a simple copy + if (1) { + uint old_val; // dummy +#endif + // locate our feature's block in output memory + uint output_offset = (feature_id << power_feature_workgroups); + acc_type const * __restrict__ feature_subhists = + reinterpret_cast(output_buf) + output_offset * 3 * NUM_BINS; + // skip reading the data already in local memory + // uint skip_id = feature_id ^ output_offset; + uint skip_id = group_id - output_offset; + // locate output histogram location for this feature4 + acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS; + + within_kernel_reduction16x4(feature_subhists, skip_id, old_val, 1 << power_feature_workgroups, hist_buf, reinterpret_cast(shared_array), power_feature_workgroups); + } } // end of histogram16 stuff @@ -385,13 +381,13 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #ifdef IGNORE_INDICES #define KERNEL_NAME histogram64_fulldata #else // IGNORE_INDICES -#define KERNEL_NAME histogram64 // seems like ENABLE_ALL_FEATURES is set to 1 in the header if its disabled -//#define KERNEL_NAME histogram64_allfeats -#endif // IGNORE_INDICES -#else // ENABLE_ALL_FEATURES +#define KERNEL_NAME histogram64 // seems like ENABLE_ALL_FEATURES is set to 1 in the header if its disabled +// #define KERNEL_NAME histogram64_allfeats +#endif // IGNORE_INDICES +#else // ENABLE_ALL_FEATURES #error "ENABLE_ALL_FEATURES should always be 1" #define KERNEL_NAME histogram64 -#endif // ENABLE_ALL_FEATURES +#endif // ENABLE_ALL_FEATURES #define NUM_BINS 64 #define LOCAL_MEM_SIZE ((sizeof(uint) + 2 * sizeof(acc_type)) * NUM_BINS) @@ -405,10 +401,10 @@ inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ acc_type* __restrict__ local_hist, const size_t power_feature_workgroups) { const ushort ltid = threadIdx.x; - // TODO: try to avoid bank conflict here + // TODO(anyone): try to avoid bank conflict here acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; - uint* __restrict__ local_cnt = (uint *)(local_hist + 2 * NUM_BINS); + uint* __restrict__ local_cnt = reinterpret_cast(local_hist + 2 * NUM_BINS); uint cont_bin; if (power_feature_workgroups != 0) { @@ -428,7 +424,7 @@ inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ } // skip the counters we already have - p += 3 * NUM_BINS; + p += 3 * NUM_BINS; for (i = i + 1; i < num_sub_hist; ++i) { grad_bin += *p; p += NUM_BINS; @@ -448,12 +444,12 @@ inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ } #if USE_CONSTANT_BUF == 1 -__kernel void KERNEL_NAME(__global const uchar* restrict feature_data_base, +__kernel void KERNEL_NAME(__global const uchar* restrict feature_data_base, __constant const uchar* restrict feature_masks __attribute__((max_constant_size(65536))), const data_size_t feature_size, - __constant const data_size_t* restrict data_indices 
__attribute__((max_constant_size(65536))), - const data_size_t num_data, - __constant const score_t* restrict ordered_gradients __attribute__((max_constant_size(65536))), + __constant const data_size_t* restrict data_indices __attribute__((max_constant_size(65536))), + const data_size_t num_data, + __constant const score_t* restrict ordered_gradients __attribute__((max_constant_size(65536))), #if CONST_HESSIAN == 0 __constant const score_t* restrict ordered_hessians __attribute__((max_constant_size(65536))), #else @@ -464,269 +460,264 @@ __kernel void KERNEL_NAME(__global const uchar* restrict feature_data_base, acc_type* __restrict__ hist_buf_base, const size_t power_feature_workgroups) { #else -__global__ void KERNEL_NAME(const uchar* feature_data_base, +__global__ void KERNEL_NAME(const uchar* feature_data_base, // FIXME: how to handle this __constant const uchar* __restrict__ feature_masks, const data_size_t feature_size, - const data_size_t* data_indices, - const data_size_t num_data, - const score_t* ordered_gradients, + const data_size_t* data_indices, + const data_size_t num_data, + const score_t* ordered_gradients, #if CONST_HESSIAN == 0 const score_t* ordered_hessians, #else const score_t const_hessian, #endif - char* __restrict__ output_buf, + char* __restrict__ output_buf, volatile int * sync_counters, acc_type* __restrict__ hist_buf_base, const size_t power_feature_workgroups) { #endif - // allocate the local memory array aligned with float2, to guarantee correct alignment on NVIDIA platforms - // otherwise a "Misaligned Address" exception may occur - __shared__ float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)]; - const uint gtid = blockIdx.x * blockDim.x + threadIdx.x; - const ushort ltid = threadIdx.x; - const ushort lsize = NUM_BINS; // get_local_size(0); - const ushort group_id = blockIdx.x; - - // local memory per workgroup is 3 KB - // clear local memory - uint *ptr = (uint *) shared_array; - for (int i = ltid; i < LOCAL_MEM_SIZE/sizeof(uint); i += lsize) { - ptr[i] = 0; - } - __syncthreads(); - // gradient/hessian histograms - // assume this starts at 32 * 4 = 128-byte boundary // LGBM_CUDA: What does it mean? boundary?? - // total size: 2 * 256 * size_of(float) = 2 KB - // organization: each feature/grad/hessian is at a different bank, - // as indepedent of the feature value as possible - acc_type *gh_hist = (acc_type *)shared_array; - - // counter histogram - // total size: 256 * size_of(uint) = 1 KB - uint *cnt_hist = (uint *)(gh_hist + 2 * NUM_BINS); - - // odd threads (1, 3, ...) compute histograms for hessians first - // even thread (0, 2, ...) compute histograms for gradients first - // etc. 
- uchar is_hessian_first = ltid & 1; - - ushort feature_id = group_id >> power_feature_workgroups; - - // each 2^POWER_FEATURE_WORKGROUPS workgroups process on one feature (compile-time constant) - // feature_size is the number of examples per feature - const uchar *feature_data = feature_data_base + feature_id * feature_size; - - // size of threads that process this feature4 - const uint subglobal_size = lsize * (1 << power_feature_workgroups); - - // equavalent thread ID in this subgroup for this feature4 - const uint subglobal_tid = gtid - feature_id * subglobal_size; - - - data_size_t ind; - data_size_t ind_next; - #ifdef IGNORE_INDICES - ind = subglobal_tid; - #else - ind = data_indices[subglobal_tid]; - #endif - - // extract feature mask, when a byte is set to 0, that feature is disabled - uchar feature_mask = feature_masks[feature_id]; - // exit if the feature is masked - if (!feature_mask) { - return; - } else { - feature_mask = feature_mask - 1; // LGBM_CUDA: feature_mask is used for get feature (1: 4bit feature, 0: 8bit feature) - } - - // STAGE 1: read feature data, and gradient and hessian - // first half of the threads read feature data from global memory - // We will prefetch data into the "next" variable at the beginning of each iteration - uchar feature; - uchar feature_next; - //uint8_t bin; - ushort bin; - - feature = feature_data[ind >> feature_mask]; - if (feature_mask) { - feature = (feature >> ((ind & 1) << 2)) & 0xf; - } - bin = feature; - acc_type grad_bin = 0.0f, hess_bin = 0.0f; - acc_type *addr_bin; - - // store gradient and hessian - score_t grad, hess; - score_t grad_next, hess_next; - // LGBM_CUDA v5.1 - grad = ordered_gradients[ind]; - #if CONST_HESSIAN == 0 - hess = ordered_hessians[ind]; - #endif - - - // there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4 - for (uint i = subglobal_tid; i < num_data; i += subglobal_size) { - // prefetch the next iteration variables - // we don't need bondary check because we have made the buffer large - int i_next = i + subglobal_size; - #ifdef IGNORE_INDICES - // we need to check to bounds here - ind_next = i_next < num_data ? i_next : i; - #else - ind_next = data_indices[i_next]; - #endif - - // imbGBT v5.1 - grad_next = ordered_gradients[ind_next]; - #if CONST_HESSIAN == 0 - hess_next = ordered_hessians[ind_next]; - #endif - - // STAGE 2: accumulate gradient and hessian - if (bin != feature) { - addr_bin = gh_hist + bin * 2 + is_hessian_first; - #if CONST_HESSIAN == 0 - acc_type acc_bin = is_hessian_first? hess_bin : grad_bin; - atomic_local_add_f(addr_bin, acc_bin); - - addr_bin = addr_bin + 1 - 2 * is_hessian_first; - acc_bin = is_hessian_first? grad_bin : hess_bin; - atomic_local_add_f(addr_bin, acc_bin); - - #elif CONST_HESSIAN == 1 - atomic_local_add_f(addr_bin, grad_bin); - #endif - - bin = feature; - grad_bin = grad; - hess_bin = hess; - } - else { - grad_bin += grad; - hess_bin += hess; - } - - // prefetch the next iteration variables - feature_next = feature_data[ind_next >> feature_mask]; - - // STAGE 3: accumulate counter - atomicAdd(cnt_hist + feature, 1); - - // STAGE 4: update next stat - grad = grad_next; - hess = hess_next; - // LGBM_CUDA: v4.2 - if (!feature_mask) { - feature = feature_next; - } else { - feature = (feature_next >> ((ind_next & 1) << 2)) & 0xf; - } - } - - - addr_bin = gh_hist + bin * 2 + is_hessian_first; - #if CONST_HESSIAN == 0 - acc_type acc_bin = is_hessian_first? 
hess_bin : grad_bin; - atomic_local_add_f(addr_bin, acc_bin); - - addr_bin = addr_bin + 1 - 2 * is_hessian_first; - acc_bin = is_hessian_first? grad_bin : hess_bin; - atomic_local_add_f(addr_bin, acc_bin); - - #elif CONST_HESSIAN == 1 - atomic_local_add_f(addr_bin, grad_bin); - #endif - __syncthreads(); + // allocate the local memory array aligned with float2, to guarantee correct alignment on NVIDIA platforms + // otherwise a "Misaligned Address" exception may occur + __shared__ float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)]; + const uint gtid = blockIdx.x * blockDim.x + threadIdx.x; + const ushort ltid = threadIdx.x; + const ushort lsize = NUM_BINS; // get_local_size(0); + const ushort group_id = blockIdx.x; + + // local memory per workgroup is 3 KB + // clear local memory + uint *ptr = reinterpret_cast(shared_array); + for (int i = ltid; i < LOCAL_MEM_SIZE/sizeof(uint); i += lsize) { + ptr[i] = 0; + } + __syncthreads(); + // gradient/hessian histograms + // assume this starts at 32 * 4 = 128-byte boundary // LGBM_CUDA: What does it mean? boundary?? + // total size: 2 * 256 * size_of(float) = 2 KB + // organization: each feature/grad/hessian is at a different bank, + // as indepedent of the feature value as possible + acc_type *gh_hist = reinterpret_cast(shared_array); + + // counter histogram + // total size: 256 * size_of(uint) = 1 KB + uint *cnt_hist = reinterpret_cast(gh_hist + 2 * NUM_BINS); + + // odd threads (1, 3, ...) compute histograms for hessians first + // even thread (0, 2, ...) compute histograms for gradients first + // etc. + uchar is_hessian_first = ltid & 1; + + ushort feature_id = group_id >> power_feature_workgroups; + + // each 2^POWER_FEATURE_WORKGROUPS workgroups process on one feature (compile-time constant) + // feature_size is the number of examples per feature + const uchar *feature_data = feature_data_base + feature_id * feature_size; + + // size of threads that process this feature4 + const uint subglobal_size = lsize * (1 << power_feature_workgroups); + + // equavalent thread ID in this subgroup for this feature4 + const uint subglobal_tid = gtid - feature_id * subglobal_size; + + data_size_t ind; + data_size_t ind_next; + #ifdef IGNORE_INDICES + ind = subglobal_tid; + #else + ind = data_indices[subglobal_tid]; + #endif + + // extract feature mask, when a byte is set to 0, that feature is disabled + uchar feature_mask = feature_masks[feature_id]; + // exit if the feature is masked + if (!feature_mask) { + return; + } else { + feature_mask = feature_mask - 1; // LGBM_CUDA: feature_mask is used for get feature (1: 4bit feature, 0: 8bit feature) + } - #if CONST_HESSIAN == 1 - // make a final reduction - gh_hist[ltid * 2] += gh_hist[ltid * 2 + 1]; - gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // LGBM_CUDA: counter move to this position - __syncthreads(); - #endif + // STAGE 1: read feature data, and gradient and hessian + // first half of the threads read feature data from global memory + // We will prefetch data into the "next" variable at the beginning of each iteration + uchar feature; + uchar feature_next; + // uint8_t bin; + ushort bin; + + feature = feature_data[ind >> feature_mask]; + if (feature_mask) { + feature = (feature >> ((ind & 1) << 2)) & 0xf; + } + bin = feature; + acc_type grad_bin = 0.0f, hess_bin = 0.0f; + acc_type *addr_bin; + + // store gradient and hessian + score_t grad, hess; + score_t grad_next, hess_next; + // LGBM_CUDA v5.1 + grad = ordered_gradients[ind]; + #if CONST_HESSIAN == 0 + hess = ordered_hessians[ind]; + 
#endif + + // there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4 + for (uint i = subglobal_tid; i < num_data; i += subglobal_size) { + // prefetch the next iteration variables + // we don't need bondary check because we have made the buffer large + int i_next = i + subglobal_size; + #ifdef IGNORE_INDICES + // we need to check to bounds here + ind_next = i_next < num_data ? i_next : i; + #else + ind_next = data_indices[i_next]; + #endif + + // imbGBT v5.1 + grad_next = ordered_gradients[ind_next]; + #if CONST_HESSIAN == 0 + hess_next = ordered_hessians[ind_next]; + #endif + + // STAGE 2: accumulate gradient and hessian + if (bin != feature) { + addr_bin = gh_hist + bin * 2 + is_hessian_first; + #if CONST_HESSIAN == 0 + acc_type acc_bin = is_hessian_first ? hess_bin : grad_bin; + atomic_local_add_f(addr_bin, acc_bin); + + addr_bin = addr_bin + 1 - 2 * is_hessian_first; + acc_bin = is_hessian_first ? grad_bin : hess_bin; + atomic_local_add_f(addr_bin, acc_bin); + + #elif CONST_HESSIAN == 1 + atomic_local_add_f(addr_bin, grad_bin); + #endif + + bin = feature; + grad_bin = grad; + hess_bin = hess; + } else { + grad_bin += grad; + hess_bin += hess; + } + + // prefetch the next iteration variables + feature_next = feature_data[ind_next >> feature_mask]; + + // STAGE 3: accumulate counter + atomicAdd(cnt_hist + feature, 1); + + // STAGE 4: update next stat + grad = grad_next; + hess = hess_next; + // LGBM_CUDA: v4.2 + if (!feature_mask) { + feature = feature_next; + } else { + feature = (feature_next >> ((ind_next & 1) << 2)) & 0xf; + } + } + + addr_bin = gh_hist + bin * 2 + is_hessian_first; + #if CONST_HESSIAN == 0 + acc_type acc_bin = is_hessian_first ? hess_bin : grad_bin; + atomic_local_add_f(addr_bin, acc_bin); + + addr_bin = addr_bin + 1 - 2 * is_hessian_first; + acc_bin = is_hessian_first ? grad_bin : hess_bin; + atomic_local_add_f(addr_bin, acc_bin); + + #elif CONST_HESSIAN == 1 + atomic_local_add_f(addr_bin, grad_bin); + #endif + __syncthreads(); + + #if CONST_HESSIAN == 1 + // make a final reduction + gh_hist[ltid * 2] += gh_hist[ltid * 2 + 1]; + gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // LGBM_CUDA: counter move to this position + __syncthreads(); + #endif #if POWER_FEATURE_WORKGROUPS != 0 - acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 3 * NUM_BINS; - // write gradients and hessians - acc_type *__restrict__ ptr_f = output; - for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { - // even threads read gradients, odd threads read hessians - // FIXME: 2-way bank conflict - acc_type value = gh_hist[i]; - ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; - } - // write counts - acc_int_type *__restrict__ ptr_i = (acc_int_type *)(output + 2 * NUM_BINS); - for (ushort i = ltid; i < NUM_BINS; i += lsize) { - // FIXME: 2-way bank conflict - uint value = cnt_hist[i]; - ptr_i[i] = value; - } - // FIXME: is this right - __syncthreads(); - __threadfence(); - // To avoid the cost of an extra reducting kernel, we have to deal with some - // gray area in OpenCL. We want the last work group that process this feature to - // make the final reduction, and other threads will just quit. 
- // This requires that the results written by other workgroups available to the - // last workgroup (memory consistency) - #if NVIDIA == 1 - // this is equavalent to CUDA __threadfence(); - // ensure the writes above goes to main memory and other workgroups can see it - asm volatile("{\n\tmembar.gl;\n\t}\n\t" :::"memory"); - #else - // FIXME: how to do the above on AMD GPUs?? - // GCN ISA says that the all writes will bypass L1 cache (write through), - // however when the last thread is reading sub-histogram data we have to - // make sure that no part of data is modified in local L1 cache of other workgroups. - // Otherwise reading can be a problem (atomic operations to get consistency). - // But in our case, the sub-histogram of this workgroup cannot be in the cache - // of another workgroup, so the following trick will work just fine. - #endif - // Now, we want one workgroup to do the final reduction. - // Other workgroups processing the same feature quit. - // The is done by using an global atomic counter. - // On AMD GPUs ideally this should be done in GDS, - // but currently there is no easy way to access it via OpenCL. - uint * counter_val = cnt_hist; - // backup the old value - uint old_val = *counter_val; - if (ltid == 0) { - // all workgroups processing the same feature add this counter - *counter_val = atomicAdd(const_cast(sync_counters + feature_id), 1); - } - // make sure everyone in this workgroup is here - __syncthreads(); - // everyone in this wrokgroup: if we are the last workgroup, then do reduction! - if (*counter_val == (1 << power_feature_workgroups) - 1) { - if (ltid == 0) { - sync_counters[feature_id] = 0; - } - //} - #else - } - // only 1 work group, no need to increase counter - // the reduction will become a simple copy - if (1) { - uint old_val; // dummy - #endif - // locate our feature's block in output memory - uint output_offset = (feature_id << power_feature_workgroups); - acc_type const * __restrict__ feature_subhists = - (acc_type *)output_buf + output_offset * 3 * NUM_BINS; - // skip reading the data already in local memory - //uint skip_id = feature_id ^ output_offset; - uint skip_id = group_id - output_offset; - // locate output histogram location for this feature4 - acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS; - - - within_kernel_reduction64x4(feature_subhists, skip_id, old_val, 1 << power_feature_workgroups, hist_buf, (acc_type *)shared_array, power_feature_workgroups); - } + acc_type *__restrict__ output = reinterpret_cast(output_buf) + group_id * 3 * NUM_BINS; + // write gradients and hessians + acc_type *__restrict__ ptr_f = output; + for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { + // even threads read gradients, odd threads read hessians + // FIXME: 2-way bank conflict + acc_type value = gh_hist[i]; + ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; + } + // write counts + acc_int_type *__restrict__ ptr_i = reinterpret_cast(output + 2 * NUM_BINS); + for (ushort i = ltid; i < NUM_BINS; i += lsize) { + // FIXME: 2-way bank conflict + uint value = cnt_hist[i]; + ptr_i[i] = value; + } + // FIXME: is this right + __syncthreads(); + __threadfence(); + // To avoid the cost of an extra reducting kernel, we have to deal with some + // gray area in OpenCL. We want the last work group that process this feature to + // make the final reduction, and other threads will just quit. 
+ // This requires that the results written by other workgroups available to the + // last workgroup (memory consistency) + #if NVIDIA == 1 + // this is equavalent to CUDA __threadfence(); + // ensure the writes above goes to main memory and other workgroups can see it + asm volatile("{\n\tmembar.gl;\n\t}\n\t" :::"memory"); + #else + // FIXME: how to do the above on AMD GPUs?? + // GCN ISA says that the all writes will bypass L1 cache (write through), + // however when the last thread is reading sub-histogram data we have to + // make sure that no part of data is modified in local L1 cache of other workgroups. + // Otherwise reading can be a problem (atomic operations to get consistency). + // But in our case, the sub-histogram of this workgroup cannot be in the cache + // of another workgroup, so the following trick will work just fine. + #endif + // Now, we want one workgroup to do the final reduction. + // Other workgroups processing the same feature quit. + // The is done by using an global atomic counter. + // On AMD GPUs ideally this should be done in GDS, + // but currently there is no easy way to access it via OpenCL. + uint * counter_val = cnt_hist; + // backup the old value + uint old_val = *counter_val; + if (ltid == 0) { + // all workgroups processing the same feature add this counter + *counter_val = atomicAdd(const_cast(sync_counters + feature_id), 1); + } + // make sure everyone in this workgroup is here + __syncthreads(); + // everyone in this wrokgroup: if we are the last workgroup, then do reduction! + if (*counter_val == (1 << power_feature_workgroups) - 1) { + if (ltid == 0) { + sync_counters[feature_id] = 0; + } + // } +#else + } + // only 1 work group, no need to increase counter + // the reduction will become a simple copy + if (1) { + uint old_val; // dummy +#endif + // locate our feature's block in output memory + uint output_offset = (feature_id << power_feature_workgroups); + acc_type const * __restrict__ feature_subhists = + reinterpret_cast(output_buf) + output_offset * 3 * NUM_BINS; + // skip reading the data already in local memory + // uint skip_id = feature_id ^ output_offset; + uint skip_id = group_id - output_offset; + // locate output histogram location for this feature4 + acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS; + + within_kernel_reduction64x4(feature_subhists, skip_id, old_val, 1 << power_feature_workgroups, hist_buf, reinterpret_cast(shared_array), power_feature_workgroups); + } } // end of histogram64 stuff @@ -739,13 +730,13 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #ifdef IGNORE_INDICES #define KERNEL_NAME histogram256_fulldata #else // IGNORE_INDICES -#define KERNEL_NAME histogram256 // seems like ENABLE_ALL_FEATURES is set to 1 in the header if its disabled -//#define KERNEL_NAME histogram256_allfeats -#endif // IGNORE_INDICES -#else // ENABLE_ALL_FEATURES +#define KERNEL_NAME histogram256 // seems like ENABLE_ALL_FEATURES is set to 1 in the header if its disabled +// #define KERNEL_NAME histogram256_allfeats +#endif // IGNORE_INDICES +#else // ENABLE_ALL_FEATURES #error "ENABLE_ALL_FEATURES should always be 1" #define KERNEL_NAME histogram256 -#endif // ENABLE_ALL_FEATURES +#endif // ENABLE_ALL_FEATURES #define NUM_BINS 256 #define LOCAL_MEM_SIZE ((sizeof(uint) + 2 * sizeof(acc_type)) * NUM_BINS) @@ -759,11 +750,10 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ acc_type* __restrict__ local_hist, const size_t power_feature_workgroups) { const ushort ltid 
= threadIdx.x; - - // TODO: try to avoid bank conflict here + // TODO(anyone): try to avoid bank conflict here acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; - uint* __restrict__ local_cnt = (uint *)(local_hist + 2 * NUM_BINS); + uint* __restrict__ local_cnt = reinterpret_cast(local_hist + 2 * NUM_BINS); uint cont_bin; if (power_feature_workgroups != 0) { @@ -783,7 +773,7 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ } // skip the counters we already have - p += 3 * NUM_BINS; + p += 3 * NUM_BINS; for (i = i + 1; i < num_sub_hist; ++i) { grad_bin += *p; p += NUM_BINS; @@ -803,12 +793,12 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ } #if USE_CONSTANT_BUF == 1 -__kernel void KERNEL_NAME(__global const uchar* restrict feature_data_base, +__kernel void KERNEL_NAME(__global const uchar* restrict feature_data_base, __constant const uchar* restrict feature_masks __attribute__((max_constant_size(65536))), const data_size_t feature_size, - __constant const data_size_t* restrict data_indices __attribute__((max_constant_size(65536))), - const data_size_t num_data, - __constant const score_t* restrict ordered_gradients __attribute__((max_constant_size(65536))), + __constant const data_size_t* restrict data_indices __attribute__((max_constant_size(65536))), + const data_size_t num_data, + __constant const score_t* restrict ordered_gradients __attribute__((max_constant_size(65536))), #if CONST_HESSIAN == 0 __constant const score_t* restrict ordered_hessians __attribute__((max_constant_size(65536))), #else @@ -819,264 +809,264 @@ __kernel void KERNEL_NAME(__global const uchar* restrict feature_data_base, acc_type* __restrict__ hist_buf_base, const size_t power_feature_workgroups) { #else -__global__ void KERNEL_NAME(const uchar* feature_data_base, +__global__ void KERNEL_NAME(const uchar* feature_data_base, // FIXME: how to handle this __constant const uchar* __restrict__ feature_masks, const data_size_t feature_size, - const data_size_t* data_indices, - const data_size_t num_data, - const score_t* ordered_gradients, + const data_size_t* data_indices, + const data_size_t num_data, + const score_t* ordered_gradients, #if CONST_HESSIAN == 0 const score_t* ordered_hessians, #else const score_t const_hessian, #endif - char* __restrict__ output_buf, + char* __restrict__ output_buf, volatile int * sync_counters, acc_type* __restrict__ hist_buf_base, const size_t power_feature_workgroups) { #endif - // allocate the local memory array aligned with float2, to guarantee correct alignment on NVIDIA platforms - // otherwise a "Misaligned Address" exception may occur - __shared__ float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)]; - const uint gtid = blockIdx.x * blockDim.x + threadIdx.x; - const ushort ltid = threadIdx.x; - const ushort lsize = NUM_BINS; // get_local_size(0); - const ushort group_id = blockIdx.x; - - // local memory per workgroup is 3 KB - // clear local memory - uint *ptr = (uint *) shared_array; - for (int i = ltid; i < LOCAL_MEM_SIZE/sizeof(uint); i += lsize) { - ptr[i] = 0; - } - __syncthreads(); - // gradient/hessian histograms - // assume this starts at 32 * 4 = 128-byte boundary // LGBM_CUDA: What does it mean? boundary?? 
- // total size: 2 * 256 * size_of(float) = 2 KB - // organization: each feature/grad/hessian is at a different bank, - // as indepedent of the feature value as possible - acc_type *gh_hist = (acc_type *)shared_array; - - // counter histogram - // total size: 256 * size_of(uint) = 1 KB - uint *cnt_hist = (uint *)(gh_hist + 2 * NUM_BINS); - - // odd threads (1, 3, ...) compute histograms for hessians first - // even thread (0, 2, ...) compute histograms for gradients first - // etc. - uchar is_hessian_first = ltid & 1; - - ushort feature_id = group_id >> power_feature_workgroups; - - // each 2^POWER_FEATURE_WORKGROUPS workgroups process on one feature (compile-time constant) - // feature_size is the number of examples per feature - const uchar *feature_data = feature_data_base + feature_id * feature_size; - - // size of threads that process this feature4 - const uint subglobal_size = lsize * (1 << power_feature_workgroups); - - // equavalent thread ID in this subgroup for this feature4 - const uint subglobal_tid = gtid - feature_id * subglobal_size; - - data_size_t ind; - data_size_t ind_next; - #ifdef IGNORE_INDICES - ind = subglobal_tid; - #else - ind = data_indices[subglobal_tid]; - #endif - - // extract feature mask, when a byte is set to 0, that feature is disabled - uchar feature_mask = feature_masks[feature_id]; - // exit if the feature is masked - if (!feature_mask) { - return; - } else { - feature_mask = feature_mask - 1; // LGBM_CUDA: feature_mask is used for get feature (1: 4bit feature, 0: 8bit feature) - } - - // STAGE 1: read feature data, and gradient and hessian - // first half of the threads read feature data from global memory - // We will prefetch data into the "next" variable at the beginning of each iteration - uchar feature; - uchar feature_next; - //uint8_t bin; - ushort bin; - - feature = feature_data[ind >> feature_mask]; - if (feature_mask) { - feature = (feature >> ((ind & 1) << 2)) & 0xf; - } - bin = feature; - acc_type grad_bin = 0.0f, hess_bin = 0.0f; - acc_type *addr_bin; - - // store gradient and hessian - score_t grad, hess; - score_t grad_next, hess_next; - // LGBM_CUDA v5.1 - grad = ordered_gradients[ind]; - #if CONST_HESSIAN == 0 - hess = ordered_hessians[ind]; - #endif - - // there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4 - for (uint i = subglobal_tid; i < num_data; i += subglobal_size) { - // prefetch the next iteration variables - // we don't need bondary check because we have made the buffer large - int i_next = i + subglobal_size; - #ifdef IGNORE_INDICES - // we need to check to bounds here - ind_next = i_next < num_data ? i_next : i; - #else - ind_next = data_indices[i_next]; - #endif - - // imbGBT v5.1 - grad_next = ordered_gradients[ind_next]; - #if CONST_HESSIAN == 0 - hess_next = ordered_hessians[ind_next]; - #endif - // STAGE 2: accumulate gradient and hessian - if (bin != feature) { - addr_bin = gh_hist + bin * 2 + is_hessian_first; - #if CONST_HESSIAN == 0 - acc_type acc_bin = is_hessian_first? hess_bin : grad_bin; - atomic_local_add_f(addr_bin, acc_bin); - - addr_bin = addr_bin + 1 - 2 * is_hessian_first; - acc_bin = is_hessian_first? 
grad_bin : hess_bin; - atomic_local_add_f(addr_bin, acc_bin); - - #elif CONST_HESSIAN == 1 - atomic_local_add_f(addr_bin, grad_bin); - #endif - - bin = feature; - grad_bin = grad; - hess_bin = hess; - } - else { - grad_bin += grad; - hess_bin += hess; - } - - // prefetch the next iteration variables - feature_next = feature_data[ind_next >> feature_mask]; - - // STAGE 3: accumulate counter - atomicAdd(cnt_hist + feature, 1); - - // STAGE 4: update next stat - grad = grad_next; - hess = hess_next; - // LGBM_CUDA: v4.2 - if (!feature_mask) { - feature = feature_next; - } else { - feature = (feature_next >> ((ind_next & 1) << 2)) & 0xf; - } - } - - addr_bin = gh_hist + bin * 2 + is_hessian_first; - #if CONST_HESSIAN == 0 - acc_type acc_bin = is_hessian_first? hess_bin : grad_bin; - atomic_local_add_f(addr_bin, acc_bin); - - addr_bin = addr_bin + 1 - 2 * is_hessian_first; - acc_bin = is_hessian_first? grad_bin : hess_bin; - - atomic_local_add_f(addr_bin, acc_bin); - - #elif CONST_HESSIAN == 1 - atomic_local_add_f(addr_bin, grad_bin); - #endif - __syncthreads(); + // allocate the local memory array aligned with float2, to guarantee correct alignment on NVIDIA platforms + // otherwise a "Misaligned Address" exception may occur + __shared__ float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)]; + const uint gtid = blockIdx.x * blockDim.x + threadIdx.x; + const ushort ltid = threadIdx.x; + const ushort lsize = NUM_BINS; // get_local_size(0); + const ushort group_id = blockIdx.x; + + // local memory per workgroup is 3 KB + // clear local memory + uint *ptr = reinterpret_cast(shared_array); + for (int i = ltid; i < LOCAL_MEM_SIZE/sizeof(uint); i += lsize) { + ptr[i] = 0; + } + __syncthreads(); + // gradient/hessian histograms + // assume this starts at 32 * 4 = 128-byte boundary // LGBM_CUDA: What does it mean? boundary?? + // total size: 2 * 256 * size_of(float) = 2 KB + // organization: each feature/grad/hessian is at a different bank, + // as indepedent of the feature value as possible + acc_type *gh_hist = reinterpret_cast(shared_array); + + // counter histogram + // total size: 256 * size_of(uint) = 1 KB + uint *cnt_hist = reinterpret_cast(gh_hist + 2 * NUM_BINS); + + // odd threads (1, 3, ...) compute histograms for hessians first + // even thread (0, 2, ...) compute histograms for gradients first + // etc. 
+ uchar is_hessian_first = ltid & 1; + + ushort feature_id = group_id >> power_feature_workgroups; + + // each 2^POWER_FEATURE_WORKGROUPS workgroups process on one feature (compile-time constant) + // feature_size is the number of examples per feature + const uchar *feature_data = feature_data_base + feature_id * feature_size; + + // size of threads that process this feature4 + const uint subglobal_size = lsize * (1 << power_feature_workgroups); + + // equavalent thread ID in this subgroup for this feature4 + const uint subglobal_tid = gtid - feature_id * subglobal_size; + + data_size_t ind; + data_size_t ind_next; + #ifdef IGNORE_INDICES + ind = subglobal_tid; + #else + ind = data_indices[subglobal_tid]; + #endif + + // extract feature mask, when a byte is set to 0, that feature is disabled + uchar feature_mask = feature_masks[feature_id]; + // exit if the feature is masked + if (!feature_mask) { + return; + } else { + feature_mask = feature_mask - 1; // LGBM_CUDA: feature_mask is used for get feature (1: 4bit feature, 0: 8bit feature) + } - #if CONST_HESSIAN == 1 - // make a final reduction - gh_hist[ltid * 2] += gh_hist[ltid * 2 + 1]; - gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // LGBM_CUDA: counter move to this position + // STAGE 1: read feature data, and gradient and hessian + // first half of the threads read feature data from global memory + // We will prefetch data into the "next" variable at the beginning of each iteration + uchar feature; + uchar feature_next; + // uint8_t bin; + ushort bin; + + feature = feature_data[ind >> feature_mask]; + if (feature_mask) { + feature = (feature >> ((ind & 1) << 2)) & 0xf; + } + bin = feature; + acc_type grad_bin = 0.0f, hess_bin = 0.0f; + acc_type *addr_bin; + + // store gradient and hessian + score_t grad, hess; + score_t grad_next, hess_next; + // LGBM_CUDA v5.1 + grad = ordered_gradients[ind]; + #if CONST_HESSIAN == 0 + hess = ordered_hessians[ind]; + #endif + + // there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4 + for (uint i = subglobal_tid; i < num_data; i += subglobal_size) { + // prefetch the next iteration variables + // we don't need bondary check because we have made the buffer large + int i_next = i + subglobal_size; + #ifdef IGNORE_INDICES + // we need to check to bounds here + ind_next = i_next < num_data ? i_next : i; + #else + ind_next = data_indices[i_next]; + #endif + + // imbGBT v5.1 + grad_next = ordered_gradients[ind_next]; + #if CONST_HESSIAN == 0 + hess_next = ordered_hessians[ind_next]; + #endif + // STAGE 2: accumulate gradient and hessian + if (bin != feature) { + addr_bin = gh_hist + bin * 2 + is_hessian_first; + #if CONST_HESSIAN == 0 + acc_type acc_bin = is_hessian_first ? hess_bin : grad_bin; + atomic_local_add_f(addr_bin, acc_bin); + + addr_bin = addr_bin + 1 - 2 * is_hessian_first; + acc_bin = is_hessian_first ? 
grad_bin : hess_bin; + atomic_local_add_f(addr_bin, acc_bin); + + #elif CONST_HESSIAN == 1 + atomic_local_add_f(addr_bin, grad_bin); + #endif + + bin = feature; + grad_bin = grad; + hess_bin = hess; + } else { + grad_bin += grad; + hess_bin += hess; + } + + // prefetch the next iteration variables + feature_next = feature_data[ind_next >> feature_mask]; + + // STAGE 3: accumulate counter + atomicAdd(cnt_hist + feature, 1); + + // STAGE 4: update next stat + grad = grad_next; + hess = hess_next; + // LGBM_CUDA: v4.2 + if (!feature_mask) { + feature = feature_next; + } else { + feature = (feature_next >> ((ind_next & 1) << 2)) & 0xf; + } + } + + addr_bin = gh_hist + bin * 2 + is_hessian_first; + #if CONST_HESSIAN == 0 + acc_type acc_bin = is_hessian_first ? hess_bin : grad_bin; + atomic_local_add_f(addr_bin, acc_bin); + + addr_bin = addr_bin + 1 - 2 * is_hessian_first; + acc_bin = is_hessian_first ? grad_bin : hess_bin; + + atomic_local_add_f(addr_bin, acc_bin); + + #elif CONST_HESSIAN == 1 + atomic_local_add_f(addr_bin, grad_bin); + #endif __syncthreads(); - #endif + + #if CONST_HESSIAN == 1 + // make a final reduction + gh_hist[ltid * 2] += gh_hist[ltid * 2 + 1]; + gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // LGBM_CUDA: counter move to this position + __syncthreads(); + #endif #if POWER_FEATURE_WORKGROUPS != 0 - acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 3 * NUM_BINS; - // write gradients and hessians - acc_type *__restrict__ ptr_f = output; - for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { - // even threads read gradients, odd threads read hessians - // FIXME: 2-way bank conflict - acc_type value = gh_hist[i]; - ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; - } - // write counts - acc_int_type *__restrict__ ptr_i = (acc_int_type *)(output + 2 * NUM_BINS); - for (ushort i = ltid; i < NUM_BINS; i += lsize) { - // FIXME: 2-way bank conflict - uint value = cnt_hist[i]; - ptr_i[i] = value; - } - // FIXME: is this right - __syncthreads(); - __threadfence(); - // To avoid the cost of an extra reducting kernel, we have to deal with some - // gray area in OpenCL. We want the last work group that process this feature to - // make the final reduction, and other threads will just quit. - // This requires that the results written by other workgroups available to the - // last workgroup (memory consistency) - #if NVIDIA == 1 - // this is equavalent to CUDA __threadfence(); - // ensure the writes above goes to main memory and other workgroups can see it - asm volatile("{\n\tmembar.gl;\n\t}\n\t" :::"memory"); - #else - // FIXME: how to do the above on AMD GPUs?? - // GCN ISA says that the all writes will bypass L1 cache (write through), - // however when the last thread is reading sub-histogram data we have to - // make sure that no part of data is modified in local L1 cache of other workgroups. - // Otherwise reading can be a problem (atomic operations to get consistency). - // But in our case, the sub-histogram of this workgroup cannot be in the cache - // of another workgroup, so the following trick will work just fine. - #endif - // Now, we want one workgroup to do the final reduction. - // Other workgroups processing the same feature quit. - // The is done by using an global atomic counter. - // On AMD GPUs ideally this should be done in GDS, - // but currently there is no easy way to access it via OpenCL. 
- uint * counter_val = cnt_hist; - // backup the old value - uint old_val = *counter_val; - if (ltid == 0) { - // all workgroups processing the same feature add this counter - *counter_val = atomicAdd(const_cast(sync_counters + feature_id), 1); - } - // make sure everyone in this workgroup is here - __syncthreads(); - // everyone in this wrokgroup: if we are the last workgroup, then do reduction! - if (*counter_val == (1 << power_feature_workgroups) - 1) { - if (ltid == 0) { - sync_counters[feature_id] = 0; - } - //} - #else - } - // only 1 work group, no need to increase counter - // the reduction will become a simple copy - if (1) { - uint old_val; // dummy - #endif - // locate our feature's block in output memory - uint output_offset = (feature_id << power_feature_workgroups); - acc_type const * __restrict__ feature_subhists = - (acc_type *)output_buf + output_offset * 3 * NUM_BINS; - // skip reading the data already in local memory - //uint skip_id = feature_id ^ output_offset; - uint skip_id = group_id - output_offset; - // locate output histogram location for this feature4 - acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS; - within_kernel_reduction256x4(feature_subhists, skip_id, old_val, 1 << power_feature_workgroups, hist_buf, (acc_type *)shared_array, power_feature_workgroups); - } + acc_type *__restrict__ output = reinterpret_cast(output_buf) + group_id * 3 * NUM_BINS; + // write gradients and hessians + acc_type *__restrict__ ptr_f = output; + for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { + // even threads read gradients, odd threads read hessians + // FIXME: 2-way bank conflict + acc_type value = gh_hist[i]; + ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; + } + // write counts + acc_int_type *__restrict__ ptr_i = reinterpret_cast(output + 2 * NUM_BINS); + for (ushort i = ltid; i < NUM_BINS; i += lsize) { + // FIXME: 2-way bank conflict + uint value = cnt_hist[i]; + ptr_i[i] = value; + } + // FIXME: is this right + __syncthreads(); + __threadfence(); + // To avoid the cost of an extra reducting kernel, we have to deal with some + // gray area in OpenCL. We want the last work group that process this feature to + // make the final reduction, and other threads will just quit. + // This requires that the results written by other workgroups available to the + // last workgroup (memory consistency) + #if NVIDIA == 1 + // this is equavalent to CUDA __threadfence(); + // ensure the writes above goes to main memory and other workgroups can see it + asm volatile("{\n\tmembar.gl;\n\t}\n\t" :::"memory"); + #else + // FIXME: how to do the above on AMD GPUs?? + // GCN ISA says that the all writes will bypass L1 cache (write through), + // however when the last thread is reading sub-histogram data we have to + // make sure that no part of data is modified in local L1 cache of other workgroups. + // Otherwise reading can be a problem (atomic operations to get consistency). + // But in our case, the sub-histogram of this workgroup cannot be in the cache + // of another workgroup, so the following trick will work just fine. + #endif + // Now, we want one workgroup to do the final reduction. + // Other workgroups processing the same feature quit. + // The is done by using an global atomic counter. + // On AMD GPUs ideally this should be done in GDS, + // but currently there is no easy way to access it via OpenCL. 
+ uint * counter_val = cnt_hist; + // backup the old value + uint old_val = *counter_val; + if (ltid == 0) { + // all workgroups processing the same feature add this counter + *counter_val = atomicAdd(const_cast(sync_counters + feature_id), 1); + } + // make sure everyone in this workgroup is here + __syncthreads(); + // everyone in this wrokgroup: if we are the last workgroup, then do reduction! + if (*counter_val == (1 << power_feature_workgroups) - 1) { + if (ltid == 0) { + sync_counters[feature_id] = 0; + } + // } +#else + } + // only 1 work group, no need to increase counter + // the reduction will become a simple copy + if (1) { + uint old_val; // dummy +#endif + // locate our feature's block in output memory + uint output_offset = (feature_id << power_feature_workgroups); + acc_type const * __restrict__ feature_subhists = + reinterpret_cast(output_buf) + output_offset * 3 * NUM_BINS; + // skip reading the data already in local memory + // uint skip_id = feature_id ^ output_offset; + uint skip_id = group_id - output_offset; + // locate output histogram location for this feature4 + acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS; + + within_kernel_reduction256x4(feature_subhists, skip_id, old_val, 1 << power_feature_workgroups, hist_buf, reinterpret_cast(shared_array), power_feature_workgroups); + } } // end of histogram256 stuff From 312733d4b7b65578e8c8ee57e852496cfe1b73b8 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Wed, 17 Jun 2020 19:32:23 +0000 Subject: [PATCH 075/119] Minor cleanup so less differences in code. --- include/LightGBM/dataset.h | 1 - src/boosting/gbdt.cpp | 2 -- src/treelearner/data_partition.hpp | 2 -- src/treelearner/serial_tree_learner.cpp | 4 ++-- 4 files changed, 2 insertions(+), 7 deletions(-) diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index e4c5dc56511..0fd0dfc6d15 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -441,7 +441,6 @@ class Dataset { return ret; } - /* LGBM_CUDA void ReSize(data_size_t num_data); */ // LGBM_CUDA ReSize() returns true if resized bool ReSize(data_size_t num_data); diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index b4c14a40e78..940fef25f87 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -273,8 +273,6 @@ void GBDT::Bagging(int iter) { bag_data_cnt_, false); tree_learner_->SetBaggingData(tmp_subset_.get(), bag_data_indices_.data(), bag_data_cnt_); - - tree_learner_->ResetTrainingData(tmp_subset_.get(), is_constant_hessian_); } } } diff --git a/src/treelearner/data_partition.hpp b/src/treelearner/data_partition.hpp index 01c5d2606e7..7a6ac031e62 100644 --- a/src/treelearner/data_partition.hpp +++ b/src/treelearner/data_partition.hpp @@ -164,8 +164,6 @@ class DataPartition { /*! 
\brief used data count, used for bagging */ data_size_t used_data_count_; ParallelPartitionRunner runner_; - // LGBM_CUDA - // bool is_cuda_; }; } // namespace LightGBM diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 96882732a92..b7f0d47982c 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -181,7 +181,7 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians int init_splits = 0; bool aborted_last_force_split = false; if (!forced_split_json.is_null()) { - init_splits = ForceSplits(tree.get(), forced_split_json, &left_leaf, + init_splits = ForceSplits(tree_prt, forced_split_json, &left_leaf, &right_leaf, &cur_depth, &aborted_last_force_split); } @@ -456,7 +456,7 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, const Json& forced_split_json Json right; bool left_smaller = true; std::unordered_map forceSplitMap; - q.push(std::make_pair(forced_split_json, *left_leaf)); + q.push(std::make_pair(left, *left_leaf)); while (!q.empty()) { // before processing next node from queue, store info for current left/right leaf // store "best split" for left and right, even if they might be overwritten by forced split From 943603acde14975a2b6db36dcd93d67c3a2206ed Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Tue, 23 Jun 2020 20:03:00 +0000 Subject: [PATCH 076/119] Revert is_use_subset changes --- include/LightGBM/tree_learner.h | 3 +- src/boosting/gbdt.cpp | 180 ++---------------- src/boosting/goss.hpp | 22 +-- src/treelearner/cuda_tree_learner.cpp | 16 +- src/treelearner/cuda_tree_learner.h | 17 +- .../data_parallel_tree_learner.cpp | 4 +- .../feature_parallel_tree_learner.cpp | 6 +- src/treelearner/gpu_tree_learner.cpp | 4 +- src/treelearner/gpu_tree_learner.h | 2 +- .../kernels/histogram_16_64_256.cu | 14 -- src/treelearner/parallel_tree_learner.h | 6 +- src/treelearner/serial_tree_learner.cpp | 4 +- src/treelearner/serial_tree_learner.h | 3 +- .../voting_parallel_tree_learner.cpp | 4 +- 14 files changed, 36 insertions(+), 249 deletions(-) diff --git a/include/LightGBM/tree_learner.h b/include/LightGBM/tree_learner.h index 2ea30ac63b2..2493122e3cb 100644 --- a/include/LightGBM/tree_learner.h +++ b/include/LightGBM/tree_learner.h @@ -34,8 +34,7 @@ class TreeLearner { * \param train_data The used training data * \param is_constant_hessian True if all hessians share the same value */ - // LGBM_CUDA is_use_subset_ for CUDA - virtual void Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) = 0; + virtual void Init(const Dataset* train_data, bool is_constant_hessian) = 0; virtual void ResetIsConstantHessian(bool is_constant_hessian) = 0; diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 940fef25f87..39bbccabff3 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -88,6 +88,12 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective is_constant_hessian_ = GetIsConstHessian(objective_function); + tree_learner_ = std::unique_ptr(TreeLearner::CreateTreeLearner(config_->tree_learner, config_->device_type, config_.get())); + + // init tree learner + tree_learner_->Init(train_data_, is_constant_hessian_); + tree_learner_->SetForcedSplit(&forced_splits_json_); + // push training metrics training_metrics_.clear(); for (const auto& metric : training_metrics) { @@ -113,30 +119,9 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective feature_infos_ = 
train_data_->feature_infos(); monotone_constraints_ = config->monotone_constraints; - // LGBM_CUDA - // Two key changes: position of the initializer is moved from the original code, and init() uses is_use_subset_ flag - tree_learner_ = std::unique_ptr(TreeLearner::CreateTreeLearner(config_->tree_learner, config_->device_type, config_.get())); - // if need bagging, create buffer - // LGBM_CUDA: for CUDA implementation, this function sets up the is_use_subset_ flag as TRUE and tmp_subset_ is allocated. ResetBaggingConfig(config_.get(), true); - // init tree learner - // LGBM_CUDA do not copy feature is is_use_subset for initialization - // LGBM_CUDA initialize training data info with bagging data size (tmp_subset_) - - if (config_->device_type == std::string("cuda")) { - if (is_use_subset_) { - tree_learner_->Init(tmp_subset_.get(), is_constant_hessian_, is_use_subset_); - } else { - tree_learner_->Init(train_data_, is_constant_hessian_, is_use_subset_); - } - } else { - tree_learner_->Init(train_data_, is_constant_hessian_, is_use_subset_); - } - - tree_learner_->SetForcedSplit(&forced_splits_json_); - class_need_train_ = std::vector(num_tree_per_iteration_, true); if (objective_function_ != nullptr && objective_function_->SkipEmptyClass()) { CHECK_EQ(num_tree_per_iteration_, num_class_); @@ -259,18 +244,11 @@ void GBDT::Bagging(int iter) { // set bagging data to tree learner if (!is_use_subset_) { tree_learner_->SetBaggingData(nullptr, bag_data_indices_.data(), bag_data_cnt_); - } else { // LGBM_CUDA - // NEW get subset - bool resized = tmp_subset_->ReSize(bag_data_cnt_); - - if (resized && (config_->device_type == std::string("cuda"))) { - size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; - tmp_gradients_.resize(total_size); - tmp_hessians_.resize(total_size); - } - + } else { + // get subset + tmp_subset_->ReSize(bag_data_cnt_); tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(), - bag_data_cnt_, false); + bag_data_cnt_, false); tree_learner_->SetBaggingData(tmp_subset_.get(), bag_data_indices_.data(), bag_data_cnt_); } @@ -280,18 +258,13 @@ void GBDT::Bagging(int iter) { void GBDT::Train(int snapshot_freq, const std::string& model_output_path) { Common::FunctionTimer fun_timer("GBDT::Train", global_timer); bool is_finished = false; - - // LGBM_CUDA auto start_time = std::chrono::steady_clock::now(); - for (int iter = 0; iter < config_->num_iterations && !is_finished; ++iter) { is_finished = TrainOneIter(nullptr, nullptr); if (!is_finished) { is_finished = EvalAndCheckEarlyStopping(); } - auto end_time = std::chrono::steady_clock::now(); - // output used time per iteration Log::Info("%f seconds elapsed, finished iteration %d", std::chrono::duration(end_time - start_time) * 1e-3, iter + 1); @@ -374,134 +347,7 @@ double GBDT::BoostFromAverage(int class_id, bool update_scorer) { return 0.0f; } -// LGBM_CUDA -bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { - // LGBM_CUDA invoke bagging during the first iteration - if (config_->device_type == std::string("cuda") && (iter_ == 0)) { - // auto start_time = std::chrono::steady_clock::now(); - - Bagging(iter_); - } - - std::vector init_scores(num_tree_per_iteration_, 0.0); - - // boosting first - if (gradients == nullptr || hessians == nullptr) { - for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { - init_scores[cur_tree_id] = BoostFromAverage(cur_tree_id, true); - } - - // LGBM_CUDA - // auto start_time = std::chrono::steady_clock::now(); - - 
Boosting(); - - gradients = gradients_.data(); - hessians = hessians_.data(); - } - - // LGBM_CUDA bagging logic - // Bagging(iter_); - - bool should_continue = false; - for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { - // LGBM_CUDA -// auto start_time = std::chrono::steady_clock::now(); - - const size_t offset = static_cast(cur_tree_id) * num_data_; - std::unique_ptr new_tree(new Tree(2)); - - if (class_need_train_[cur_tree_id] && train_data_->num_features() > 0) { - auto grad = gradients + offset; - auto hess = hessians + offset; - - // LGBM_CUDA - if (((tmp_gradients_.data() == 0) || (tmp_hessians_.data() == 0)) && (config_->device_type == std::string("cuda"))) { - size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; - tmp_gradients_.resize(total_size); - tmp_hessians_.resize(total_size); - } - - auto tmp_grad = tmp_gradients_.data(); - auto tmp_hess = tmp_hessians_.data(); - - // need to copy gradients for bagging subset. - if (is_use_subset_ && bag_data_cnt_ < num_data_) { - #pragma omp parallel for schedule(static) // LGBM_CUDA - for (int i = 0; i < bag_data_cnt_; ++i) { - tmp_grad[i] = grad[bag_data_indices_[i]]; // LGBM_CUDA - tmp_hess[i] = hess[bag_data_indices_[i]]; // LGBM_CUDA - } - } - - // LGBM_CUDA - new_tree.reset(tree_learner_->Train(tmp_grad, tmp_hess, is_constant_hessian_, forced_splits_json_)); - } - - if (new_tree->num_leaves() > 1) { - should_continue = true; - auto score_ptr = train_score_updater_->score() + offset; - auto residual_getter = [score_ptr](const label_t* label, int i) {return static_cast(label[i]) - score_ptr[i]; }; - tree_learner_->RenewTreeOutput(new_tree.get(), objective_function_, residual_getter, - num_data_, bag_data_indices_.data(), bag_data_cnt_); - // shrinkage by learning rate - new_tree->Shrinkage(shrinkage_rate_); - // update score - UpdateScore(new_tree.get(), cur_tree_id); - if (std::fabs(init_scores[cur_tree_id]) > kEpsilon) { - new_tree->AddBias(init_scores[cur_tree_id]); - } - } else { - // only add default score one-time - if (models_.size() < static_cast(num_tree_per_iteration_)) { - double output = 0.0; - if (!class_need_train_[cur_tree_id]) { - if (objective_function_ != nullptr) { - output = objective_function_->BoostFromScore(cur_tree_id); - } - } else { - output = init_scores[cur_tree_id]; - } - new_tree->AsConstantTree(output); - // updates scores - train_score_updater_->AddScore(output, cur_tree_id); - for (auto& score_updater : valid_score_updater_) { - score_updater->AddScore(output, cur_tree_id); - } - } - - // LGBM_CUDA: moved for overlapping data copy w/ other operations - int iter_next = iter_ + 1; - if (iter_next < config_->num_iterations) { - // auto start_time = std::chrono::steady_clock::now(); - - // bagging logic - Bagging(iter_next); - } - } - // add model - models_.push_back(std::move(new_tree)); - } - - if (!should_continue) { - Log::Warning("Stopped training because there are no more leaves that meet the split requirements"); - if (models_.size() > static_cast(num_tree_per_iteration_)) { - for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { - models_.pop_back(); - } - } - return true; - } - - ++iter_; - return false; -} - bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { - if (config_->device_type == std::string("cuda")) { // LGBM_CUDA - return TrainOneIterCUDA(gradients, hessians); - } - Common::FunctionTimer fun_timer("GBDT::TrainOneIter", global_timer); std::vector init_scores(num_tree_per_iteration_, 
0.0); // boosting first @@ -929,12 +775,10 @@ void GBDT::ResetBaggingConfig(const Config* config, bool is_change_dataset) { bagging_rands_.emplace_back(config_->bagging_seed + i); } - double average_bag_rate = - (static_cast(bag_data_cnt_) / num_data_) / config->bagging_freq; is_use_subset_ = false; const int group_threshold_usesubset = 100; - if (average_bag_rate <= 0.5 - && (train_data_->num_feature_groups() < group_threshold_usesubset)) { + double average_bag_rate = (static_cast(bag_data_cnt_) / num_data_) / config->bagging_freq; + if (average_bag_rate <= 0.5 && (train_data_->num_feature_groups() < group_threshold_usesubset)) { if (tmp_subset_ == nullptr || is_change_dataset) { tmp_subset_.reset(new Dataset(bag_data_cnt_)); tmp_subset_->CopyFeatureMapperFrom(train_data_); diff --git a/src/boosting/goss.hpp b/src/boosting/goss.hpp index 2af6dee14f6..cd512e243d8 100644 --- a/src/boosting/goss.hpp +++ b/src/boosting/goss.hpp @@ -131,27 +131,7 @@ class GOSS: public GBDT { bag_data_cnt_ = num_data_; // not subsample for first iterations if (iter < static_cast(1.0f / config_->learning_rate)) { return; } - auto left_cnt = bagging_runner_.Run( - num_data_, - [=](int, data_size_t cur_start, data_size_t cur_cnt, data_size_t* left, - data_size_t*) { - data_size_t cur_left_count = 0; - cur_left_count = BaggingHelper(cur_start, cur_cnt, left); - return cur_left_count; - }, - bag_data_indices_.data()); - bag_data_cnt_ = left_cnt; - // set bagging data to tree learner - if (!is_use_subset_) { - tree_learner_->SetBaggingData(nullptr, bag_data_indices_.data(), bag_data_cnt_); - } else { - // get subset - tmp_subset_->ReSize(bag_data_cnt_); - tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(), - bag_data_cnt_, false); - tree_learner_->SetBaggingData(tmp_subset_.get(), bag_data_indices_.data(), - bag_data_cnt_); - } + GBDT::Bagging(iter); } protected: diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 8d59e98f67f..5a9b9b73c4f 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -70,16 +70,13 @@ CUDATreeLearner::~CUDATreeLearner() { } -void CUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) { +void CUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) { // initialize SerialTreeLearner - SerialTreeLearner::Init(train_data, is_constant_hessian, is_use_subset); + SerialTreeLearner::Init(train_data, is_constant_hessian); // some additional variables needed for GPU trainer num_feature_groups_ = train_data_->num_feature_groups(); - // LGBM_CUDA: use subset of training data for bagging - is_use_subset_ = is_use_subset; - // Initialize GPU buffers and kernels & LGBM_CUDA: get device info InitGPU(config_->num_gpu); // LGBM_CUDA } @@ -580,16 +577,11 @@ void CUDATreeLearner::InitGPU(int num_gpu) { AllocateGPUMemory(); - // LGBM_CUDA: copy dense feature data from cpu to gpu only when we use entire training data for training - - if (!is_use_subset_) { - Log::Debug("copyDenseFeature at the initialization\n"); - copyDenseFeature(); // LGBM_CUDA - } + copyDenseFeature(); // LGBM_CUDA } Tree* CUDATreeLearner::Train(const score_t* gradients, const score_t *hessians, - bool is_constant_hessian, const Json& forced_split_json) { + bool is_constant_hessian, Json& forced_split_json) { // check if we need to recompile the GPU kernel (is_constant_hessian changed) // this should rarely occur diff --git a/src/treelearner/cuda_tree_learner.h 
b/src/treelearner/cuda_tree_learner.h index cd7413d3a43..6a2a0e06e52 100644 --- a/src/treelearner/cuda_tree_learner.h +++ b/src/treelearner/cuda_tree_learner.h @@ -39,19 +39,15 @@ namespace LightGBM { * \brief CUDA-based parallel learning algorithm. */ class CUDATreeLearner: public SerialTreeLearner { - public: +public: explicit CUDATreeLearner(const Config* tree_config); ~CUDATreeLearner(); - // LGBM_CUDA: is_use_subset is used by CUDA only - void Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) override; + void Init(const Dataset* train_data, bool is_constant_hessian) override; void ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) override; Tree* Train(const score_t* gradients, const score_t *hessians, - bool is_constant_hessian, const Json& forced_split_json) override; - + bool is_constant_hessian, Json& forced_split_json); void SetBaggingData(const Dataset* subset, const data_size_t* used_indices, data_size_t num_data) override { SerialTreeLearner::SetBaggingData(subset, used_indices, num_data); - // determine if we are using bagging before we construct the data partition - // thus we can start data movement to GPU earlier if (subset == nullptr && used_indices != nullptr) { if (num_data != num_data_) { use_bagging_ = true; @@ -147,10 +143,6 @@ class CUDATreeLearner: public SerialTreeLearner { td->histograms_wait_obj = &(histograms_wait_obj_[device_id]); } - // LGBM_CUDA: thread work - // typedef void * (*THREADFUNCPTR)(void *); - // void* launch_gpu_kernel(void *td); - /*! * \brief Wait for GPU kernel execution and read histogram * \param histograms Destination of histogram results from GPU. @@ -276,9 +268,6 @@ class CUDATreeLearner: public SerialTreeLearner { // cudaEvent_t features_future_; std::vector features_future_; - // LGBM_CUDA: use subset of training data for bagging - bool is_use_subset_; - // LGBM_CUDA: host-side buffer for converting feature data into featre4 data // std::vector host_vecs_; int nthreads_; // number of Feature4* vector on host4_vecs_ diff --git a/src/treelearner/data_parallel_tree_learner.cpp b/src/treelearner/data_parallel_tree_learner.cpp index 31425c77cd3..70e6d98354f 100644 --- a/src/treelearner/data_parallel_tree_learner.cpp +++ b/src/treelearner/data_parallel_tree_learner.cpp @@ -20,9 +20,9 @@ DataParallelTreeLearner::~DataParallelTreeLearner() { } template -void DataParallelTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) { // LGBM_CUDA +void DataParallelTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) { // initialize SerialTreeLearner - TREELEARNER_T::Init(train_data, is_constant_hessian, is_use_subset); // LGBM_CUDA + TREELEARNER_T::Init(train_data, is_constant_hessian); // Get local rank and global machine size rank_ = Network::rank(); num_machines_ = Network::num_machines(); diff --git a/src/treelearner/feature_parallel_tree_learner.cpp b/src/treelearner/feature_parallel_tree_learner.cpp index 3dde7f0f39b..69809e6069c 100644 --- a/src/treelearner/feature_parallel_tree_learner.cpp +++ b/src/treelearner/feature_parallel_tree_learner.cpp @@ -19,9 +19,9 @@ template FeatureParallelTreeLearner::~FeatureParallelTreeLearner() { } -template // LGBM_CUDA -void FeatureParallelTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) { - TREELEARNER_T::Init(train_data, is_constant_hessian, is_use_subset); // LGBM_CUDA +template +void FeatureParallelTreeLearner::Init(const Dataset* 
train_data, bool is_constant_hessian) { + TREELEARNER_T::Init(train_data, is_constant_hessian); rank_ = Network::rank(); num_machines_ = Network::num_machines(); diff --git a/src/treelearner/gpu_tree_learner.cpp b/src/treelearner/gpu_tree_learner.cpp index 7fb2a340a65..689314fd07a 100644 --- a/src/treelearner/gpu_tree_learner.cpp +++ b/src/treelearner/gpu_tree_learner.cpp @@ -36,9 +36,9 @@ GPUTreeLearner::~GPUTreeLearner() { } } -void GPUTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) { +void GPUTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) { // initialize SerialTreeLearner - SerialTreeLearner::Init(train_data, is_constant_hessian, is_use_subset); + SerialTreeLearner::Init(train_data, is_constant_hessian); // some additional variables needed for GPU trainer num_feature_groups_ = train_data_->num_feature_groups(); // Initialize GPU buffers and kernels diff --git a/src/treelearner/gpu_tree_learner.h b/src/treelearner/gpu_tree_learner.h index 2ed29bcd1f7..8568b7de014 100644 --- a/src/treelearner/gpu_tree_learner.h +++ b/src/treelearner/gpu_tree_learner.h @@ -45,7 +45,7 @@ class GPUTreeLearner: public SerialTreeLearner { public: explicit GPUTreeLearner(const Config* tree_config); ~GPUTreeLearner(); - void Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) override; + void Init(const Dataset* train_data, bool is_constant_hessian) override; void ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) override; void ResetIsConstantHessian(bool is_constant_hessian) override; Tree* Train(const score_t* gradients, const score_t *hessians, diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index 64b2405a592..23e9b150a1f 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -83,13 +83,8 @@ inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ } __syncthreads(); - output_buf[ltid * 2 + 0] = grad_bin; -#if CONST_HESSIAN == 0 output_buf[ltid * 2 + 1] = hess_bin; -#else - output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); -#endif } #if USE_CONSTANT_BUF == 1 @@ -434,13 +429,8 @@ inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ } __syncthreads(); - output_buf[ltid * 2 + 0] = grad_bin; -#if CONST_HESSIAN == 0 output_buf[ltid * 2 + 1] = hess_bin; -#else - output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); -#endif } #if USE_CONSTANT_BUF == 1 @@ -785,11 +775,7 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ __syncthreads(); output_buf[ltid * 2 + 0] = grad_bin; -#if CONST_HESSIAN == 0 output_buf[ltid * 2 + 1] = hess_bin; -#else - output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); -#endif } #if USE_CONSTANT_BUF == 1 diff --git a/src/treelearner/parallel_tree_learner.h b/src/treelearner/parallel_tree_learner.h index 222955a3c94..4bb62d203a9 100644 --- a/src/treelearner/parallel_tree_learner.h +++ b/src/treelearner/parallel_tree_learner.h @@ -28,7 +28,7 @@ class FeatureParallelTreeLearner: public TREELEARNER_T { public: explicit FeatureParallelTreeLearner(const Config* config); ~FeatureParallelTreeLearner(); - void Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) override; // LGBM_CUDA + void Init(const Dataset* train_data, bool is_constant_hessian) override; protected: void BeforeTrain() override; @@ 
-55,7 +55,7 @@ class DataParallelTreeLearner: public TREELEARNER_T { public: explicit DataParallelTreeLearner(const Config* config); ~DataParallelTreeLearner(); - void Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) override; // LGBM_CUDA + void Init(const Dataset* train_data, bool is_constant_hessian) override; void ResetConfig(const Config* config) override; protected: @@ -109,7 +109,7 @@ class VotingParallelTreeLearner: public TREELEARNER_T { public: explicit VotingParallelTreeLearner(const Config* config); ~VotingParallelTreeLearner() { } - void Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) override; // LGBM_CUDA + void Init(const Dataset* train_data, bool is_constant_hessian) override; void ResetConfig(const Config* config) override; protected: diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index b7f0d47982c..d7e09546d55 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -25,9 +25,7 @@ SerialTreeLearner::SerialTreeLearner(const Config* config) SerialTreeLearner::~SerialTreeLearner() { } -// LGBM_CUDA -void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) { - (void)is_use_subset; // UNUSED +void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) { train_data_ = train_data; num_data_ = train_data_->num_data(); num_features_ = train_data_->num_features(); diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 668b54592e7..20f87bbf549 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -53,8 +53,7 @@ class SerialTreeLearner: public TreeLearner { ~SerialTreeLearner(); - // LGBM_CUDA is_use_subset is used by CUDA only - void Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) override; + void Init(const Dataset* train_data, bool is_constant_hessian) override; void ResetTrainingData(const Dataset* train_data, bool is_constant_hessian) override { diff --git a/src/treelearner/voting_parallel_tree_learner.cpp b/src/treelearner/voting_parallel_tree_learner.cpp index 4b120975c26..265e94757aa 100644 --- a/src/treelearner/voting_parallel_tree_learner.cpp +++ b/src/treelearner/voting_parallel_tree_learner.cpp @@ -19,8 +19,8 @@ VotingParallelTreeLearner::VotingParallelTreeLearner(const Config } template -void VotingParallelTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) { // LGBM_CUDA - TREELEARNER_T::Init(train_data, is_constant_hessian, is_use_subset); // LGBM_CUDA +void VotingParallelTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) { + TREELEARNER_T::Init(train_data, is_constant_hessian); rank_ = Network::rank(); num_machines_ = Network::num_machines(); From 1842c826a804ca6f778deb50b1a1cef06e5ed27c Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Wed, 24 Jun 2020 20:01:27 +0000 Subject: [PATCH 077/119] Another rebase from master to fix recent conflicts. 
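This rebase resolves a conflict with an upstream interface change: FindBestSplits now receives the tree being grown, so the CUDA learner's override has to pick up the new signature and keep delegating to SerialTreeLearner, as the hunk below shows. A minimal standalone sketch of that override-and-delegate shape, using hypothetical class names rather than the repository's actual headers:

    #include <cstdio>

    // Hypothetical stand-ins for the tree / learner hierarchy.
    struct Tree {
      int num_leaves;
    };

    class BaseLearner {
     public:
      virtual ~BaseLearner() {}
      // The split search now takes the partially grown tree.
      virtual void FindBestSplits(const Tree* tree) {
        std::printf("base split search, %d leaves\n", tree->num_leaves);
      }
    };

    class DeviceLearner : public BaseLearner {
     public:
      // Keep the signature in sync with the base and delegate to it,
      // adding device-side work around the call.
      void FindBestSplits(const Tree* tree) override {
        BaseLearner::FindBestSplits(tree);
        std::printf("device-side post-processing\n");
      }
    };

    int main() {
      Tree t;
      t.num_leaves = 1;
      DeviceLearner learner;
      learner.FindBestSplits(&t);
      return 0;
    }

If the derived class kept the old parameterless signature it would no longer override anything, and calls through a base pointer would run the base implementation unchanged; the override keyword turns that silent mismatch into a compile error, which is why signature drift like this surfaces during a rebase.
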
--- src/treelearner/cuda_tree_learner.cpp | 4 ++-- src/treelearner/cuda_tree_learner.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 5a9b9b73c4f..cf55e35ca32 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -1016,8 +1016,8 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ } } -void CUDATreeLearner::FindBestSplits() { - SerialTreeLearner::FindBestSplits(); +void CUDATreeLearner::FindBestSplits(const Tree* tree) { + SerialTreeLearner::FindBestSplits(tree); #if GPU_DEBUG >= 3 for (int feature_index = 0; feature_index < num_features_; ++feature_index) { diff --git a/src/treelearner/cuda_tree_learner.h b/src/treelearner/cuda_tree_learner.h index 6a2a0e06e52..2d357228848 100644 --- a/src/treelearner/cuda_tree_learner.h +++ b/src/treelearner/cuda_tree_learner.h @@ -60,7 +60,7 @@ class CUDATreeLearner: public SerialTreeLearner { protected: void BeforeTrain() override; bool BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) override; - void FindBestSplits() override; + void FindBestSplits(const Tree* tree) override; void Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) override; void ConstructHistograms(const std::vector& is_feature_used, bool use_subtract) override; From 7a796975f5788c73405f2db29bd1869fb1a56700 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Wed, 24 Jun 2020 20:11:15 +0000 Subject: [PATCH 078/119] More lint. --- src/treelearner/cuda_tree_learner.cpp | 4 ++-- src/treelearner/cuda_tree_learner.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index cf55e35ca32..93b1e9e98ac 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -577,11 +577,11 @@ void CUDATreeLearner::InitGPU(int num_gpu) { AllocateGPUMemory(); - copyDenseFeature(); // LGBM_CUDA + copyDenseFeature(); // LGBM_CUDA } Tree* CUDATreeLearner::Train(const score_t* gradients, const score_t *hessians, - bool is_constant_hessian, Json& forced_split_json) { + bool is_constant_hessian, const Json& forced_split_json) { // check if we need to recompile the GPU kernel (is_constant_hessian changed) // this should rarely occur diff --git a/src/treelearner/cuda_tree_learner.h b/src/treelearner/cuda_tree_learner.h index 2d357228848..5e3d9e650e4 100644 --- a/src/treelearner/cuda_tree_learner.h +++ b/src/treelearner/cuda_tree_learner.h @@ -39,13 +39,13 @@ namespace LightGBM { * \brief CUDA-based parallel learning algorithm. 
*/ class CUDATreeLearner: public SerialTreeLearner { -public: + public: explicit CUDATreeLearner(const Config* tree_config); ~CUDATreeLearner(); void Init(const Dataset* train_data, bool is_constant_hessian) override; void ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) override; Tree* Train(const score_t* gradients, const score_t *hessians, - bool is_constant_hessian, Json& forced_split_json); + bool is_constant_hessian, const Json& forced_split_json); void SetBaggingData(const Dataset* subset, const data_size_t* used_indices, data_size_t num_data) override { SerialTreeLearner::SetBaggingData(subset, used_indices, num_data); if (subset == nullptr && used_indices != nullptr) { From f37ab3b174c77ac57b096bfeb203c12f049a1445 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Thu, 2 Jul 2020 14:20:51 +0000 Subject: [PATCH 079/119] Simple code cleanup - add & remove blank lines, revert unneccessary format changes, remove added dead code. --- include/LightGBM/bin.h | 1 - src/boosting/gbdt.cpp | 6 ++-- src/io/config.cpp | 1 - src/io/config_auto.cpp | 5 ---- src/io/dataset.cpp | 40 ++++++++----------------- src/treelearner/serial_tree_learner.cpp | 15 ++++------ 6 files changed, 21 insertions(+), 47 deletions(-) diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index 96ae6a8d641..e541e7039e9 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -308,7 +308,6 @@ class Bin { * \param ordered_hessians Pointer to hessians, the data_indices[i]-th data's hessian is ordered_hessians[i] * \param out Output Result */ - virtual void ConstructHistogram( const data_size_t* data_indices, data_size_t start, data_size_t end, const score_t* ordered_gradients, const score_t* ordered_hessians, diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 39bbccabff3..c351240a266 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -775,10 +775,12 @@ void GBDT::ResetBaggingConfig(const Config* config, bool is_change_dataset) { bagging_rands_.emplace_back(config_->bagging_seed + i); } + double average_bag_rate = + (static_cast(bag_data_cnt_) / num_data_) / config->bagging_freq; is_use_subset_ = false; const int group_threshold_usesubset = 100; - double average_bag_rate = (static_cast(bag_data_cnt_) / num_data_) / config->bagging_freq; - if (average_bag_rate <= 0.5 && (train_data_->num_feature_groups() < group_threshold_usesubset)) { + if (average_bag_rate <= 0.5 + && (train_data_->num_feature_groups() < group_threshold_usesubset)) { if (tmp_subset_ == nullptr || is_change_dataset) { tmp_subset_.reset(new Dataset(bag_data_cnt_)); tmp_subset_->CopyFeatureMapperFrom(train_data_); diff --git a/src/io/config.cpp b/src/io/config.cpp index ed643204c91..7ee8a74487f 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -321,7 +321,6 @@ void Config::CheckParamConflict() { num_leaves = static_cast(full_num_leaves); } } - // force col-wise for gpu if (device_type == std::string("gpu")) { force_col_wise = true; diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index ba9c07cb547..6cd26c58c16 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -485,11 +485,6 @@ void Config::GetMembersFromString(const std::unordered_map #include @@ -238,17 +237,12 @@ std::vector> FindGroups( return features_in_group; } -std::vector> FastFeatureBundling(const std::vector>& bin_mappers, - int** sample_indices, - double** sample_values, - const int* num_per_col, - int num_sample_col, - data_size_t total_sample_cnt, - const 
std::vector& used_features, - data_size_t num_data, - bool is_sparse, - std::vector* multi_val_group, - bool is_use_gpu) { +std::vector> FastFeatureBundling( + const std::vector>& bin_mappers, + int** sample_indices, double** sample_values, const int* num_per_col, + int num_sample_col, data_size_t total_sample_cnt, + const std::vector& used_features, data_size_t num_data, + bool is_use_gpu, bool is_sparse, std::vector* multi_val_group) { Common::FunctionTimer fun_timer("Dataset::FastFeatureBundling", global_timer); std::vector feature_non_zero_cnt; feature_non_zero_cnt.reserve(used_features.size()); @@ -355,17 +349,11 @@ void Dataset::Construct(std::vector>* bin_mappers, std::vector group_is_multi_val(used_features.size(), 0); if (io_config.enable_bundle && !used_features.empty()) { bool lgbm_is_gpu_used = io_config.device_type == std::string("gpu") || io_config.device_type == std::string("cuda"); // LGBM_CUDA - features_in_group = FastFeatureBundling(*bin_mappers, - sample_non_zero_indices, - sample_values, - num_per_col, - num_sample_col, - static_cast(total_sample_cnt), - used_features, - num_data_, - io_config.is_enable_sparse, - &group_is_multi_val, - lgbm_is_gpu_used); + features_in_group = FastFeatureBundling( + *bin_mappers, sample_non_zero_indices, sample_values, num_per_col, + num_sample_col, static_cast(total_sample_cnt), + used_features, num_data_, lgbm_is_gpu_used, + io_config.is_enable_sparse, &group_is_multi_val); } num_features_ = 0; @@ -804,7 +792,6 @@ void Dataset::CopySubrow(const Dataset* fullset, CHECK_EQ(num_used_indices, num_data_); OMP_INIT_EX(); #pragma omp parallel for schedule(static) - for (int group = 0; group < num_groups_; ++group) { OMP_LOOP_EX_BEGIN(); feature_groups_[group]->CopySubrow(fullset->feature_groups_[group].get(), @@ -1282,6 +1269,7 @@ void Dataset::ConstructHistogramsMultiVal(const data_size_t* data_indices, } OMP_THROW_EX(); global_timer.Stop("Dataset::sparse_bin_histogram"); + global_timer.Start("Dataset::sparse_bin_histogram_merge"); int n_bin_block = 1; int bin_block_size = num_bin; @@ -1311,12 +1299,10 @@ void Dataset::ConstructHistogramsInner( data_size_t num_data, const score_t* gradients, const score_t* hessians, score_t* ordered_gradients, score_t* ordered_hessians, TrainingShareStates* share_state, hist_t* hist_data) const { - if (!share_state->is_colwise) { return ConstructHistogramsMultiVal( data_indices, num_data, gradients, hessians, share_state, hist_data); } - std::vector used_dense_group; int multi_val_groud_id = -1; used_dense_group.reserve(num_groups_); @@ -1338,7 +1324,6 @@ void Dataset::ConstructHistogramsInner( } } } - int num_used_dense_group = static_cast(used_dense_group.size()); global_timer.Start("Dataset::dense_bin_histogram"); auto ptr_ordered_grad = gradients; @@ -1361,7 +1346,6 @@ void Dataset::ConstructHistogramsInner( ptr_ordered_grad = ordered_gradients; } } - OMP_INIT_EX(); #pragma omp parallel for schedule(static) num_threads(share_state->num_threads) for (int gi = 0; gi < num_used_dense_group; ++gi) { diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index d7e09546d55..3f4e192fb7c 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -185,7 +185,6 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians for (int split = init_splits; split < config_->num_leaves - 1; ++split) { // some initial works before finding best split - if (BeforeFindBestSplit(tree_prt, left_leaf, right_leaf)) { // find 
best threshold for every feature FindBestSplits(tree_prt); @@ -350,7 +349,8 @@ void SerialTreeLearner::ConstructHistograms( Common::FunctionTimer fun_timer("SerialTreeLearner::ConstructHistograms", global_timer); // construct smaller leaf - hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - kHistOffset; + hist_t* ptr_smaller_leaf_hist_data = + smaller_leaf_histogram_array_[0].RawData() - kHistOffset; train_data_->ConstructHistograms( is_feature_used, smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(), gradients_, hessians_, @@ -359,7 +359,8 @@ void SerialTreeLearner::ConstructHistograms( if (larger_leaf_histogram_array_ != nullptr && !use_subtract) { // construct larger leaf - hist_t* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - kHistOffset; + hist_t* ptr_larger_leaf_hist_data = + larger_leaf_histogram_array_[0].RawData() - kHistOffset; train_data_->ConstructHistograms( is_feature_used, larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(), gradients_, hessians_, @@ -388,7 +389,6 @@ void SerialTreeLearner::FindBestSplitsFromHistograms( continue; } const int tid = omp_get_thread_num(); - train_data_->FixHistogram( feature_index, smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians(), @@ -422,12 +422,12 @@ void SerialTreeLearner::FindBestSplitsFromHistograms( larger_node_used_features[feature_index], larger_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_.get(), &larger_best[tid]); + OMP_LOOP_EX_END(); } OMP_THROW_EX(); auto smaller_best_idx = ArrayArgs::ArgMax(smaller_best); int leaf = smaller_leaf_splits_->leaf_index(); - best_split_per_leaf_[leaf] = smaller_best[smaller_best_idx]; if (larger_leaf_splits_ != nullptr && @@ -557,9 +557,7 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, const Json& forced_split_json void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf, int* right_leaf, bool update_cnt) { Common::FunctionTimer fun_timer("SerialTreeLearner::SplitInner", global_timer); - SplitInfo& best_split_info = best_split_per_leaf_[best_leaf]; - const int inner_feature_index = train_data_->InnerFeatureIndex(best_split_info.feature); if (cegb_ != nullptr) { @@ -718,7 +716,6 @@ void SerialTreeLearner::ComputeBestSplitForFeature( FeatureHistogram* histogram_array_, int feature_index, int real_fidx, bool is_feature_used, int num_data, const LeafSplits* leaf_splits, SplitInfo* best_split) { - if (!is_feature_used) { return; } @@ -733,11 +730,9 @@ void SerialTreeLearner::ComputeBestSplitForFeature( } else { parent_output = leaf_splits->weight(); } - histogram_array_[feature_index].FindBestThreshold( leaf_splits->sum_gradients(), leaf_splits->sum_hessians(), num_data, constraints_->Get(leaf_splits->leaf_index()), parent_output, &new_split); - new_split.feature = real_fidx; if (cegb_ != nullptr) { new_split.gain -= From 9ff6a2b4e53b74a6c92b7abc2b4a1efe94640f0f Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 6 Jul 2020 13:36:31 +0000 Subject: [PATCH 080/119] Removed parameters added for CUDA and various bug fix. 
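This cleanup narrows the virtual Train interface back to gradients and hessians only: the constant-hessian flag now comes from share_state_ and the forced-split specification from the forced_split_json_ member, as the hunks below show. A minimal sketch of that shape, with hypothetical names (ShareState, Learner) standing in for the real LightGBM types:

    #include <cstdio>
    #include <vector>

    // Options that rarely change per call live in state owned by the
    // learner instead of widening the virtual Train() signature.
    struct ShareState {
      bool is_constant_hessian;
    };

    class Learner {
     public:
      explicit Learner(ShareState* state) : share_state_(state) {}

      // Narrow interface: per-iteration data only.
      void Train(const std::vector<float>& gradients,
                 const std::vector<float>& hessians) {
        if (share_state_->is_constant_hessian) {
          std::printf("constant hessian, using %zu gradients\n",
                      gradients.size());
        } else {
          std::printf("using %zu gradients and %zu hessians\n",
                      gradients.size(), hessians.size());
        }
      }

     private:
      ShareState* share_state_;
    };

    int main() {
      ShareState state;
      state.is_constant_hessian = true;
      Learner learner(&state);
      learner.Train({0.1f, -0.2f}, {1.0f, 1.0f});
      return 0;
    }

Keeping the signature narrow also lets every subclass (serial, GPU, CUDA, and the parallel variants) drop the extra parameters in one step instead of threading them through each override.
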
--- include/LightGBM/tree_learner.h | 3 +-- src/boosting/gbdt.cpp | 3 +-- src/boosting/gbdt.h | 10 +-------- src/boosting/goss.hpp | 22 +++++++++++++++++- src/boosting/rf.hpp | 2 +- src/treelearner/cuda_tree_learner.cpp | 30 +++++++++---------------- src/treelearner/cuda_tree_learner.h | 3 +-- src/treelearner/gpu_tree_learner.cpp | 5 ++--- src/treelearner/gpu_tree_learner.h | 3 +-- src/treelearner/serial_tree_learner.cpp | 18 +++++---------- src/treelearner/serial_tree_learner.h | 8 +++---- 11 files changed, 47 insertions(+), 60 deletions(-) diff --git a/include/LightGBM/tree_learner.h b/include/LightGBM/tree_learner.h index 2493122e3cb..2231f385c6e 100644 --- a/include/LightGBM/tree_learner.h +++ b/include/LightGBM/tree_learner.h @@ -56,7 +56,7 @@ class TreeLearner { * \param is_constant_hessian True if all hessians share the same value * \return A trained tree */ - virtual Tree* Train(const score_t* gradients, const score_t* hessians, bool is_constant_hessian, const Json& forced_split_json) = 0; + virtual Tree* Train(const score_t* gradients, const score_t* hessians) = 0; /*! * \brief use an existing tree to fit the new gradients and hessians. @@ -68,7 +68,6 @@ class TreeLearner { /*! * \brief Set bagging data - * \param subset subset of bagging * \param used_indices Used data indices * \param num_data Number of used data */ diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index c351240a266..8cec5796127 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -378,8 +378,7 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { grad = gradients_.data() + offset; hess = hessians_.data() + offset; } - // LGBM_CUDA - new_tree.reset(tree_learner_->Train(grad, hess, is_constant_hessian_, forced_splits_json_)); + new_tree.reset(tree_learner_->Train(grad, hess)); } if (new_tree->num_leaves() > 1) { diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index d22b6687766..1b82efc0cbf 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -8,7 +8,6 @@ #include #include #include -#include #include #include @@ -22,6 +21,7 @@ #include #include +#include #include "score_updater.hpp" #ifdef USE_CUDA @@ -148,14 +148,6 @@ class GBDT : public GBDTBase { */ bool TrainOneIter(const score_t* gradients, const score_t* hessians) override; - /*! - * \brief Training logic - * \param gradients nullptr for using default objective, otherwise use self-defined boosting - * \param hessians nullptr for using default objective, otherwise use self-defined boosting - * \return True if cannot train any more - */ - bool TrainOneIterCUDA(const score_t* gradients, const score_t* hessians); // LGBM_CUDA - /*! 
* \brief Rollback one iteration */ diff --git a/src/boosting/goss.hpp b/src/boosting/goss.hpp index cd512e243d8..2af6dee14f6 100644 --- a/src/boosting/goss.hpp +++ b/src/boosting/goss.hpp @@ -131,7 +131,27 @@ class GOSS: public GBDT { bag_data_cnt_ = num_data_; // not subsample for first iterations if (iter < static_cast(1.0f / config_->learning_rate)) { return; } - GBDT::Bagging(iter); + auto left_cnt = bagging_runner_.Run( + num_data_, + [=](int, data_size_t cur_start, data_size_t cur_cnt, data_size_t* left, + data_size_t*) { + data_size_t cur_left_count = 0; + cur_left_count = BaggingHelper(cur_start, cur_cnt, left); + return cur_left_count; + }, + bag_data_indices_.data()); + bag_data_cnt_ = left_cnt; + // set bagging data to tree learner + if (!is_use_subset_) { + tree_learner_->SetBaggingData(nullptr, bag_data_indices_.data(), bag_data_cnt_); + } else { + // get subset + tmp_subset_->ReSize(bag_data_cnt_); + tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(), + bag_data_cnt_, false); + tree_learner_->SetBaggingData(tmp_subset_.get(), bag_data_indices_.data(), + bag_data_cnt_); + } } protected: diff --git a/src/boosting/rf.hpp b/src/boosting/rf.hpp index e64bf6cb4d8..5c90202a515 100644 --- a/src/boosting/rf.hpp +++ b/src/boosting/rf.hpp @@ -125,7 +125,7 @@ class RF : public GBDT { hess = tmp_hess_.data(); } - new_tree.reset(tree_learner_->Train(grad, hess, is_constant_hessian_, forced_splits_json_)); + new_tree.reset(tree_learner_->Train(grad, hess)); } if (new_tree->num_leaves() > 1) { diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 93b1e9e98ac..97010367f81 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -252,14 +252,14 @@ void CUDATreeLearner::WaitAndGetHistograms(FeatureHistogram* leaf_histogram_arra // LGBM_CUDA } + HistType* histograms = reinterpret_cast(leaf_histogram_array[0].RawData() - kHistOffset); #pragma omp parallel for schedule(static) for (int i = 0; i < num_dense_feature_groups_; ++i) { if (!feature_masks_[i]) { continue; } int dense_group_index = dense_feature_group_map_[i]; - // auto old_histogram_array = histograms + train_data_->GroupBinBoundary(dense_group_index) * 2; - auto old_histogram_array = leaf_histogram_array[dense_group_index].RawData() - kHistOffset; + auto old_histogram_array = histograms + train_data_->GroupBinBoundary(dense_group_index) * 2; int bin_size = train_data_->FeatureGroupNumBin(dense_group_index); for (int j = 0; j < bin_size; ++j) { @@ -580,18 +580,8 @@ void CUDATreeLearner::InitGPU(int num_gpu) { copyDenseFeature(); // LGBM_CUDA } -Tree* CUDATreeLearner::Train(const score_t* gradients, const score_t *hessians, - bool is_constant_hessian, const Json& forced_split_json) { - // check if we need to recompile the GPU kernel (is_constant_hessian changed) - // this should rarely occur - - if (is_constant_hessian != is_constant_hessian_) { - Log::Debug("Recompiling GPU kernel because hessian is %sa constant now", is_constant_hessian ? 
"" : "not "); - is_constant_hessian_ = is_constant_hessian; - } - - Tree *ret = SerialTreeLearner::Train(gradients, hessians, is_constant_hessian, forced_split_json); - +Tree* CUDATreeLearner::Train(const score_t* gradients, const score_t *hessians) { + Tree *ret = SerialTreeLearner::Train(gradients, hessians); return ret; } @@ -666,7 +656,7 @@ void CUDATreeLearner::BeforeTrain() { Log::Debug("CudaTreeLearner::BeforeTrain() No baggings, dense_feature_groups_=%d", num_dense_feature_groups_); for (int device_id = 0; device_id < num_gpu_; ++device_id) { - if (!is_constant_hessian_) { + if (!(share_state_->is_constant_hessian)) { Log::Debug("CUDATreeLearner::BeforeTrain(): Starting hessians_ -> device_hessians_"); #if cudaMemcpy_DEBUG == 1 // LGBM_CUDA @@ -721,7 +711,7 @@ void CUDATreeLearner::BeforeTrain() { CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_data_indices_[device_id], indices, cnt * sizeof(*indices), cudaMemcpyHostToDevice, stream_[device_id])); CUDASUCCESS_OR_FATAL(cudaEventRecord(indices_future_[device_id], stream_[device_id])); - if (!is_constant_hessian_) { + if (!(share_state_->is_constant_hessian)) { CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_hessians_[device_id], const_cast(reinterpret_cast(&(hessians_[0]))), num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); CUDASUCCESS_OR_FATAL(cudaEventRecord(hessians_future_[device_id], stream_[device_id])); } @@ -920,7 +910,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ continue; } if (num_data == num_data_) { - if (is_constant_hessian_) { + if (share_state_->is_constant_hessian) { printf("ConstructHistogram(): num_data == num_data_ is_constant_hessian_\n"); train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( 0, @@ -936,7 +926,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ current_histogram); } } else { - if (is_constant_hessian_) { + if (share_state_->is_constant_hessian) { printf("ConstructHistogram(): is_constant_hessian_\n"); train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( smaller_leaf_splits_->data_indices(), @@ -956,11 +946,11 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ } int retval; if ((num_data != num_data_) && compare) { - retval = CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index, config_->gpu_use_dp, is_constant_hessian_); + retval = CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index, config_->gpu_use_dp, share_state_->is_constant_hessian); printf("CompareHistograms reports %d errors\n", retval); compare = false; } - retval = CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index, config_->gpu_use_dp, is_constant_hessian_); + retval = CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index, config_->gpu_use_dp, share_state_->is_constant_hessian); if (num_data == num_data_) { printf("CompareHistograms reports %d errors\n", retval); } else { diff --git a/src/treelearner/cuda_tree_learner.h b/src/treelearner/cuda_tree_learner.h index 5e3d9e650e4..abe1fd303c2 100644 --- a/src/treelearner/cuda_tree_learner.h +++ b/src/treelearner/cuda_tree_learner.h @@ -44,8 +44,7 @@ class CUDATreeLearner: public SerialTreeLearner { ~CUDATreeLearner(); void Init(const Dataset* train_data, bool is_constant_hessian) override; void ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) override; - Tree* 
Train(const score_t* gradients, const score_t *hessians, - bool is_constant_hessian, const Json& forced_split_json); + Tree* Train(const score_t* gradients, const score_t *hessians); void SetBaggingData(const Dataset* subset, const data_size_t* used_indices, data_size_t num_data) override { SerialTreeLearner::SetBaggingData(subset, used_indices, num_data); if (subset == nullptr && used_indices != nullptr) { diff --git a/src/treelearner/gpu_tree_learner.cpp b/src/treelearner/gpu_tree_learner.cpp index 689314fd07a..df90aafb945 100644 --- a/src/treelearner/gpu_tree_learner.cpp +++ b/src/treelearner/gpu_tree_learner.cpp @@ -734,9 +734,8 @@ void GPUTreeLearner::InitGPU(int platform_id, int device_id) { SetupKernelArguments(); } -Tree* GPUTreeLearner::Train(const score_t* gradients, const score_t *hessians, - bool is_constant_hessian, const Json& forced_split_json) { - return SerialTreeLearner::Train(gradients, hessians, is_constant_hessian, forced_split_json); +Tree* GPUTreeLearner::Train(const score_t* gradients, const score_t *hessians) { + return SerialTreeLearner::Train(gradients, hessians); } void GPUTreeLearner::ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) { diff --git a/src/treelearner/gpu_tree_learner.h b/src/treelearner/gpu_tree_learner.h index 8568b7de014..a909c57cbad 100644 --- a/src/treelearner/gpu_tree_learner.h +++ b/src/treelearner/gpu_tree_learner.h @@ -48,8 +48,7 @@ class GPUTreeLearner: public SerialTreeLearner { void Init(const Dataset* train_data, bool is_constant_hessian) override; void ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) override; void ResetIsConstantHessian(bool is_constant_hessian) override; - Tree* Train(const score_t* gradients, const score_t *hessians, - bool is_constant_hessian, const Json& forced_split_json) override; + Tree* Train(const score_t* gradients, const score_t *hessians) override; void SetBaggingData(const Dataset* subset, const data_size_t* used_indices, data_size_t num_data) override { SerialTreeLearner::SetBaggingData(subset, used_indices, num_data); diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 3f4e192fb7c..65f7fa3dd07 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -148,11 +148,10 @@ void SerialTreeLearner::ResetConfig(const Config* config) { constraints_.reset(LeafConstraintsBase::Create(config_, config_->num_leaves)); } -Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians, bool is_constant_hessian, const Json& forced_split_json) { +Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians) { Common::FunctionTimer fun_timer("SerialTreeLearner::Train", global_timer); gradients_ = gradients; hessians_ = hessians; - is_constant_hessian_ = is_constant_hessian; int num_threads = OMP_NUM_THREADS(); if (share_state_->num_threads != num_threads && share_state_->num_threads > 0) { Log::Warning( @@ -176,12 +175,7 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians // only root leaf can be splitted on first time int right_leaf = -1; - int init_splits = 0; - bool aborted_last_force_split = false; - if (!forced_split_json.is_null()) { - init_splits = ForceSplits(tree_prt, forced_split_json, &left_leaf, - &right_leaf, &cur_depth, &aborted_last_force_split); - } + int init_splits = ForceSplits(tree_prt, &left_leaf, &right_leaf, &cur_depth); for (int 
split = init_splits; split < config_->num_leaves - 1; ++split) { // some initial works before finding best split @@ -438,10 +432,8 @@ void SerialTreeLearner::FindBestSplitsFromHistograms( } } -int32_t SerialTreeLearner::ForceSplits(Tree* tree, const Json& forced_split_json, int* left_leaf, - int* right_leaf, int *cur_depth, - bool *aborted_last_force_split) { - (void)aborted_last_force_split; +int32_t SerialTreeLearner::ForceSplits(Tree* tree, int* left_leaf, + int* right_leaf, int *cur_depth) { bool abort_last_forced_split = false; if (forced_split_json_ == nullptr) { return 0; @@ -450,7 +442,7 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, const Json& forced_split_json // start at root leaf *left_leaf = 0; std::queue> q; - Json left = forced_split_json; + Json left = *forced_split_json_; Json right; bool left_smaller = true; std::unordered_map forceSplitMap; diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 20f87bbf549..946d052d86d 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -78,8 +78,7 @@ class SerialTreeLearner: public TreeLearner { } } - Tree* Train(const score_t* gradients, const score_t *hessians, bool is_constant_hessian, - const Json& forced_split_json) override; + Tree* Train(const score_t* gradients, const score_t *hessians) override; Tree* FitByExistingTree(const Tree* old_tree, const score_t* gradients, const score_t* hessians) const override; @@ -162,9 +161,8 @@ class SerialTreeLearner: public TreeLearner { bool update_cnt); /* Force splits with forced_split_json dict and then return num splits forced.*/ - virtual int32_t ForceSplits(Tree* tree, const Json& forced_split_json, int* left_leaf, - int* right_leaf, int* cur_depth, - bool *aborted_last_force_split); + virtual int32_t ForceSplits(Tree* tree, int* left_leaf, int* right_leaf, + int* cur_depth); /*! * \brief Get the number of data in a leaf From e0ad9d55139496de8ba880cf3642a4595d0fcf29 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 6 Jul 2020 14:10:54 +0000 Subject: [PATCH 081/119] Yet more lint and unneccessary changes. --- include/LightGBM/tree_learner.h | 2 +- src/boosting/gbdt.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/LightGBM/tree_learner.h b/include/LightGBM/tree_learner.h index 2231f385c6e..e0fb3489057 100644 --- a/include/LightGBM/tree_learner.h +++ b/include/LightGBM/tree_learner.h @@ -53,7 +53,6 @@ class TreeLearner { * \brief training tree model on dataset * \param gradients The first order gradients * \param hessians The second order gradients - * \param is_constant_hessian True if all hessians share the same value * \return A trained tree */ virtual Tree* Train(const score_t* gradients, const score_t* hessians) = 0; @@ -68,6 +67,7 @@ class TreeLearner { /*! * \brief Set bagging data + * \param subset subset of bagging * \param used_indices Used data indices * \param num_data Number of used data */ diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index 1b82efc0cbf..be0e677310b 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -21,7 +22,6 @@ #include #include -#include #include "score_updater.hpp" #ifdef USE_CUDA From 90709e6dacaa910063206190c95323e620740470 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 6 Jul 2020 14:26:28 +0000 Subject: [PATCH 082/119] Revert another change. 
--- src/treelearner/serial_tree_learner.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 946d052d86d..14a7b807d0c 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -161,8 +161,8 @@ class SerialTreeLearner: public TreeLearner { bool update_cnt); /* Force splits with forced_split_json dict and then return num splits forced.*/ - virtual int32_t ForceSplits(Tree* tree, int* left_leaf, int* right_leaf, - int* cur_depth); + int32_t ForceSplits(Tree* tree, int* left_leaf, int* right_leaf, + int* cur_depth); /*! * \brief Get the number of data in a leaf From 99e459b2d31bb46d7c5b81898b7266d0cedd9bcf Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 6 Jul 2020 17:29:55 +0000 Subject: [PATCH 083/119] Removal of unneccessary code. --- src/treelearner/cuda_tree_learner.cpp | 10 ++-------- src/treelearner/cuda_tree_learner.h | 2 +- src/treelearner/serial_tree_learner.cpp | 1 - src/treelearner/serial_tree_learner.h | 1 - 4 files changed, 3 insertions(+), 11 deletions(-) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 97010367f81..7f32c23c398 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -911,7 +911,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ } if (num_data == num_data_) { if (share_state_->is_constant_hessian) { - printf("ConstructHistogram(): num_data == num_data_ is_constant_hessian_\n"); + printf("ConstructHistogram(): num_data == num_data_ is_constant_hessian\n"); train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( 0, num_data, @@ -927,7 +927,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ } } else { if (share_state_->is_constant_hessian) { - printf("ConstructHistogram(): is_constant_hessian_\n"); + printf("ConstructHistogram(): is_constant_hessian\n"); train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( smaller_leaf_splits_->data_indices(), 0, @@ -978,12 +978,6 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ // We set data_indices to null to avoid rebuilding ordered gradients/hessians if (num_sparse_features > 0) { - // train_data_->ConstructHistograms(is_sparse_feature_used, - // nullptr, larger_leaf_splits_->num_data_in_leaf(), - // larger_leaf_splits_->leaf_index(), - // ordered_bins_, gradients_, hessians_, - // ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_, - // ptr_larger_leaf_hist_data); train_data_->ConstructHistograms(is_sparse_feature_used, larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(), gradients_, hessians_, diff --git a/src/treelearner/cuda_tree_learner.h b/src/treelearner/cuda_tree_learner.h index abe1fd303c2..46e31985a5d 100644 --- a/src/treelearner/cuda_tree_learner.h +++ b/src/treelearner/cuda_tree_learner.h @@ -117,7 +117,7 @@ class CUDATreeLearner: public SerialTreeLearner { td->leaf_num_data = leaf_num_data; td->num_data = num_data_; td->use_all_features = use_all_features; - td->is_constant_hessian = is_constant_hessian_; + td->is_constant_hessian = share_state_->is_constant_hessian; td->num_workgroups = num_workgroups; td->stream = stream_[device_id]; td->device_features = device_features_[device_id]; diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 65f7fa3dd07..d076e4afd2f 
100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -29,7 +29,6 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian train_data_ = train_data; num_data_ = train_data_->num_data(); num_features_ = train_data_->num_features(); - is_constant_hessian_ = is_constant_hessian; int max_cache_size = 0; // Get the max size of pool if (config_->histogram_pool_size <= 0) { diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 14a7b807d0c..79882ded79e 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -225,7 +225,6 @@ class SerialTreeLearner: public TreeLearner { const Json* forced_split_json_; std::unique_ptr share_state_; std::unique_ptr cegb_; - bool is_constant_hessian_; }; inline data_size_t SerialTreeLearner::GetGlobalDataCountInLeaf(int leaf_idx) const { From f40d77b7fea05d428d5f9aafe269d53ebf9548ca Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Tue, 7 Jul 2020 13:29:57 +0000 Subject: [PATCH 084/119] temporary appveyor.yml for building and testing --- appveyor.yml | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 appveyor.yml diff --git a/appveyor.yml b/appveyor.yml new file mode 100644 index 00000000000..b4c0131a9af --- /dev/null +++ b/appveyor.yml @@ -0,0 +1,50 @@ +version: 2.3.2.{build} + +image: Visual Studio 2015 +platform: x64 +configuration: # a trick to construct a build matrix with multiple Python versions + - 3.7 + +# only build pull requests and +# commits to 'cuda' +branches: + only: + - cuda + +environment: + matrix: + - COMPILER: MSVC + TASK: python + - COMPILER: MINGW + TASK: python + +clone_depth: 5 + +init: + - ps: iex ((new-object net.webclient).DownloadString('https://raw.githubusercontent.com/appveyor/ci/master/scripts/enable-rdp.ps1')) + +install: + - git submodule update --init --recursive # get `compute` folder + - set PATH=%PATH:C:\Program Files\Git\usr\bin;=% # delete sh.exe from PATH (mingw32-make fix) + - set PATH=C:\mingw-w64\x86_64-8.1.0-posix-seh-rt_v6-rev0\mingw64\bin;%PATH% + - set PYTHON_VERSION=%CONFIGURATION% + - set CONDA_ENV="test-env" + - ps: >- + switch ($env:PYTHON_VERSION) { + "2.7" {$env:MINICONDA = "C:\Miniconda-x64"} + "3.5" {$env:MINICONDA = "C:\Miniconda35-x64"} + "3.6" {$env:MINICONDA = "C:\Miniconda36-x64"} + "3.7" {$env:MINICONDA = "C:\Miniconda37-x64"} + default {$env:MINICONDA = "C:\Miniconda37-x64"} + } + $env:PATH="$env:MINICONDA;$env:MINICONDA\Scripts;$env:PATH" + - ps: $env:LGB_VER = (Get-Content $env:APPVEYOR_BUILD_FOLDER\VERSION.txt).trim() + +build: false + +test_script: + - conda init powershell + - powershell.exe -ExecutionPolicy Bypass -File %APPVEYOR_BUILD_FOLDER%\.ci\test_windows.ps1 + +on_finish: + - ps: $blockRdp = $true; iex ((new-object net.webclient).DownloadString('https://raw.githubusercontent.com/appveyor/ci/master/scripts/enable-rdp.ps1')) From d900b6484159f315b2fb86feb2f71e9280fe95a6 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Tue, 7 Jul 2020 16:54:03 +0000 Subject: [PATCH 085/119] Remove return value in ReSize --- include/LightGBM/dataset.h | 4 +--- src/io/dataset.cpp | 3 +-- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index 0fd0dfc6d15..d86c0e3d7cb 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -440,9 +440,7 @@ class Dataset { } return ret; } - - // LGBM_CUDA ReSize() returns true if 
resized - bool ReSize(data_size_t num_data); + void ReSize(data_size_t num_data); void CopySubrow(const Dataset* fullset, const data_size_t* used_indices, data_size_t num_used_indices, bool need_meta_data); diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 054c17e0a01..1b94e232802 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -771,8 +771,7 @@ void Dataset::CreateValid(const Dataset* dataset) { forced_bin_bounds_ = dataset->forced_bin_bounds_; } -// LGBM_CUDA Resize() returns boolean -bool Dataset::ReSize(data_size_t num_data) { +void Dataset::ReSize(data_size_t num_data) { if (num_data_ != num_data) { num_data_ = num_data; OMP_INIT_EX(); From 361720b30cf6a31466632a73e43f69be5329a915 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Tue, 7 Jul 2020 17:58:47 +0000 Subject: [PATCH 086/119] Removal of unused variables. --- src/boosting/gbdt.cpp | 3 --- src/boosting/gbdt.h | 4 ---- 2 files changed, 7 deletions(-) diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 8cec5796127..9473d2c0ebd 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -803,9 +803,6 @@ void GBDT::ResetBaggingConfig(const Config* config, bool is_change_dataset) { if (tmp_subset_ == nullptr) { tmp_subset_.reset(new Dataset(bag_data_cnt_)); tmp_subset_->CopyFeatureMapperFrom(train_data_); - size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; - tmp_gradients_.resize(total_size); - tmp_hessians_.resize(total_size); is_use_subset_ = false; bag_data_indices_.clear(); } diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index be0e677310b..583a680fe51 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -471,17 +471,13 @@ class GBDT : public GBDTBase { #ifdef USE_CUDA /*! \brief First order derivative of training data */ std::vector> gradients_; // LGBM_CUDA - std::vector> tmp_gradients_; // LGBM_CUDA /*! \brief Second order derivative of training data */ std::vector> hessians_; // LGBM_CUDA - std::vector> tmp_hessians_; // LGBM_CUDA #else /*! \brief First order derivative of training data */ std::vector> gradients_; - std::vector> tmp_gradients_; /*! \brief Second order derivative of training data */ std::vector> hessians_; - std::vector> tmp_hessians_; #endif /*! \brief Store the indices of in-bag data */ From a8b42459978d29d6edb639a69eb0865dae428c43 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 13 Jul 2020 17:36:33 +0000 Subject: [PATCH 087/119] Code cleanup from reviewers suggestions. 
--- CMakeLists.txt | 2 +- include/LightGBM/application.h | 1 - include/LightGBM/bin.h | 1 - include/LightGBM/config.h | 4 +--- include/LightGBM/cuda/cuda_utils.h | 2 -- include/LightGBM/cuda/vector_cudahost.h | 4 ++-- include/LightGBM/dataset.h | 2 -- include/LightGBM/feature_group.h | 2 -- python-package/setup.py | 15 ++------------- src/application/application.cpp | 6 +----- src/boosting/gbdt.cpp | 4 +--- src/boosting/gbdt.h | 9 +++------ src/c_api.cpp | 6 +----- src/io/config.cpp | 4 +--- src/io/config_auto.cpp | 7 ++----- src/io/dataset.cpp | 8 ++------ src/io/dense_bin.hpp | 11 +++-------- src/io/sparse_bin.hpp | 1 - src/main.cpp | 7 ------- src/treelearner/data_parallel_tree_learner.cpp | 2 +- src/treelearner/feature_parallel_tree_learner.cpp | 2 +- src/treelearner/tree_learner.cpp | 4 ++-- src/treelearner/voting_parallel_tree_learner.cpp | 2 +- 23 files changed, 25 insertions(+), 81 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3e237da69c9..aae29463f94 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,7 +22,7 @@ OPTION(USE_SWIG "Enable SWIG to generate Java API" OFF) OPTION(USE_HDFS "Enable HDFS support (EXPERIMENTAL)" OFF) OPTION(USE_R35 "Set to ON if your R version is not earlier than 3.5" OFF) OPTION(USE_TIMETAG "Set to ON to output time costs" OFF) -OPTION(USE_CUDA "Enable CUDA-accelerated training" OFF) +OPTION(USE_CUDA "Enable CUDA-accelerated training (EXPERIMENTAL)" OFF) OPTION(USE_DEBUG "Set to ON for Debug mode" OFF) OPTION(BUILD_FOR_R "Set to ON if building lib_lightgbm for use with the R package" OFF) diff --git a/include/LightGBM/application.h b/include/LightGBM/application.h index 7ce8956a555..3fda4a1c32e 100644 --- a/include/LightGBM/application.h +++ b/include/LightGBM/application.h @@ -36,7 +36,6 @@ class Application { /*! \brief To call this function to run application*/ inline void Run(); - // LGBM_CUDA /*! \brief call to get configuration */ Config GetConfig() {return config_ ;} diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index e541e7039e9..fcf8f1add47 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -288,7 +288,6 @@ class Bin { /*! \brief Number of all data */ virtual data_size_t num_data() const = 0; - // LGBM_CUDA /*! 
\brief Get data pointer */ virtual void* get_data() = 0; diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 162c7583dc7..36219675da5 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -954,11 +954,9 @@ struct Config { // desc = set this to ``true`` to use double precision math on GPU (by default single precision is used) bool gpu_use_dp = false; -#ifdef USE_CUDA - // desc = number of gpus (CUDA implementation only) LGBM_CUDA + // desc = number of gpus (CUDA implementation only) // desc = default value is 1 int num_gpu = 1; -#endif #pragma endregion diff --git a/include/LightGBM/cuda/cuda_utils.h b/include/LightGBM/cuda/cuda_utils.h index 2fb45384f0c..af787315559 100644 --- a/include/LightGBM/cuda/cuda_utils.h +++ b/include/LightGBM/cuda/cuda_utils.h @@ -5,8 +5,6 @@ #ifndef LGBM_CUDA_UTILS_H #define LGBM_CUDA_UTILS_H -// LGBM_CUDA - #ifdef USE_CUDA #include diff --git a/include/LightGBM/cuda/vector_cudahost.h b/include/LightGBM/cuda/vector_cudahost.h index 41a27c349bd..d73fabe25fa 100644 --- a/include/LightGBM/cuda/vector_cudahost.h +++ b/include/LightGBM/cuda/vector_cudahost.h @@ -5,12 +5,12 @@ #ifndef LGBM_CUDA_VECTOR_CH_H #define LGBM_CUDA_VECTOR_CH_H +#ifdef USE_CUDA #include #include +#endif #include -// LGBM_CUDA - namespace LightGBM { #define lgbm_device_cpu 0 diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index d86c0e3d7cb..0f683b9817c 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -589,12 +589,10 @@ class Dataset { return feature_groups_[i]->is_multi_val_; } - // LGBM_CUDA inline size_t FeatureGroupSizesInByte(int group) const { return feature_groups_[group]->FeatureGroupSizesInByte(); } - // LGBM_CUDA inline void* FeatureGroupData(int group) const { return feature_groups_[group]->FeatureGroupData(); } diff --git a/include/LightGBM/feature_group.h b/include/LightGBM/feature_group.h index d949beec20e..2e0db94f19c 100644 --- a/include/LightGBM/feature_group.h +++ b/include/LightGBM/feature_group.h @@ -228,12 +228,10 @@ class FeatureGroup { return bin_data_->GetIterator(min_bin, max_bin, most_freq_bin); } - // LGBM_CUDA inline size_t FeatureGroupSizesInByte() { return bin_data_->SizesInByte(); } - // LGBM_CUDA inline void* FeatureGroupData() { return bin_data_->get_data(); } diff --git a/python-package/setup.py b/python-package/setup.py index 1e0500f180c..36abaec4a96 100644 --- a/python-package/setup.py +++ b/python-package/setup.py @@ -90,7 +90,6 @@ def compile_cpp(use_mingw=False, use_gpu=False, use_cuda=False, use_mpi=False, use_hdfs=False, boost_root=None, boost_dir=None, boost_include_dir=None, boost_librarydir=None, opencl_include_dir=None, opencl_library=None, - openmp_include_dir=None, openmp_library=None, nomp=False, bit32=False): if os.path.exists(os.path.join(CURRENT_DIR, "build_cpp")): @@ -117,10 +116,6 @@ def compile_cpp(use_mingw=False, use_gpu=False, use_cuda=False, use_mpi=False, cmake_cmd.append("-DOpenCL_LIBRARY={0}".format(opencl_library)) elif use_cuda: cmake_cmd.append("-DUSE_CUDA=ON") - if openmp_include_dir: - cmake_cmd.append("-DOpenMP_INCLUDE_DIR={0}".format(openmp_include_dir)) - if openmp_library: - cmake_cmd.append("-DOpenMP_LIBRARY={0}".format(openmp_library)) if use_mpi: cmake_cmd.append("-DUSE_MPI=ON") if nomp: @@ -205,9 +200,7 @@ class CustomInstall(install): ('boost-include-dir=', None, 'Directory containing Boost headers'), ('boost-librarydir=', None, 'Preferred Boost library directory'), ('opencl-include-dir=', None, 'OpenCL include directory'), - 
('opencl-library=', None, 'Path to OpenCL library'), - ('openmp-include-dir=', None, 'OpenMP include directory'), - ('openmp-library=', None, 'Path to OpenMP library') + ('opencl-library=', None, 'Path to OpenCL library') ] def initialize_options(self): @@ -221,12 +214,9 @@ def initialize_options(self): self.boost_librarydir = None self.opencl_include_dir = None self.opencl_library = None - self.openmp_include_dir = None - self.openmp_library = None self.mpi = 0 self.hdfs = 0 - # self.precompile = 0 # TODO: revert this - self.precompile = 1 + self.precompile = 0 self.nomp = 0 self.bit32 = 0 @@ -245,7 +235,6 @@ def run(self): use_hdfs=self.hdfs, boost_root=self.boost_root, boost_dir=self.boost_dir, boost_include_dir=self.boost_include_dir, boost_librarydir=self.boost_librarydir, opencl_include_dir=self.opencl_include_dir, opencl_library=self.opencl_library, - openmp_include_dir=self.openmp_include_dir, openmp_library=self.openmp_library, nomp=self.nomp, bit32=self.bit32) install.run(self) if os.path.isfile(LOG_PATH): diff --git a/src/application/application.cpp b/src/application/application.cpp index 5c61b323654..e88f5c86188 100644 --- a/src/application/application.cpp +++ b/src/application/application.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -25,10 +26,6 @@ #include "predictor.hpp" -#ifdef USE_CUDA -#include -#endif - namespace LightGBM { Common::Timer global_timer; @@ -43,7 +40,6 @@ Application::Application(int argc, char** argv) { Log::Fatal("No training/prediction data, application quit"); } -// LGBM_CUDA #ifdef USE_CUDA if (config_.device_type == std::string("cuda")) { LightGBM::LGBM_config_::current_device = lgbm_device_cuda; diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 9473d2c0ebd..467ab922e35 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -63,10 +63,8 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective es_first_metric_only_ = config_->first_metric_only; shrinkage_rate_ = config_->learning_rate; -// LGBM_CUDA #ifdef USE_CUDA if (config_->device_type == std::string("cuda")) { - // LGBM_config_::current_device = lgbm_device_cuda; moved to application.cpp LGBM_config_::current_learner = use_cuda_learner; } #endif @@ -799,7 +797,7 @@ void GBDT::ResetBaggingConfig(const Config* config, bool is_change_dataset) { } } else { bag_data_cnt_ = num_data_; - if (config_->device_type == std::string("cuda")) { // LGBM_CUDA + if (config_->device_type == std::string("cuda")) { if (tmp_subset_ == nullptr) { tmp_subset_.reset(new Dataset(bag_data_cnt_)); tmp_subset_->CopyFeatureMapperFrom(train_data_); diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index 583a680fe51..9567786769a 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -24,10 +25,6 @@ #include "score_updater.hpp" -#ifdef USE_CUDA -#include // LGBM_CUDA -#endif - namespace LightGBM { using json11::Json; @@ -470,9 +467,9 @@ class GBDT : public GBDTBase { #ifdef USE_CUDA /*! \brief First order derivative of training data */ - std::vector> gradients_; // LGBM_CUDA + std::vector> gradients_; /*! \brief Second order derivative of training data */ - std::vector> hessians_; // LGBM_CUDA + std::vector> hessians_; #else /*! 
\brief First order derivative of training data */ std::vector> gradients_; diff --git a/src/c_api.cpp b/src/c_api.cpp index 6cdebc34aed..050e605b268 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -28,10 +29,6 @@ #include "application/predictor.hpp" -#ifdef USE_CUDA -#include -#endif - namespace LightGBM { inline int LGBM_APIHandleException(const std::exception& ex) { @@ -43,7 +40,6 @@ inline int LGBM_APIHandleException(const std::string& ex) { return -1; } -// LGBM_CUDA inline void AdditionalConfig(Config *config) { #ifdef USE_CUDA if (config->device_type == std::string("cuda")) { diff --git a/src/io/config.cpp b/src/io/config.cpp index 7ee8a74487f..b354bf10f03 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -126,7 +126,7 @@ void GetDeviceType(const std::unordered_map& params, s *device_type = "cpu"; } else if (value == std::string("gpu")) { *device_type = "gpu"; - } else if (value == std::string("cuda")) { // LGBM_CUDA + } else if (value == std::string("cuda")) { *device_type = "cuda"; } else { Log::Fatal("Unknown device type %s", value.c_str()); @@ -327,13 +327,11 @@ void Config::CheckParamConflict() { force_row_wise = false; } -#ifdef USE_CUDA // force col-wise for CUDA if (device_type == std::string("cuda")) { force_col_wise = true; force_row_wise = false; } -#endif // min_data_in_leaf must be at least 2 if path smoothing is active. This is because when the split is calculated // the count is calculated using the proportion of hessian in the leaf which is rounded up to nearest int, so it can diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index 6cd26c58c16..d0bb97e1942 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -294,9 +294,7 @@ const std::unordered_set& Config::parameter_set() { "gpu_platform_id", "gpu_device_id", "gpu_use_dp", -#ifdef USE_CUDA - "num_gpu", /* LGBM_CUDA */ -#endif + "num_gpu", }); return params; } @@ -610,11 +608,10 @@ void Config::GetMembersFromString(const std::unordered_map #include #include +#include #include #include @@ -16,10 +17,6 @@ #include #include -#ifdef USE_CUDA -#include -#endif - namespace LightGBM { const char* Dataset::binary_file_token = @@ -339,7 +336,6 @@ void Dataset::Construct(std::vector>* bin_mappers, } auto features_in_group = NoGroup(used_features); -// LGBM_CUDA #ifdef USE_CUDA if (io_config.device_type == std::string("cuda")) { LightGBM::LGBM_config_::current_device = lgbm_device_cuda; @@ -348,7 +344,7 @@ void Dataset::Construct(std::vector>* bin_mappers, std::vector group_is_multi_val(used_features.size(), 0); if (io_config.enable_bundle && !used_features.empty()) { - bool lgbm_is_gpu_used = io_config.device_type == std::string("gpu") || io_config.device_type == std::string("cuda"); // LGBM_CUDA + bool lgbm_is_gpu_used = io_config.device_type == std::string("gpu") || io_config.device_type == std::string("cuda"); features_in_group = FastFeatureBundling( *bin_mappers, sample_non_zero_indices, sample_values, num_per_col, num_sample_col, static_cast(total_sample_cnt), diff --git a/src/io/dense_bin.hpp b/src/io/dense_bin.hpp index f0405bc318e..48f0c4dc587 100644 --- a/src/io/dense_bin.hpp +++ b/src/io/dense_bin.hpp @@ -7,17 +7,13 @@ #define LIGHTGBM_IO_DENSE_BIN_HPP_ #include +#include +#include #include #include #include -#ifdef USE_CUDA -#include // LGBM_CUDA -#endif - -#include // LGBM_CUDA - namespace LightGBM { template @@ -368,7 +364,6 @@ class DenseBin : public Bin { data_size_t num_data() const override { return 
num_data_; } - // LGBM_CUDA void* get_data() override { return data_.data(); } void FinishLoad() override { @@ -466,7 +461,7 @@ class DenseBin : public Bin { private: data_size_t num_data_; #ifdef USE_CUDA - std::vector> data_; // LGBM_CUDA + std::vector> data_; #else std::vector> data_; #endif diff --git a/src/io/sparse_bin.hpp b/src/io/sparse_bin.hpp index c56cd6da99d..7476f9a0c24 100644 --- a/src/io/sparse_bin.hpp +++ b/src/io/sparse_bin.hpp @@ -408,7 +408,6 @@ class SparseBin : public Bin { data_size_t num_data() const override { return num_data_; } - // LGBM_CUDA void* get_data() override { return nullptr; } void FinishLoad() override { diff --git a/src/main.cpp b/src/main.cpp index ef277ac0c1f..8034da82681 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -11,10 +11,6 @@ int main(int argc, char** argv) { bool success = false; try { - // LGBM_CUDA - std::chrono::duration main_time; - auto start_main_time = std::chrono::steady_clock::now(); - LightGBM::Application app(argc, argv); app.Run(); @@ -22,9 +18,6 @@ int main(int argc, char** argv) { LightGBM::Linkers::MpiFinalizeIfIsParallel(); #endif - // LGBM_CUDA - main_time = std::chrono::steady_clock::now() - start_main_time; - LightGBM::Log::Info("main::main time: %f sec", main_time * 1e-3); success = true; } catch (const std::exception& ex) { diff --git a/src/treelearner/data_parallel_tree_learner.cpp b/src/treelearner/data_parallel_tree_learner.cpp index 70e6d98354f..30d8df84acf 100644 --- a/src/treelearner/data_parallel_tree_learner.cpp +++ b/src/treelearner/data_parallel_tree_learner.cpp @@ -256,7 +256,7 @@ void DataParallelTreeLearner::Split(Tree* tree, int best_Leaf, in } // instantiate template classes, otherwise linker cannot find the code -template class DataParallelTreeLearner; // LGBM_CUDA +template class DataParallelTreeLearner; template class DataParallelTreeLearner; template class DataParallelTreeLearner; diff --git a/src/treelearner/feature_parallel_tree_learner.cpp b/src/treelearner/feature_parallel_tree_learner.cpp index 69809e6069c..f4edfe03dc1 100644 --- a/src/treelearner/feature_parallel_tree_learner.cpp +++ b/src/treelearner/feature_parallel_tree_learner.cpp @@ -77,7 +77,7 @@ void FeatureParallelTreeLearner::FindBestSplitsFromHistograms( } // instantiate template classes, otherwise linker cannot find the code -template class FeatureParallelTreeLearner; // LGBM_CUDA +template class FeatureParallelTreeLearner; template class FeatureParallelTreeLearner; template class FeatureParallelTreeLearner; } // namespace LightGBM diff --git a/src/treelearner/tree_learner.cpp b/src/treelearner/tree_learner.cpp index d47b469f950..63ca1b2de83 100644 --- a/src/treelearner/tree_learner.cpp +++ b/src/treelearner/tree_learner.cpp @@ -5,7 +5,7 @@ #include #include "gpu_tree_learner.h" -#include "cuda_tree_learner.h" // LGBM_CUDA +#include "cuda_tree_learner.h" #include "parallel_tree_learner.h" #include "serial_tree_learner.h" @@ -32,7 +32,7 @@ TreeLearner* TreeLearner::CreateTreeLearner(const std::string& learner_type, con } else if (learner_type == std::string("voting")) { return new VotingParallelTreeLearner(config); } - } else if (device_type == std::string("cuda")) { // LGBM_CUDA + } else if (device_type == std::string("cuda")) { if (learner_type == std::string("serial")) { return new CUDATreeLearner(config); } else if (learner_type == std::string("feature")) { diff --git a/src/treelearner/voting_parallel_tree_learner.cpp b/src/treelearner/voting_parallel_tree_learner.cpp index 265e94757aa..51ee2096380 100644 --- 
a/src/treelearner/voting_parallel_tree_learner.cpp +++ b/src/treelearner/voting_parallel_tree_learner.cpp @@ -454,7 +454,7 @@ void VotingParallelTreeLearner::Split(Tree* tree, int best_Leaf, } // instantiate template classes, otherwise linker cannot find the code -template class VotingParallelTreeLearner; // LGBM_CUDA +template class VotingParallelTreeLearner; template class VotingParallelTreeLearner; template class VotingParallelTreeLearner; } // namespace LightGBM From ac5f7b8b8fde7f113d7d021d6e15e22e7fc0dca4 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 13 Jul 2020 19:38:22 +0000 Subject: [PATCH 088/119] Removal of FIXME comments and unused defines. --- src/treelearner/cuda_tree_learner.cpp | 4 +- src/treelearner/cuda_tree_learner.h | 1 - .../kernels/histogram_16_64_256.cu | 42 ------------------- .../kernels/histogram_16_64_256.hu | 20 --------- 4 files changed, 2 insertions(+), 65 deletions(-) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 7f32c23c398..0538b849e34 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -357,7 +357,7 @@ void CUDATreeLearner::AllocateGPUMemory() { if (num_gpu_feature_groups) { CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); - // allocate memory for all features (FIXME: 4 GB barrier on some devices, need to split to multiple buffers) + // allocate memory for all features if ( device_features_[device_id] != NULL ) { CUDASUCCESS_OR_FATAL(cudaFree(device_features_[device_id])); } @@ -814,7 +814,7 @@ bool CUDATreeLearner::ConstructGPUHistogramsAsync( // if not all feature groups are used, we need to transfer the feature mask to GPU // otherwise, we will use a specialized GPU kernel with all feature groups enabled - // LGBM_CUDA FIXME: No waiting mark for feature mask + // LGBM_CUDA // LGBM_CUDA We now copy even if all features are used. diff --git a/src/treelearner/cuda_tree_learner.h b/src/treelearner/cuda_tree_learner.h index 46e31985a5d..034a31f6d3a 100644 --- a/src/treelearner/cuda_tree_learner.h +++ b/src/treelearner/cuda_tree_learner.h @@ -243,7 +243,6 @@ class CUDATreeLearner: public SerialTreeLearner { // char *device_subhistograms_; std::vector device_subhistograms_; /*! \brief Host memory object for histogram output (GPU will write to Host memory directly) */ - // FIXME: is this cuda mapped // void *device_histogram_outputs_; std::vector device_histogram_outputs_; /*! 
\brief Host memory pointer for histogram outputs */ diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index 23e9b150a1f..994a3c94170 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -105,7 +105,6 @@ __kernel void KERNEL_NAME(__global const uchar* restrict feature_data_base, const size_t power_feature_workgroups) { #else __global__ void KERNEL_NAME(const uchar* feature_data_base, - // FIXME: how to handle this __constant const uchar* __restrict__ feature_masks, const data_size_t feature_size, const data_size_t* data_indices, @@ -293,18 +292,15 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type *__restrict__ ptr_f = output; for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { // even threads read gradients, odd threads read hessians - // FIXME: 2-way bank conflict acc_type value = gh_hist[i]; ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; } // write counts acc_int_type *__restrict__ ptr_i = reinterpret_cast(output + 2 * NUM_BINS); for (ushort i = ltid; i < NUM_BINS; i += lsize) { - // FIXME: 2-way bank conflict uint value = cnt_hist[i]; ptr_i[i] = value; } - // FIXME: is this right __syncthreads(); __threadfence(); // To avoid the cost of an extra reducting kernel, we have to deal with some @@ -312,19 +308,9 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // make the final reduction, and other threads will just quit. // This requires that the results written by other workgroups available to the // last workgroup (memory consistency) - #if NVIDIA == 1 // this is equavalent to CUDA __threadfence(); // ensure the writes above goes to main memory and other workgroups can see it asm volatile("{\n\tmembar.gl;\n\t}\n\t" :::"memory"); - #else - // FIXME: how to do the above on AMD GPUs?? - // GCN ISA says that the all writes will bypass L1 cache (write through), - // however when the last thread is reading sub-histogram data we have to - // make sure that no part of data is modified in local L1 cache of other workgroups. - // Otherwise reading can be a problem (atomic operations to get consistency). - // But in our case, the sub-histogram of this workgroup cannot be in the cache - // of another workgroup, so the following trick will work just fine. - #endif // Now, we want one workgroup to do the final reduction. // Other workgroups processing the same feature quit. // The is done by using an global atomic counter. 
@@ -451,7 +437,6 @@ __kernel void KERNEL_NAME(__global const uchar* restrict feature_data_base, const size_t power_feature_workgroups) { #else __global__ void KERNEL_NAME(const uchar* feature_data_base, - // FIXME: how to handle this __constant const uchar* __restrict__ feature_masks, const data_size_t feature_size, const data_size_t* data_indices, @@ -637,18 +622,15 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type *__restrict__ ptr_f = output; for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { // even threads read gradients, odd threads read hessians - // FIXME: 2-way bank conflict acc_type value = gh_hist[i]; ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; } // write counts acc_int_type *__restrict__ ptr_i = reinterpret_cast(output + 2 * NUM_BINS); for (ushort i = ltid; i < NUM_BINS; i += lsize) { - // FIXME: 2-way bank conflict uint value = cnt_hist[i]; ptr_i[i] = value; } - // FIXME: is this right __syncthreads(); __threadfence(); // To avoid the cost of an extra reducting kernel, we have to deal with some @@ -656,19 +638,9 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // make the final reduction, and other threads will just quit. // This requires that the results written by other workgroups available to the // last workgroup (memory consistency) - #if NVIDIA == 1 // this is equavalent to CUDA __threadfence(); // ensure the writes above goes to main memory and other workgroups can see it asm volatile("{\n\tmembar.gl;\n\t}\n\t" :::"memory"); - #else - // FIXME: how to do the above on AMD GPUs?? - // GCN ISA says that the all writes will bypass L1 cache (write through), - // however when the last thread is reading sub-histogram data we have to - // make sure that no part of data is modified in local L1 cache of other workgroups. - // Otherwise reading can be a problem (atomic operations to get consistency). - // But in our case, the sub-histogram of this workgroup cannot be in the cache - // of another workgroup, so the following trick will work just fine. - #endif // Now, we want one workgroup to do the final reduction. // Other workgroups processing the same feature quit. // The is done by using an global atomic counter. @@ -796,7 +768,6 @@ __kernel void KERNEL_NAME(__global const uchar* restrict feature_data_base, const size_t power_feature_workgroups) { #else __global__ void KERNEL_NAME(const uchar* feature_data_base, - // FIXME: how to handle this __constant const uchar* __restrict__ feature_masks, const data_size_t feature_size, const data_size_t* data_indices, @@ -982,18 +953,15 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type *__restrict__ ptr_f = output; for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { // even threads read gradients, odd threads read hessians - // FIXME: 2-way bank conflict acc_type value = gh_hist[i]; ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; } // write counts acc_int_type *__restrict__ ptr_i = reinterpret_cast(output + 2 * NUM_BINS); for (ushort i = ltid; i < NUM_BINS; i += lsize) { - // FIXME: 2-way bank conflict uint value = cnt_hist[i]; ptr_i[i] = value; } - // FIXME: is this right __syncthreads(); __threadfence(); // To avoid the cost of an extra reducting kernel, we have to deal with some @@ -1001,19 +969,9 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // make the final reduction, and other threads will just quit. 
// This requires that the results written by other workgroups available to the // last workgroup (memory consistency) - #if NVIDIA == 1 // this is equavalent to CUDA __threadfence(); // ensure the writes above goes to main memory and other workgroups can see it asm volatile("{\n\tmembar.gl;\n\t}\n\t" :::"memory"); - #else - // FIXME: how to do the above on AMD GPUs?? - // GCN ISA says that the all writes will bypass L1 cache (write through), - // however when the last thread is reading sub-histogram data we have to - // make sure that no part of data is modified in local L1 cache of other workgroups. - // Otherwise reading can be a problem (atomic operations to get consistency). - // But in our case, the sub-histogram of this workgroup cannot be in the cache - // of another workgroup, so the following trick will work just fine. - #endif // Now, we want one workgroup to do the final reduction. // Other workgroups processing the same feature quit. // The is done by using an global atomic counter. diff --git a/src/treelearner/kernels/histogram_16_64_256.hu b/src/treelearner/kernels/histogram_16_64_256.hu index 4dfcb9f7895..86400ae84e6 100644 --- a/src/treelearner/kernels/histogram_16_64_256.hu +++ b/src/treelearner/kernels/histogram_16_64_256.hu @@ -77,34 +77,14 @@ typedef uint acc_int_type; #define as_acc_int_type as_uint #endif -// unroll the atomic operation for a few times. Takes more code space, -// but compiler can generate better code for faster atomics. -#define UNROLL_ATOMIC 1 - -// Options passed by compiler at run time: -// IGNORE_INDICES will be set when the kernel does not -//#define IGNORE_INDICES -//#define POWER_FEATURE_WORKGROUPS 10 - -// detect Nvidia platforms -#ifdef cl_nv_pragma_unroll -#define NVIDIA 1 -#endif - // use all features and do not use feature mask #ifndef ENABLE_ALL_FEATURES #define ENABLE_ALL_FEATURES 1 #endif -// use binary patching for AMD GCN 1.2 or newer -#ifndef AMD_USE_DS_ADD_F32 -#define AMD_USE_DS_ADD_F32 0 -#endif - typedef uint data_size_t; typedef float score_t; - // define all of the different kernels #define DECLARE_CONST_BUF(name) \ From 63d75e978963d308b59eac0f2e7c02bbb256e891 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 13 Jul 2020 20:36:19 +0000 Subject: [PATCH 089/119] More reviewers comments cleanup. --- include/LightGBM/cuda/cuda_utils.h | 4 +- include/LightGBM/cuda/vector_cudahost.h | 4 +- src/treelearner/cuda_tree_learner.cpp | 115 ++++-------------- src/treelearner/cuda_tree_learner.h | 10 +- .../kernels/histogram_16_64_256.hu | 4 +- src/treelearner/serial_tree_learner.h | 8 +- 6 files changed, 35 insertions(+), 110 deletions(-) diff --git a/include/LightGBM/cuda/cuda_utils.h b/include/LightGBM/cuda/cuda_utils.h index af787315559..3c0264cb396 100644 --- a/include/LightGBM/cuda/cuda_utils.h +++ b/include/LightGBM/cuda/cuda_utils.h @@ -2,8 +2,8 @@ * Copyright (c) 2020 IBM Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ -#ifndef LGBM_CUDA_UTILS_H -#define LGBM_CUDA_UTILS_H +#ifndef LIGHTGBM_CUDA_UTILS_H_ +#define LIGHTGBM_CUDA_UTILS_H_ #ifdef USE_CUDA diff --git a/include/LightGBM/cuda/vector_cudahost.h b/include/LightGBM/cuda/vector_cudahost.h index d73fabe25fa..a5d97370261 100644 --- a/include/LightGBM/cuda/vector_cudahost.h +++ b/include/LightGBM/cuda/vector_cudahost.h @@ -2,8 +2,8 @@ * Copyright (c) 2020 IBM Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. 
*/ -#ifndef LGBM_CUDA_VECTOR_CH_H -#define LGBM_CUDA_VECTOR_CH_H +#ifndef LIGHTGBM_CUDA_VECTOR_CUDAHOST_H_ +#define LIGHTGBM_CUDA_VECTOR_CUDAHOST_H_ #ifdef USE_CUDA #include diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 0538b849e34..4183019ed85 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -48,11 +48,6 @@ static void *launch_cuda_histogram(void *thread_data) { return NULL; } -/* -static void *wait_event(void *wait_obj) { - CUDASUCCESS_OR_FATAL(cudaEventSynchronize(*(cudaEvent_t *)wait_obj)); -}*/ - namespace LightGBM { CUDATreeLearner::CUDATreeLearner(const Config* config) @@ -60,9 +55,9 @@ CUDATreeLearner::CUDATreeLearner(const Config* config) use_bagging_ = false; nthreads_ = 0; if (config->gpu_use_dp && USE_DP_FLOAT) { - Log::Info("LightGBM-CUDA using CUDA trainer with DP float!!"); + Log::Info("LightGBM using CUDA trainer with DP float!!"); } else { - Log::Info("LightGBM-CUDA using CUDA trainer with SP float!!"); + Log::Info("LightGBM using CUDA trainer with SP float!!"); } } @@ -78,7 +73,7 @@ void CUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) num_feature_groups_ = train_data_->num_feature_groups(); // Initialize GPU buffers and kernels & LGBM_CUDA: get device info - InitGPU(config_->num_gpu); // LGBM_CUDA + InitGPU(config_->num_gpu); } // some functions used for debugging the GPU histogram construction @@ -192,7 +187,6 @@ void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featu // set work group size based on feature size // each 2^exp_workgroups_per_feature workgroups work on a feature4 tuple - int exp_workgroups_per_feature = GetNumWorkgroupsPerFeature(leaf_num_data); std::vector num_gpu_workgroups; ThreadData *thread_data = reinterpret_cast(malloc(sizeof(ThreadData) * num_gpu_)); @@ -245,11 +239,8 @@ void CUDATreeLearner::WaitAndGetHistograms(FeatureHistogram* leaf_histogram_arra #pragma omp parallel for schedule(static, num_gpu_) for (int device_id = 0; device_id < num_gpu_; ++device_id) { - // auto start_time = std::chrono::steady_clock::now(); - // when the output is ready, the computation is done CUDASUCCESS_OR_FATAL(cudaEventSynchronize(histograms_wait_obj_[device_id])); - // LGBM_CUDA } HistType* histograms = reinterpret_cast(leaf_histogram_array[0].RawData() - kHistOffset); @@ -269,7 +260,6 @@ void CUDATreeLearner::WaitAndGetHistograms(FeatureHistogram* leaf_histogram_arra } } -// LGBM_CUDA void CUDATreeLearner::CountDenseFeatureGroups() { num_dense_feature_groups_ = 0; @@ -283,7 +273,6 @@ void CUDATreeLearner::CountDenseFeatureGroups() { } } -// LGBM_CUDA void CUDATreeLearner::prevAllocateGPUMemory() { // how many feature-group tuples we have // leave some safe margin for prefetching @@ -292,7 +281,6 @@ void CUDATreeLearner::prevAllocateGPUMemory() { allocated_num_data_ = std::max(num_data_ + 256 * (1 << kMaxLogWorkgroupsPerFeature), allocated_num_data_); // clear sparse/dense maps - dense_feature_group_map_.clear(); sparse_feature_group_map_.clear(); @@ -315,18 +303,6 @@ void CUDATreeLearner::prevAllocateGPUMemory() { offset += num_gpu_feature_groups_.at(i); } -#if 0 - // allocate feature mask, for disabling some feature-groups' histogram calculation - if (feature_masks_.data() != NULL) { - cudaPointerAttributes attributes; - cudaPointerGetAttributes(&attributes, feature_masks_.data()); - - if ((attributes.type == cudaMemoryTypeHost) && (attributes.devicePointer != NULL)) { - 
CUDASUCCESS_OR_FATAL(cudaHostUnregister(feature_masks_.data())); - } - } -#endif - feature_masks_.resize(num_dense_feature_groups_); Log::Debug("Resized feature masks"); @@ -337,12 +313,8 @@ void CUDATreeLearner::prevAllocateGPUMemory() { // histogram bin entry size depends on the precision (single/double) hist_bin_entry_sz_ = 2 * (config_->gpu_use_dp ? sizeof(hist_t) : sizeof(gpu_hist_t)); // two elements in this "size" - // host_size histogram outputs - // host_histogram_outputs_ = malloc(num_dense_feature_groups_ * device_bin_size_ * hist_bin_entry_sz_); - CUDASUCCESS_OR_FATAL(cudaHostAlloc(reinterpret_cast(&host_histogram_outputs_), (size_t)(num_dense_feature_groups_ * device_bin_size_ * hist_bin_entry_sz_), cudaHostAllocPortable)); - // LGBM_CUDA nthreads_ = std::min(omp_get_max_threads(), num_dense_feature_groups_ / dword_features_); nthreads_ = std::max(nthreads_, 1); } @@ -367,7 +339,6 @@ void CUDATreeLearner::AllocateGPUMemory() { // allocate space for gradients and hessians on device // we will copy gradients and hessians in after ordered_gradients_ and ordered_hessians_ are constructed - if (device_gradients_[device_id] != NULL) { CUDASUCCESS_OR_FATAL(cudaFree(device_gradients_[device_id])); } @@ -386,10 +357,9 @@ void CUDATreeLearner::AllocateGPUMemory() { CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_feature_masks_[device_id]), (size_t) num_gpu_feature_groups)); // copy indices to the device - - if (device_data_indices_[device_id] != NULL) { - CUDASUCCESS_OR_FATAL(cudaFree(device_data_indices_[device_id])); - } + if (device_data_indices_[device_id] != NULL) { + CUDASUCCESS_OR_FATAL(cudaFree(device_data_indices_[device_id])); + } CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_data_indices_[device_id]), (size_t) allocated_num_data_ * sizeof(data_size_t))); CUDASUCCESS_OR_FATAL(cudaMemsetAsync(device_data_indices_[device_id], 0, allocated_num_data_ * sizeof(data_size_t), stream_[device_id])); @@ -398,7 +368,6 @@ void CUDATreeLearner::AllocateGPUMemory() { // create output buffer, each feature has a histogram with device_bin_size_ bins, // each work group generates a sub-histogram of dword_features_ features. 
- if (!device_subhistograms_[device_id]) { // only initialize once here, as this will not need to change when ResetTrainingData() is called CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_subhistograms_[device_id]), (size_t) preallocd_max_num_wg_[device_id] * dword_features_ * device_bin_size_ * (3 * hist_bin_entry_sz_ / 2))); @@ -424,14 +393,12 @@ void CUDATreeLearner::ResetGPUMemory() { sparse_feature_group_map_.clear(); } -// LGBM_CUDA void CUDATreeLearner::copyDenseFeature() { if (num_feature_groups_ == 0) { LGBM_config_::current_learner = use_cpu_learner; return; } -// auto start_time = std::chrono::steady_clock::now(); Log::Debug("Started copying dense features from CPU to GPU"); // find the dense feature-groups and group then into Feature4 data structure (several feature-groups packed into 4 bytes) size_t copied_feature = 0; @@ -465,9 +432,6 @@ void CUDATreeLearner::copyDenseFeature() { sparse_feature_group_map_.push_back(i); } } - - // data transfer time // LGBM_CUDA: async copy, so it is not the real data transfer time - // std::chrono::duration end_time = std::chrono::steady_clock::now() - start_time; } @@ -495,18 +459,18 @@ void CUDATreeLearner::InitGPU(int num_gpu) { #endif if (max_num_bin_ <= 16) { - device_bin_size_ = 16; // LGBM_CUDA + device_bin_size_ = 16; histogram_size_ = 16; - dword_features_ = 1; // LGBM_CUDA + dword_features_ = 1; } else if (max_num_bin_ <= 64) { - device_bin_size_ = 64; // LGBM_CUDA + device_bin_size_ = 64; histogram_size_ = 64; - dword_features_ = 1; // LGBM_CUDA + dword_features_ = 1; } else if (max_num_bin_ <= 256) { Log::Debug("device_bin_size_ = 256"); device_bin_size_ = 256; histogram_size_ = 256; - dword_features_ = 1; // LGBM_CUDA + dword_features_ = 1; } else { Log::Fatal("bin size %d cannot run on GPU", max_num_bin_); } @@ -555,11 +519,6 @@ void CUDATreeLearner::InitGPU(int num_gpu) { kernel_wait_obj_.resize(num_gpu_); histograms_wait_obj_.resize(num_gpu_); - // for debuging - kernel_time_.resize(num_gpu_, 0); - kernel_input_wait_time_.resize(num_gpu_, std::chrono::milliseconds(0)); - // kernel_output_wait_time_.resize(num_gpu_, std::chrono::milliseconds(0)); - for (int i = 0; i < num_gpu_; ++i) { CUDASUCCESS_OR_FATAL(cudaSetDevice(i)); CUDASUCCESS_OR_FATAL(cudaStreamCreate(&(stream_[i]))); @@ -577,7 +536,7 @@ void CUDATreeLearner::InitGPU(int num_gpu) { AllocateGPUMemory(); - copyDenseFeature(); // LGBM_CUDA + copyDenseFeature(); } Tree* CUDATreeLearner::Train(const score_t* gradients, const score_t *hessians) { @@ -591,20 +550,18 @@ void CUDATreeLearner::ResetTrainingDataInner(const Dataset* train_data, bool is_ SerialTreeLearner::ResetTrainingDataInner(train_data, is_constant_hessian, reset_multi_val_bin); - #if ResetTrainingData_DEBUG == 1 // LGBM_CUDA + #if ResetTrainingData_DEBUG == 1 serial_time = std::chrono::steady_clock::now() - start_serial_time; #endif num_feature_groups_ = train_data_->num_feature_groups(); // GPU memory has to been reallocated because data may have been changed - - #if ResetTrainingData_DEBUG == 1 // LGBM_CUDA + #if ResetTrainingData_DEBUG == 1 auto start_alloc_gpu_time = std::chrono::steady_clock::now(); #endif // LGBM_CUDA: AllocateGPUMemory only when the number of data increased - int old_num_feature_groups = num_dense_feature_groups_; CountDenseFeatureGroups(); if ((old_allocated_num_data < (num_data_ + 256 * (1 << kMaxLogWorkgroupsPerFeature))) || (old_num_feature_groups < num_dense_feature_groups_)) { @@ -616,17 +573,16 @@ void CUDATreeLearner::ResetTrainingDataInner(const Dataset* train_data, bool is_ 
copyDenseFeature(); - #if ResetTrainingData_DEBUG == 1 // LGBM_CUDA + #if ResetTrainingData_DEBUG == 1 alloc_gpu_time = std::chrono::steady_clock::now() - start_alloc_gpu_time; #endif // setup GPU kernel arguments after we allocating all the buffers - - #if ResetTrainingData_DEBUG == 1 // LGBM_CUDA + #if ResetTrainingData_DEBUG == 1 auto start_set_arg_time = std::chrono::steady_clock::now(); #endif - #if ResetTrainingData_DEBUG == 1 // LGBM_CUDA + #if ResetTrainingData_DEBUG == 1 set_arg_time = std::chrono::steady_clock::now() - start_set_arg_time; reset_training_data_time = std::chrono::steady_clock::now() - start_reset_training_data_time; Log::Info("reset_training_data_time: %f secs.", reset_training_data_time.count() * 1e-3); @@ -637,7 +593,7 @@ void CUDATreeLearner::ResetTrainingDataInner(const Dataset* train_data, bool is_ } void CUDATreeLearner::BeforeTrain() { - #if cudaMemcpy_DEBUG == 1 // LGBM_CUDA + #if cudaMemcpy_DEBUG == 1 std::chrono::duration device_hessians_time = std::chrono::milliseconds(0); std::chrono::duration device_gradients_time = std::chrono::milliseconds(0); #endif @@ -650,7 +606,6 @@ void CUDATreeLearner::BeforeTrain() { // Copy initial full hessians and gradients to GPU. // We start copying as early as possible, instead of at ConstructHistogram(). - if ((hessians_ != NULL) && (gradients_ != NULL)) { if (!use_bagging_ && num_dense_feature_groups_) { Log::Debug("CudaTreeLearner::BeforeTrain() No baggings, dense_feature_groups_=%d", num_dense_feature_groups_); @@ -659,32 +614,29 @@ void CUDATreeLearner::BeforeTrain() { if (!(share_state_->is_constant_hessian)) { Log::Debug("CUDATreeLearner::BeforeTrain(): Starting hessians_ -> device_hessians_"); - #if cudaMemcpy_DEBUG == 1 // LGBM_CUDA + #if cudaMemcpy_DEBUG == 1 auto start_device_hessians_time = std::chrono::steady_clock::now(); #endif - // const data_size_t* indices = data_partition_->indices(); - // data_size_t cnt = data_partition_->leaf_count(0); - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_hessians_[device_id], hessians_, num_data_*sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); CUDASUCCESS_OR_FATAL(cudaEventRecord(hessians_future_[device_id], stream_[device_id])); - #if cudaMemcpy_DEBUG == 1 // LGBM_CUDA + #if cudaMemcpy_DEBUG == 1 device_hessians_time = std::chrono::steady_clock::now() - start_device_hessians_time; #endif Log::Debug("queued copy of device_hessians_"); } - #if cudaMemcpy_DEBUG == 1 // LGBM_CUDA + #if cudaMemcpy_DEBUG == 1 auto start_device_gradients_time = std::chrono::steady_clock::now(); #endif CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_gradients_[device_id], gradients_, num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); CUDASUCCESS_OR_FATAL(cudaEventRecord(gradients_future_[device_id], stream_[device_id])); - #if cudaMemcpy_DEBUG == 1 // LGBM_CUDA + #if cudaMemcpy_DEBUG == 1 device_gradients_time = std::chrono::steady_clock::now() - start_device_gradients_time; #endif @@ -693,16 +645,11 @@ void CUDATreeLearner::BeforeTrain() { } } -#if 0 - SerialTreeLearner::BeforeTrain(); -#endif - // use bagging if ((hessians_ != NULL) && (gradients_ != NULL)) { if (data_partition_->leaf_count(0) != num_data_ && num_dense_feature_groups_) { // On GPU, we start copying indices, gradients and hessians now, instead at ConstructHistogram() // copy used gradients and hessians to ordered buffer - const data_size_t* indices = data_partition_->indices(); data_size_t cnt = data_partition_->leaf_count(0); @@ -747,10 +694,6 @@ bool CUDATreeLearner::BeforeFindBestSplit(const 
Tree* tree, int left_leaf, int r data_size_t begin = data_partition_->leaf_begin(smaller_leaf); data_size_t end = begin + data_partition_->leaf_count(smaller_leaf); - // copy indices to the GPU: - #if GPU_DEBUG >= 2 - #endif - for (int device_id = 0; device_id < num_gpu_; ++device_id) { CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_data_indices_[device_id], &indices[begin], (end-begin) * sizeof(data_size_t), cudaMemcpyHostToDevice, stream_[device_id])); CUDASUCCESS_OR_FATAL(cudaEventRecord(indices_future_[device_id], stream_[device_id])); @@ -789,8 +732,8 @@ bool CUDATreeLearner::ConstructGPUHistogramsAsync( #pragma omp parallel for schedule(static, 1024) if (num_features_ >= 2048) for (int i = 0; i < num_features_; ++i) { if (is_feature_used[i]) { - int feature_group = train_data_->Feature2Group(i); // LGBM_CUDA - is_feature_group_used[feature_group] = (train_data_->FeatureGroupNumBin(feature_group) <= 16) ? 2 : 1; // LGBM_CUDA + int feature_group = train_data_->Feature2Group(i); + is_feature_group_used[feature_group] = (train_data_->FeatureGroupNumBin(feature_group) <= 16) ? 2 : 1; } } @@ -814,10 +757,8 @@ bool CUDATreeLearner::ConstructGPUHistogramsAsync( // if not all feature groups are used, we need to transfer the feature mask to GPU // otherwise, we will use a specialized GPU kernel with all feature groups enabled - // LGBM_CUDA // LGBM_CUDA We now copy even if all features are used. - #pragma omp parallel for schedule(static, num_gpu_) for (int device_id = 0; device_id < num_gpu_; ++device_id) { int offset = offset_gpu_feature_groups_[device_id]; @@ -825,16 +766,12 @@ bool CUDATreeLearner::ConstructGPUHistogramsAsync( } // All data have been prepared, now run the GPU kernel - GPUHistogram(num_data, use_all_features); return true; } void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_used, bool use_subtract) { - // LGBM_CUDA - // auto start_time = std::chrono::steady_clock::now(); - std::vector is_sparse_feature_used(num_features_, 0); std::vector is_dense_feature_used(num_features_, 0); int num_dense_features = 0, num_sparse_features = 0; @@ -958,12 +895,10 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ } std::copy(gpu_histogram, gpu_histogram + size * 2, current_histogram); delete [] gpu_histogram; - // break; // LGBM_CUDA: see only first feature info } printf("End Comparing Histogram between GPU and CPU\n"); fflush(stderr); fflush(stdout); -// #endif #endif if (larger_leaf_histogram_array_ != nullptr && !use_subtract) { @@ -976,7 +911,6 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ // then construct sparse features on CPU // We set data_indices to null to avoid rebuilding ordered gradients/hessians - if (num_sparse_features > 0) { train_data_->ConstructHistograms(is_sparse_feature_used, larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(), @@ -987,7 +921,6 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ } // wait for GPU to finish, only if GPU is actually used - if (is_gpu_used) { if (config_->gpu_use_dp) { // use double precision diff --git a/src/treelearner/cuda_tree_learner.h b/src/treelearner/cuda_tree_learner.h index 034a31f6d3a..7385d2af5dc 100644 --- a/src/treelearner/cuda_tree_learner.h +++ b/src/treelearner/cuda_tree_learner.h @@ -3,8 +3,8 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. 
*/ #pragma once -#ifndef LGBM_TREELEARNER_CUDA_TREE_LEARNER_H_ -#define LGBM_TREELEARNET_CUDA_TREE_LEARNER_H_ +#ifndef LIGHTGBM_TREELEARNER_CUDA_TREE_LEARNER_H_ +#define LIGHTGBM_TREELEARNET_CUDA_TREE_LEARNER_H_ #include #include @@ -64,10 +64,6 @@ class CUDATreeLearner: public SerialTreeLearner { void ConstructHistograms(const std::vector& is_feature_used, bool use_subtract) override; private: - /*! \brief 4-byte feature tuple used by GPU kernels */ - // struct Feature4 { - // uint8_t s[4]; - // }; typedef float gpu_hist_t; /*! @@ -297,4 +293,4 @@ class CUDATreeLearner: public SerialTreeLearner { } // namespace LightGBM #endif // USE_CUDA -#endif // LGBM_TREELEARNER_CUDA_TREE_LEARNER_H_ +#endif // LIGHTGBM_TREELEARNER_CUDA_TREE_LEARNER_H_ diff --git a/src/treelearner/kernels/histogram_16_64_256.hu b/src/treelearner/kernels/histogram_16_64_256.hu index 86400ae84e6..a1c2744c624 100644 --- a/src/treelearner/kernels/histogram_16_64_256.hu +++ b/src/treelearner/kernels/histogram_16_64_256.hu @@ -12,8 +12,8 @@ * disclosure restricted by GSA ADP Schedule Contract with IBM Corp. */ -#ifndef _HISTOGRAM_16_64_256_KERNEL_ -#define _HISTOGRAM_16_64_256_KERNEL_ +#ifndef LIGHTGBM_TREELEARNER_KERNELS_HISTOGRAM_16_64_256_HU_ +#define LIGHTGBM_TREELEARNER_KERNELS_HISTOGRAM_16_64_256_HU_ //#pragma once diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 79882ded79e..fc1de33e365 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -26,11 +27,6 @@ #include "monotone_constraints.hpp" #include "split_info.hpp" -// LGBM_CUDA -#ifdef USE_CUDA -#include -#endif - #ifdef USE_GPU // Use 4KBytes aligned allocator for ordered gradients and ordered hessians when GPU is enabled. // This is necessary to pin the two arrays in memory and make transferring faster. @@ -206,7 +202,7 @@ class SerialTreeLearner: public TreeLearner { std::vector> ordered_gradients_; /*! \brief hessians of current iteration, ordered for cache optimized, aligned to 4K page */ std::vector> ordered_hessians_; -#elif USE_CUDA // LGBM_CUDA +#elif USE_CUDA /*! \brief gradients of current iteration, ordered for cache optimized */ std::vector> ordered_gradients_; /*! \brief hessians of current iteration, ordered for cache optimized */ From 6fee44a9dfa63f24ca2c7db0fa967e0217f6951f Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 13 Jul 2020 20:39:11 +0000 Subject: [PATCH 090/119] More reviewers comments cleanup. --- src/treelearner/cuda_kernel_launcher.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/treelearner/cuda_kernel_launcher.h b/src/treelearner/cuda_kernel_launcher.h index 57c5f1bfc26..6b6b7cb4b01 100644 --- a/src/treelearner/cuda_kernel_launcher.h +++ b/src/treelearner/cuda_kernel_launcher.h @@ -2,11 +2,10 @@ * Copyright (c) 2020 IBM Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ -#ifndef LGBM_KERNEL_LAUNCHER -#define LGBM_KERNEL_LAUNCHER +#ifndef LIGHTGBM_TREELEARNER_CUDA_KERNEL_LAUNCHER_H_ +#define LIGHTGBM_TREELEARNER_CUDA_KERNEL_LAUNCHER_H_ #ifdef USE_CUDA -// what should I include?? 
#include #include "kernels/histogram_16_64_256.hu" // kernel, acc_type, data_size_t, uchar, score_t @@ -67,4 +66,4 @@ void cuda_histogram( #endif // USE_CUDA -#endif // LGBM_KERNEL_LAUNCHER +#endif // LIGHTGBM_TREELEARNER_CUDA_KERNEL_LAUNCHER_H_ From cc41446a8eccdf496a41904a2ea11839f6e10452 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Tue, 14 Jul 2020 13:19:14 +0000 Subject: [PATCH 091/119] More reviewers comments cleanup. --- src/treelearner/cuda_tree_learner.cpp | 7 ------- src/treelearner/cuda_tree_learner.h | 2 -- src/treelearner/kernels/histogram_16_64_256.cu | 13 ++----------- src/treelearner/kernels/histogram_16_64_256.hu | 13 ++----------- 4 files changed, 4 insertions(+), 31 deletions(-) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 4183019ed85..7a0a5d59093 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -163,7 +163,6 @@ int CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int d int CUDATreeLearner::GetNumWorkgroupsPerFeature(data_size_t leaf_num_data) { // we roughly want 256 workgroups per device, and we have num_dense_feature4_ feature tuples. // also guarantee that there are at least 2K examples per workgroup - double x = 256.0 / num_dense_feature_groups_; int exp_workgroups_per_feature = static_cast(ceil(log2(x))); @@ -186,7 +185,6 @@ void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featu // decide the best number of workgroups working on one feature4 tuple // set work group size based on feature size // each 2^exp_workgroups_per_feature workgroups work on a feature4 tuple - int exp_workgroups_per_feature = GetNumWorkgroupsPerFeature(leaf_num_data); std::vector num_gpu_workgroups; ThreadData *thread_data = reinterpret_cast(malloc(sizeof(ThreadData) * num_gpu_)); @@ -213,7 +211,6 @@ void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featu } /* Wait for the threads to finish */ - for (int device_id = 0; device_id < num_gpu_; ++device_id) { if (pthread_join(*(cpu_threads_[device_id]), NULL)) { fprintf(stderr, "Error in joining threads. 
Exiting\n"); @@ -439,7 +436,6 @@ void CUDATreeLearner::copyDenseFeature() { // LGBM_CUDA: InitGPU w/ num_gpu void CUDATreeLearner::InitGPU(int num_gpu) { // Get the max bin size, used for selecting best GPU kernel - max_num_bin_ = 0; #if GPU_DEBUG >= 1 @@ -676,7 +672,6 @@ bool CUDATreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int r data_size_t num_data_in_left_child = GetGlobalDataCountInLeaf(left_leaf); data_size_t num_data_in_right_child = GetGlobalDataCountInLeaf(right_leaf); - // only have root if (right_leaf < 0) { smaller_leaf = -1; @@ -742,7 +737,6 @@ bool CUDATreeLearner::ConstructGPUHistogramsAsync( #pragma omp parallel for schedule(static, 1024) reduction(+:used_dense_feature_groups) if (num_dense_feature_groups_ >= 2048) for (int i = 0; i < num_dense_feature_groups_; ++i) { if (is_feature_group_used[dense_feature_group_map_[i]]) { - // feature_masks_[i] = 1; feature_masks_[i] = is_feature_group_used[dense_feature_group_map_[i]]; ++used_dense_feature_groups; } else { @@ -903,7 +897,6 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ if (larger_leaf_histogram_array_ != nullptr && !use_subtract) { // construct larger leaf - hist_t* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - kHistOffset; is_gpu_used = ConstructGPUHistogramsAsync(is_feature_used, diff --git a/src/treelearner/cuda_tree_learner.h b/src/treelearner/cuda_tree_learner.h index 7385d2af5dc..ca063765b19 100644 --- a/src/treelearner/cuda_tree_learner.h +++ b/src/treelearner/cuda_tree_learner.h @@ -175,8 +175,6 @@ class CUDATreeLearner: public SerialTreeLearner { /*! \brief True if bagging is used */ bool use_bagging_; - /*! \brief GPU device object */ - // int* dev_; /*! \brief GPU command queue object */ std::vector stream_; diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index 994a3c94170..5a7fe5245b6 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -1,15 +1,6 @@ /*! - * ibmGBT: IBM CUDA Accelerated LightGBM - * - * IBM Confidential - * Copyright (c) 2019 IBM Corporation. All rights reserved. - * - * The source code for this program is not published or otherwise - * divested of its trade secrets, irrespective of what has been - * deposited with the U.S. Copyright Office. - * - * US Government Users Restricted Rights - Use, duplication or - * disclosure restricted by GSA ADP Schedule Contract with IBM Corp. + * Copyright (c) 2020 IBM Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. */ #include "histogram_16_64_256.hu" diff --git a/src/treelearner/kernels/histogram_16_64_256.hu b/src/treelearner/kernels/histogram_16_64_256.hu index a1c2744c624..e228d3b0068 100644 --- a/src/treelearner/kernels/histogram_16_64_256.hu +++ b/src/treelearner/kernels/histogram_16_64_256.hu @@ -1,15 +1,6 @@ /*! - * ibmGBT: IBM CUDA Accelerated LightGBM - * - * IBM Confidential - * Copyright (c) 2019 IBM Corporation. All rights reserved. - * - * The source code for this program is not published or otherwise - * divested of its trade secrets, irrespective of what has been - * deposited with the U.S. Copyright Office. - * - * US Government Users Restricted Rights - Use, duplication or - * disclosure restricted by GSA ADP Schedule Contract with IBM Corp. + * Copyright (c) 2020 IBM Corporation. All rights reserved. + * Licensed under the MIT License. 
See LICENSE file in the project root for license information. */ #ifndef LIGHTGBM_TREELEARNER_KERNELS_HISTOGRAM_16_64_256_HU_ From bab89cfbde0ffcf13992874f79705d141efe30df Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Tue, 14 Jul 2020 17:30:34 +0000 Subject: [PATCH 092/119] Fix config variables. --- include/LightGBM/config.h | 1 + src/io/config.cpp | 5 +++++ src/io/config_auto.cpp | 4 +--- src/treelearner/cuda_tree_learner.h | 1 - 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 36219675da5..fff98c437f1 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -954,6 +954,7 @@ struct Config { // desc = set this to ``true`` to use double precision math on GPU (by default single precision is used) bool gpu_use_dp = false; + // check = >0 // desc = number of gpus (CUDA implementation only) // desc = default value is 1 int num_gpu = 1; diff --git a/src/io/config.cpp b/src/io/config.cpp index b354bf10f03..4c65d158800 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -333,6 +333,11 @@ void Config::CheckParamConflict() { force_row_wise = false; } + // force gpu_use_dp for CUDA + if (device_type == std::string("cuda")) { + gpu_use_dp = true; + } + // min_data_in_leaf must be at least 2 if path smoothing is active. This is because when the split is calculated // the count is calculated using the proportion of hessian in the leaf which is rounded up to nearest int, so it can // be 1 when there is actually no data in the leaf. In rare cases this can cause a bug because with path smoothing the diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index d0bb97e1942..b35e8da49ab 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -606,9 +606,6 @@ void Config::GetMembersFromString(const std::unordered_map preallocd_max_num_wg_; /*! \brief True if bagging is used */ From ea96902b8dccbca957c8b53a9cdead1a1c6e5ae0 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Fri, 17 Jul 2020 14:53:30 +0000 Subject: [PATCH 093/119] Attempt to fix check-docs failure --- include/LightGBM/config.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index fff98c437f1..c24e9613a95 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -956,7 +956,6 @@ struct Config { // check = >0 // desc = number of gpus (CUDA implementation only) - // desc = default value is 1 int num_gpu = 1; #pragma endregion From 12a9fe50c0156e37fee7a9bb6f23a6883b12649e Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Fri, 17 Jul 2020 15:39:03 +0000 Subject: [PATCH 094/119] Update Paramster.rst for num_gpu --- docs/Parameters.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 01362fb9af3..5bd392d82f8 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -1104,6 +1104,10 @@ GPU Parameters - set this to ``true`` to use double precision math on GPU (by default single precision is used) +- ``num_gpu`` :raw-html:`🔗︎`, default = ``1``, type = int, constraints: ``num_gpu > 0`` + + - number of gpus (CUDA implementation only) + .. 
end params list Others From d712538d59880502922a939c6cf9187134b8e018 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 20 Jul 2020 12:00:20 +0000 Subject: [PATCH 095/119] Removing test appveyor.yml --- appveyor.yml | 50 -------------------------------------------------- 1 file changed, 50 deletions(-) delete mode 100644 appveyor.yml diff --git a/appveyor.yml b/appveyor.yml deleted file mode 100644 index b4c0131a9af..00000000000 --- a/appveyor.yml +++ /dev/null @@ -1,50 +0,0 @@ -version: 2.3.2.{build} - -image: Visual Studio 2015 -platform: x64 -configuration: # a trick to construct a build matrix with multiple Python versions - - 3.7 - -# only build pull requests and -# commits to 'cuda' -branches: - only: - - cuda - -environment: - matrix: - - COMPILER: MSVC - TASK: python - - COMPILER: MINGW - TASK: python - -clone_depth: 5 - -init: - - ps: iex ((new-object net.webclient).DownloadString('https://raw.githubusercontent.com/appveyor/ci/master/scripts/enable-rdp.ps1')) - -install: - - git submodule update --init --recursive # get `compute` folder - - set PATH=%PATH:C:\Program Files\Git\usr\bin;=% # delete sh.exe from PATH (mingw32-make fix) - - set PATH=C:\mingw-w64\x86_64-8.1.0-posix-seh-rt_v6-rev0\mingw64\bin;%PATH% - - set PYTHON_VERSION=%CONFIGURATION% - - set CONDA_ENV="test-env" - - ps: >- - switch ($env:PYTHON_VERSION) { - "2.7" {$env:MINICONDA = "C:\Miniconda-x64"} - "3.5" {$env:MINICONDA = "C:\Miniconda35-x64"} - "3.6" {$env:MINICONDA = "C:\Miniconda36-x64"} - "3.7" {$env:MINICONDA = "C:\Miniconda37-x64"} - default {$env:MINICONDA = "C:\Miniconda37-x64"} - } - $env:PATH="$env:MINICONDA;$env:MINICONDA\Scripts;$env:PATH" - - ps: $env:LGB_VER = (Get-Content $env:APPVEYOR_BUILD_FOLDER\VERSION.txt).trim() - -build: false - -test_script: - - conda init powershell - - powershell.exe -ExecutionPolicy Bypass -File %APPVEYOR_BUILD_FOLDER%\.ci\test_windows.ps1 - -on_finish: - - ps: $blockRdp = $true; iex ((new-object net.webclient).DownloadString('https://raw.githubusercontent.com/appveyor/ci/master/scripts/enable-rdp.ps1')) From 26c4dce683c773abc8749b2978cbc3c9d1f03e3c Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 20 Jul 2020 22:10:36 +0000 Subject: [PATCH 096/119] =?UTF-8?q?Add=20=C2=83CUDA=5FRESOLVE=5FDEVICE=5FS?= =?UTF-8?q?YMBOLS=20to=20libraries=20to=20fix=20linking=20issue.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index aae29463f94..f5ca69b4c9c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -381,10 +381,12 @@ if(USE_GPU) endif(USE_GPU) if(USE_CUDA) + set_property(TARGET lightgbm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) TARGET_LINK_LIBRARIES( lightgbm ${histograms} ) + set_property(TARGET _lightgbm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) TARGET_LINK_LIBRARIES( _lightgbm ${histograms} From 70b4bbb9c561077ac3f0fb6d6a09ac22350ff0c3 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Tue, 21 Jul 2020 16:29:44 +0000 Subject: [PATCH 097/119] Fixed handling of data elements less than 2K. 
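
With the < 2048-row CPU fallback gone, small datasets take the same CUDA path as large ones and rely on the existing per-leaf workgroup sizing to keep launches valid. A rough sketch of that sizing rule, following the comments in GetNumWorkgroupsPerFeature(); the 256 and ~2K constants come from those comments, while the helper signature and the exact clamping below are illustrative assumptions, not the real implementation:

    #include <algorithm>
    #include <cmath>

    // log2 of the number of workgroups assigned to one dense feature group
    // (assumes num_dense_feature_groups > 0, which the caller guarantees)
    int ExpWorkgroupsPerFeature(int num_dense_feature_groups, int leaf_num_data) {
      // aim for roughly 256 workgroups per device, spread over the feature groups
      int exp_wg = static_cast<int>(std::ceil(std::log2(256.0 / num_dense_feature_groups)));
      // but keep roughly 2K rows per workgroup, so small leaves use fewer workgroups
      exp_wg = std::min(exp_wg, static_cast<int>(std::ceil(std::log2(leaf_num_data / 2048.0))));
      return std::max(exp_wg, 0);  // tiny leaves collapse to one workgroup per feature
    }
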
--- src/application/application.cpp | 3 --- src/c_api.cpp | 10 ---------- src/treelearner/cuda_tree_learner.cpp | 10 +++++----- 3 files changed, 5 insertions(+), 18 deletions(-) diff --git a/src/application/application.cpp b/src/application/application.cpp index e88f5c86188..2d3facdb978 100644 --- a/src/application/application.cpp +++ b/src/application/application.cpp @@ -43,10 +43,7 @@ Application::Application(int argc, char** argv) { #ifdef USE_CUDA if (config_.device_type == std::string("cuda")) { LightGBM::LGBM_config_::current_device = lgbm_device_cuda; - config_.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ - if (config_.bagging_fraction == 1.0) { config_.bagging_fraction = 0.8; } - if (config_.bagging_freq == 0) { config_.bagging_freq = 1; } } #endif } diff --git a/src/c_api.cpp b/src/c_api.cpp index 050e605b268..caa9ed577fd 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -44,10 +44,7 @@ inline void AdditionalConfig(Config *config) { #ifdef USE_CUDA if (config->device_type == std::string("cuda")) { LightGBM::LGBM_config_::current_device = lgbm_device_cuda; - config->is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ - if (config->bagging_fraction == 1.0) { config->bagging_fraction = 0.8; } - if (config->bagging_freq == 0) { config->bagging_freq = 1; } } #else (void)(config); // UNUSED @@ -126,13 +123,6 @@ class Booster { omp_set_num_threads(config_.num_threads); } -#ifdef USE_CUDA - // Only use CUDA when the data is large (2048 == 256 bins each with at least 8 elements) - if (train_data->num_data() < 2048) { - config_.device_type = std::string("cpu"); - } -#endif - AdditionalConfig(&config_); // create boosting diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 7a0a5d59093..9bc140f1021 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -104,7 +104,7 @@ int CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int d printf("Comparing Histograms, feature_id = %d, size = %d\n", feature_id, static_cast(size)); if (dp_flag) { // double precision double af, bf; - int64 ai, bi; + int64_t ai, bi; for (i = 0; i < static_cast(size); ++i) { af = GET_GRAD(h1, i); bf = GET_GRAD(h2, i); @@ -113,10 +113,10 @@ int CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int d ++retval; } if (const_flag) { - ai = GET_HESS((reinterpret_cast(h1), i); - bi = GET_HESS((reinterpret_cast(h2), i); + ai = GET_HESS((reinterpret_cast(h1)), i); + bi = GET_HESS((reinterpret_cast(h2)), i); if (ai != bi) { - printf("i = %5d, h1.hess %lld, h2.hess %lld\n", i, ai, bi); + printf("i = %5d, h1.hess %lld, h2.hess %lld\n", i, (long long int) ai, (long long int) bi); ++retval; } } else { @@ -743,7 +743,7 @@ bool CUDATreeLearner::ConstructGPUHistogramsAsync( feature_masks_[i] = 0; } } - bool use_all_features = used_dense_feature_groups == num_dense_feature_groups_; + bool use_all_features = ((used_dense_feature_groups == num_dense_feature_groups_) && (data_indices != nullptr)); // if no feature group is used, just return and do not use GPU if (used_dense_feature_groups == 0) { return false; From e7f45f5ce0582454a2ed58a0dfcc35dc652a97fd Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Tue, 21 Jul 2020 17:15:27 +0000 Subject: [PATCH 098/119] More reviewers comments cleanup. 
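
The deleted comment blocks (and the leftover membar.gl inline asm) date from the OpenCL version; in CUDA the cross-workgroup hand-off they describe is simply __threadfence() plus a global atomic counter, which the kernels already use. For reference, a self-contained sketch of that last-block-reduces idiom; the kernel, buffer and counter names below are invented for the example and do not appear in this patch:

    __global__ void partial_then_final(const float* partials, float* out,
                                       unsigned int* counter /* zero-initialized */) {
      // ...assume each block has already written its partial result to partials[blockIdx.x]...
      __threadfence();                  // make this block's global writes visible to other blocks
      __shared__ bool am_last;
      if (threadIdx.x == 0) {
        // atomicInc returns the previous value, so the block that sees gridDim.x - 1
        // is the last one to reach this point
        am_last = (atomicInc(counter, gridDim.x) == gridDim.x - 1);
      }
      __syncthreads();
      if (am_last && threadIdx.x == 0) {
        float sum = 0.0f;
        for (unsigned int b = 0; b < gridDim.x; ++b) sum += partials[b];
        *out = sum;                     // only the last block performs the final reduction
        *counter = 0;                   // reset so the kernel can be launched again
      }
    }
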
--- .../kernels/histogram_16_64_256.cu | 39 ------------------- 1 file changed, 39 deletions(-) diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index 5a7fe5245b6..e6fceeb8cd4 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -294,19 +294,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, } __syncthreads(); __threadfence(); - // To avoid the cost of an extra reducting kernel, we have to deal with some - // gray area in OpenCL. We want the last work group that process this feature to - // make the final reduction, and other threads will just quit. - // This requires that the results written by other workgroups available to the - // last workgroup (memory consistency) - // this is equavalent to CUDA __threadfence(); - // ensure the writes above goes to main memory and other workgroups can see it - asm volatile("{\n\tmembar.gl;\n\t}\n\t" :::"memory"); - // Now, we want one workgroup to do the final reduction. - // Other workgroups processing the same feature quit. - // The is done by using an global atomic counter. - // On AMD GPUs ideally this should be done in GDS, - // but currently there is no easy way to access it via OpenCL. uint * counter_val = cnt_hist; // backup the old value uint old_val = *counter_val; @@ -624,19 +611,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, } __syncthreads(); __threadfence(); - // To avoid the cost of an extra reducting kernel, we have to deal with some - // gray area in OpenCL. We want the last work group that process this feature to - // make the final reduction, and other threads will just quit. - // This requires that the results written by other workgroups available to the - // last workgroup (memory consistency) - // this is equavalent to CUDA __threadfence(); - // ensure the writes above goes to main memory and other workgroups can see it - asm volatile("{\n\tmembar.gl;\n\t}\n\t" :::"memory"); - // Now, we want one workgroup to do the final reduction. - // Other workgroups processing the same feature quit. - // The is done by using an global atomic counter. - // On AMD GPUs ideally this should be done in GDS, - // but currently there is no easy way to access it via OpenCL. uint * counter_val = cnt_hist; // backup the old value uint old_val = *counter_val; @@ -955,19 +929,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, } __syncthreads(); __threadfence(); - // To avoid the cost of an extra reducting kernel, we have to deal with some - // gray area in OpenCL. We want the last work group that process this feature to - // make the final reduction, and other threads will just quit. - // This requires that the results written by other workgroups available to the - // last workgroup (memory consistency) - // this is equavalent to CUDA __threadfence(); - // ensure the writes above goes to main memory and other workgroups can see it - asm volatile("{\n\tmembar.gl;\n\t}\n\t" :::"memory"); - // Now, we want one workgroup to do the final reduction. - // Other workgroups processing the same feature quit. - // The is done by using an global atomic counter. - // On AMD GPUs ideally this should be done in GDS, - // but currently there is no easy way to access it via OpenCL. 
uint * counter_val = cnt_hist; // backup the old value uint old_val = *counter_val; From 282731cbc7497da848ba3cc70197a08c482abbf8 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Wed, 22 Jul 2020 12:47:20 +0000 Subject: [PATCH 099/119] Removal of TODO and fix printing of int64_t --- src/treelearner/cuda_tree_learner.cpp | 3 ++- src/treelearner/kernels/histogram_16_64_256.cu | 3 --- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 9bc140f1021..f4387f7c6a8 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -15,6 +15,7 @@ #include #include +#include #include "../io/dense_bin.hpp" @@ -116,7 +117,7 @@ int CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int d ai = GET_HESS((reinterpret_cast(h1)), i); bi = GET_HESS((reinterpret_cast(h2)), i); if (ai != bi) { - printf("i = %5d, h1.hess %lld, h2.hess %lld\n", i, (long long int) ai, (long long int) bi); + printf("i = %5d, h1.hess %" PRId64 ", h2.hess %" PRId64 "\n", i, ai, bi); ++retval; } } else { diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index e6fceeb8cd4..09d563cbaf4 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -41,7 +41,6 @@ inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ acc_type* __restrict__ local_hist, const size_t power_feature_workgroups) { const ushort ltid = threadIdx.x; - // TODO(anyone): try to avoid bank conflict here acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; uint* __restrict__ local_cnt = reinterpret_cast(local_hist + 2 * NUM_BINS); @@ -360,7 +359,6 @@ inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ acc_type* __restrict__ local_hist, const size_t power_feature_workgroups) { const ushort ltid = threadIdx.x; - // TODO(anyone): try to avoid bank conflict here acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; uint* __restrict__ local_cnt = reinterpret_cast(local_hist + 2 * NUM_BINS); @@ -677,7 +675,6 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ acc_type* __restrict__ local_hist, const size_t power_feature_workgroups) { const ushort ltid = threadIdx.x; - // TODO(anyone): try to avoid bank conflict here acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; uint* __restrict__ local_cnt = reinterpret_cast(local_hist + 2 * NUM_BINS); From 6103a87817cb4e671a22feb6c8eb315ae8a2df76 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Wed, 22 Jul 2020 16:07:46 +0000 Subject: [PATCH 100/119] Add cuda change for CI testing and remove cuda from device_type in python. 
--- .ci/test.sh | 10 +++ include/LightGBM/c_api.h | 6 -- python-package/lightgbm/__init__.py | 3 +- python-package/lightgbm/basic.py | 5 -- src/c_api.cpp | 10 --- tests/python_package_test/test_basic.py | 2 - tests/python_package_test/test_consistency.py | 4 - tests/python_package_test/test_engine.py | 85 ------------------- tests/python_package_test/test_plotting.py | 2 - tests/python_package_test/test_sklearn.py | 6 -- 10 files changed, 11 insertions(+), 122 deletions(-) diff --git a/.ci/test.sh b/.ci/test.sh index 7c68ca733fe..c12e019ccb7 100755 --- a/.ci/test.sh +++ b/.ci/test.sh @@ -132,6 +132,16 @@ if [[ $TASK == "gpu" ]]; then exit 0 fi cmake -DUSE_GPU=ON -DOpenCL_INCLUDE_DIR=$AMDAPPSDK_PATH/include/ .. +elif [[ $TASK == "cuda" ]]; then + sed -i'.bak' 's/std::string device_type = "cpu";/std::string device_type = "cuda";/' $BUILD_DIRECTORY/include/LightGBM/config.h + grep -q 'std::string device_type = "cuda"' $BUILD_DIRECTORY/include/LightGBM/config.h || exit -1 # make sure that changes were really done + if [[ $METHOD == "pip" ]]; then + cd $BUILD_DIRECTORY/python-package && python setup.py sdist || exit -1 + pip install --user $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER.tar.gz -v --install-option=--cuda || exit -1 + pytest $BUILD_DIRECTORY/tests/python_package_test || exit -1 + exit 0 + fi + cmake -DUSE_CUDA=ON .. elif [[ $TASK == "mpi" ]]; then if [[ $METHOD == "pip" ]]; then cd $BUILD_DIRECTORY/python-package && python setup.py sdist || exit -1 diff --git a/include/LightGBM/c_api.h b/include/LightGBM/c_api.h index 3fbccdac075..9d7c6e61dd2 100644 --- a/include/LightGBM/c_api.h +++ b/include/LightGBM/c_api.h @@ -1076,12 +1076,6 @@ LIGHTGBM_C_EXPORT int LGBM_NetworkInitWithFunctions(int num_machines, #define THREAD_LOCAL thread_local /*!< \brief Thread local specifier. */ #endif -/*! - * * \brief Returns device type. - * * \return 0 = CPU, 1 = GPU / OCL, 2 = CUDA - * */ -LIGHTGBM_C_EXPORT int LGBM_GetDeviceType(); - /*! * \brief Handle of error message. * \return Error message diff --git a/python-package/lightgbm/__init__.py b/python-package/lightgbm/__init__.py index 44a56ae03f5..390a6994a7a 100644 --- a/python-package/lightgbm/__init__.py +++ b/python-package/lightgbm/__init__.py @@ -5,7 +5,7 @@ """ from __future__ import absolute_import -from .basic import Booster, Dataset, get_device_type +from .basic import Booster, Dataset from .callback import (early_stopping, print_evaluation, record_evaluation, reset_parameter) from .engine import cv, train @@ -30,7 +30,6 @@ __version__ = version_file.read().strip() __all__ = ['Dataset', 'Booster', - 'get_device_type', 'train', 'cv', 'LGBMModel', 'LGBMRegressor', 'LGBMClassifier', 'LGBMRanker', 'print_evaluation', 'record_evaluation', 'reset_parameter', 'early_stopping', diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 9dace6b768c..01a5f31e51b 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -432,11 +432,6 @@ def _load_pandas_categorical(file_name=None, model_str=None): return None -def get_device_type(): - """Get device type.""" - return _LIB.LGBM_GetDeviceType() - - class _InnerPredictor(object): """_InnerPredictor of LightGBM. 
diff --git a/src/c_api.cpp b/src/c_api.cpp index caa9ed577fd..84d0a25ab08 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -627,16 +627,6 @@ const char* LGBM_GetLastError() { return LastErrorMsg(); } -int LGBM_GetDeviceType() { -#ifdef USE_GPU - return 1; -#elif USE_CUDA - return 2; -#else - return 0; // CPU -#endif -} - int LGBM_RegisterLogCallback(void (*callback)(const char*)) { API_BEGIN(); Log::ResetCallBack(callback); diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index d984c25f65f..85e9e728d70 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -29,8 +29,6 @@ def test(self): "max_bin": 255, "gpu_use_dp": True } - if lgb.get_device_type() == 2: - params["device"] = "cuda" bst = lgb.Booster(params, train_data) bst.add_valid(valid_data, "valid_1") diff --git a/tests/python_package_test/test_consistency.py b/tests/python_package_test/test_consistency.py index f6e955ee48d..63a5834cf61 100644 --- a/tests/python_package_test/test_consistency.py +++ b/tests/python_package_test/test_consistency.py @@ -68,8 +68,6 @@ class TestEngine(unittest.TestCase): def test_binary(self): fd = FileLoader('../../examples/binary_classification', 'binary') - if lgb.get_device_type() == 2: - fd.params["device"] = "cuda" X_train, y_train, _ = fd.load_dataset('.train') X_test, _, X_test_fn = fd.load_dataset('.test') weight_train = fd.load_field('.train.weight') @@ -93,8 +91,6 @@ def test_multiclass(self): def test_regression(self): fd = FileLoader('../../examples/regression', 'regression') - if lgb.get_device_type() == 2: - fd.params["device"] = "cuda" X_train, y_train, _ = fd.load_dataset('.train') X_test, _, X_test_fn = fd.load_dataset('.test') init_score_train = fd.load_field('.train.init') diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index b5de6a9a4c7..286bd2e2a8d 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -61,8 +61,6 @@ def test_binary(self): 'verbose': -1, 'num_iteration': 50 # test num_iteration in dict here } - if lgb.get_device_type() == 2: - params["device"] = "cuda" lgb_train = lgb.Dataset(X_train, y_train) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) evals_result = {} @@ -89,8 +87,6 @@ def test_rf(self): 'metric': 'binary_logloss', 'verbose': -1 } - if lgb.get_device_type() == 2: - params["device"] = "cuda" lgb_train = lgb.Dataset(X_train, y_train) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) evals_result = {} @@ -110,8 +106,6 @@ def test_regression(self): 'metric': 'l2', 'verbose': -1 } - if lgb.get_device_type() == 2: - params["device"] = "cuda" lgb_train = lgb.Dataset(X_train, y_train) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) evals_result = {} @@ -139,8 +133,6 @@ def test_missing_value_handle(self): 'verbose': -1, 'boost_from_average': False } - if lgb.get_device_type() == 2: - params["device"] = "cuda" evals_result = {} gbm = lgb.train(params, lgb_train, num_boost_round=20, @@ -196,8 +188,6 @@ def test_missing_value_handle_na(self): 'min_data_in_bin': 1, 'zero_as_missing': False } - if lgb.get_device_type() == 2: - params["device"] = "cuda" evals_result = {} gbm = lgb.train(params, lgb_train, num_boost_round=1, @@ -230,8 +220,6 @@ def test_missing_value_handle_zero(self): 'min_data_in_bin': 1, 'zero_as_missing': True } - if lgb.get_device_type() == 2: - params["device"] = "cuda" evals_result = {} gbm = lgb.train(params, lgb_train, 
num_boost_round=1, @@ -264,8 +252,6 @@ def test_missing_value_handle_none(self): 'min_data_in_bin': 1, 'use_missing': False } - if lgb.get_device_type() == 2: - params["device"] = "cuda" evals_result = {} gbm = lgb.train(params, lgb_train, num_boost_round=1, @@ -304,8 +290,6 @@ def test_categorical_handle(self): 'zero_as_missing': True, 'categorical_column': 0 } - if lgb.get_device_type() == 2: - params["device"] = "cuda" evals_result = {} gbm = lgb.train(params, lgb_train, num_boost_round=1, @@ -343,8 +327,6 @@ def test_categorical_handle_na(self): 'zero_as_missing': False, 'categorical_column': 0 } - if lgb.get_device_type() == 2: - params["device"] = "cuda" evals_result = {} gbm = lgb.train(params, lgb_train, num_boost_round=1, @@ -403,8 +385,6 @@ def test_multiclass(self): 'num_class': 10, 'verbose': -1 } - if lgb.get_device_type() == 2: - params["device"] = "cuda" lgb_train = lgb.Dataset(X_train, y_train, params=params) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, params=params) evals_result = {} @@ -421,7 +401,6 @@ def test_multiclass_rf(self): X, y = load_digits(10, True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) params = { - 'device': 'cpu', 'boosting_type': 'rf', 'objective': 'multiclass', 'metric': 'multi_logloss', @@ -455,8 +434,6 @@ def test_multiclass_prediction_early_stopping(self): 'num_class': 10, 'verbose': -1 } - if lgb.get_device_type() == 2: - params["device"] = "cuda" lgb_train = lgb.Dataset(X_train, y_train, params=params) gbm = lgb.train(params, lgb_train, num_boost_round=50) @@ -478,8 +455,6 @@ def test_multi_class_error(self): X, y = load_digits(10, True) params = {'objective': 'multiclass', 'num_classes': 10, 'metric': 'multi_error', 'num_leaves': 4, 'verbose': -1} - if lgb.get_device_type() == 2: - params["device"] = "cuda" lgb_data = lgb.Dataset(X, label=y) est = lgb.train(params, lgb_data, num_boost_round=10) predict_default = est.predict(X) @@ -589,8 +564,6 @@ def test_early_stopping(self): 'metric': 'binary_logloss', 'verbose': -1 } - if lgb.get_device_type() == 2: - params["device"] = "cuda" X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) lgb_train = lgb.Dataset(X_train, y_train) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) @@ -624,8 +597,6 @@ def test_continue_train(self): 'metric': 'l1', 'verbose': -1 } - if lgb.get_device_type() == 2: - params["device"] = "cuda" lgb_train = lgb.Dataset(X_train, y_train, free_raw_data=False) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, free_raw_data=False) init_gbm = lgb.train(params, lgb_train, num_boost_round=20) @@ -691,8 +662,6 @@ def test_continue_train_multiclass(self): 'num_class': 3, 'verbose': -1 } - if lgb.get_device_type() == 2: - params["device"] = "cuda" lgb_train = lgb.Dataset(X_train, y_train, params=params, free_raw_data=False) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, params=params, free_raw_data=False) init_gbm = lgb.train(params, lgb_train, num_boost_round=20) @@ -749,8 +718,6 @@ def test_cv(self): q_train = np.loadtxt(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/lambdarank/rank.train.query')) params_lambdarank = {'objective': 'lambdarank', 'verbose': -1, 'eval_at': 3} - if lgb.get_device_type() == 2: - params_lambdarank["device"] = "cuda" lgb_train = lgb.Dataset(X_train, y_train, group=q_train) # ... 
with l2 metric cv_res_lambda = lgb.cv(params_lambdarank, lgb_train, num_boost_round=10, nfold=3, @@ -804,8 +771,6 @@ def train_and_predict(init_model=None, return_model=False): 'metric': 'l2', 'verbose': -1 } - if lgb.get_device_type() == 2: - params["device"] = "cuda" lgb_train = lgb.Dataset(X_train, y_train) gbm_template = lgb.train(params, lgb_train, num_boost_round=10, init_model=init_model) return gbm_template if return_model else mean_squared_error(y_test, gbm_template.predict(X_test)) @@ -859,8 +824,6 @@ def test_pandas_categorical(self): 'metric': 'binary_logloss', 'verbose': -1 } - if lgb.get_device_type() == 2: - params["device"] = "cuda" lgb_train = lgb.Dataset(X, y) gbm0 = lgb.train(params, lgb_train, num_boost_round=10) pred0 = gbm0.predict(X_test) @@ -957,8 +920,6 @@ def test_reference_chain(self): tmp_dat_train = tmp_dat.subset(np.arange(80)) tmp_dat_val = tmp_dat.subset(np.arange(80, 100)).subset(np.arange(18)) params = {'objective': 'regression_l2', 'metric': 'rmse'} - if lgb.get_device_type() == 2: - params["device"] = "cuda" evals_result = {} gbm = lgb.train(params, tmp_dat_train, num_boost_round=20, valid_sets=[tmp_dat_train, tmp_dat_val], @@ -974,8 +935,6 @@ def test_contribs(self): 'metric': 'binary_logloss', 'verbose': -1, } - if lgb.get_device_type() == 2: - params["device"] = "cuda" lgb_train = lgb.Dataset(X_train, y_train) gbm = lgb.train(params, lgb_train, num_boost_round=20) @@ -990,8 +949,6 @@ def train_and_get_predictions(features, labels): 'verbose': -1, 'min_data': 5, } - if lgb.get_device_type() == 2: - lgb_params["device"] = "cuda" gbm = lgb.train( params=lgb_params, train_set=dataset, @@ -1282,8 +1239,6 @@ def test_mape_rf(self): 'feature_fraction': 0.8, 'boost_from_average': True } - if lgb.get_device_type() == 2: - params["device"] = "cuda" lgb_train = lgb.Dataset(X, y) gbm = lgb.train(params, lgb_train, num_boost_round=20) pred = gbm.predict(X) @@ -1301,8 +1256,6 @@ def test_mape_dart(self): 'feature_fraction': 0.8, 'boost_from_average': False } - if lgb.get_device_type() == 2: - params["device"] = "cuda" lgb_train = lgb.Dataset(X, y) gbm = lgb.train(params, lgb_train, num_boost_round=40) pred = gbm.predict(X) @@ -1322,8 +1275,6 @@ def check_constant_features(self, y_true, expected_pred, more_params): 'min_data_in_bin': 1, 'boost_from_average': True } - if lgb.get_device_type() == 2: - params["device"] = "cuda" params.update(more_params) lgb_train = lgb.Dataset(X_train, y_train, params=params) gbm = lgb.train(params, lgb_train, num_boost_round=2) @@ -1334,8 +1285,6 @@ def test_constant_features_regression(self): params = { 'objective': 'regression' } - if lgb.get_device_type() == 2: - params["device"] = "cuda" self.check_constant_features([0.0, 10.0, 0.0, 10.0], 5.0, params) self.check_constant_features([0.0, 1.0, 2.0, 3.0], 1.5, params) self.check_constant_features([-1.0, 1.0, -2.0, 2.0], 0.0, params) @@ -1344,8 +1293,6 @@ def test_constant_features_binary(self): params = { 'objective': 'binary' } - if lgb.get_device_type() == 2: - params["device"] = "cuda" self.check_constant_features([0.0, 10.0, 0.0, 10.0], 0.5, params) self.check_constant_features([0.0, 1.0, 2.0, 3.0], 0.75, params) @@ -1354,8 +1301,6 @@ def test_constant_features_multiclass(self): 'objective': 'multiclass', 'num_class': 3 } - if lgb.get_device_type() == 2: - params["device"] = "cuda" self.check_constant_features([0.0, 1.0, 2.0, 0.0], [0.5, 0.25, 0.25], params) self.check_constant_features([0.0, 1.0, 2.0, 1.0], [0.25, 0.5, 0.25], params) @@ -1364,8 +1309,6 @@ def 
test_constant_features_multiclassova(self): 'objective': 'multiclassova', 'num_class': 3 } - if lgb.get_device_type() == 2: - params["device"] = "cuda" self.check_constant_features([0.0, 1.0, 2.0, 0.0], [0.5, 0.25, 0.25], params) self.check_constant_features([0.0, 1.0, 2.0, 1.0], [0.25, 0.5, 0.25], params) @@ -1385,8 +1328,6 @@ def preprocess_data(dtrain, dtest, params): X, y = load_iris(True) dataset = lgb.Dataset(X, y, free_raw_data=False) params = {'objective': 'multiclass', 'num_class': 3, 'verbose': -1} - if lgb.get_device_type() == 2: - params["device"] = "cuda" results = lgb.cv(params, dataset, num_boost_round=10, fpreproc=preprocess_data) self.assertIn('multi_logloss-mean', results) self.assertEqual(len(results['multi_logloss-mean']), 10) @@ -1399,28 +1340,14 @@ def test_metrics(self): evals_result = {} params_verbose = {'verbose': -1} - if lgb.get_device_type() == 2: - params_verbose["device"] = "cuda" params_obj_verbose = {'objective': 'binary', 'verbose': -1} - if lgb.get_device_type() == 2: - params_obj_verbose["device"] = "cuda" params_obj_metric_log_verbose = {'objective': 'binary', 'metric': 'binary_logloss', 'verbose': -1} - if lgb.get_device_type() == 2: - params_obj_metric_log_verbose["device"] = "cuda" params_obj_metric_err_verbose = {'objective': 'binary', 'metric': 'binary_error', 'verbose': -1} - if lgb.get_device_type() == 2: - params_obj_metric_err_verbose["device"] = "cuda" params_obj_metric_inv_verbose = {'objective': 'binary', 'metric': 'invalid_metric', 'verbose': -1} - if lgb.get_device_type() == 2: - params_obj_metric_inv_verbose["device"] = "cuda" params_obj_metric_multi_verbose = {'objective': 'binary', 'metric': ['binary_logloss', 'binary_error'], 'verbose': -1} - if lgb.get_device_type() == 2: - params_obj_metric_multi_verbose["device"] = "cuda" params_obj_metric_none_verbose = {'objective': 'binary', 'metric': 'None', 'verbose': -1} - if lgb.get_device_type() == 2: - params_obj_metric_none_verbose["device"] = "cuda" params_metric_log_verbose = {'metric': 'binary_logloss', 'verbose': -1} params_metric_err_verbose = {'metric': 'binary_error', 'verbose': -1} params_metric_inv_verbose = {'metric_types': 'invalid_metric', 'verbose': -1} @@ -1638,8 +1565,6 @@ def train_booster(params=params_obj_verbose, **kwargs): # remove default metric by 'None' aliases for na_alias in ('None', 'na', 'null', 'custom'): params = {'objective': 'binary', 'metric': na_alias, 'verbose': -1} - if lgb.get_device_type() == 2: - params["device"] = "cuda" train_booster(params=params) self.assertEqual(len(evals_result), 0) @@ -1720,14 +1645,8 @@ def train_booster(params=params_obj_verbose, **kwargs): obj_multi_aliases = ['multiclass', 'softmax', 'multiclassova', 'multiclass_ova', 'ova', 'ovr'] for obj_multi_alias in obj_multi_aliases: params_obj_class_3_verbose = {'objective': obj_multi_alias, 'num_class': 3, 'verbose': -1} - if lgb.get_device_type() == 2: - params_obj_class_3_verbose["device"] = "cuda" params_obj_class_1_verbose = {'objective': obj_multi_alias, 'num_class': 1, 'verbose': -1} - if lgb.get_device_type() == 2: - params_obj_class_1_verbose["device"] = "cuda" params_obj_verbose = {'objective': obj_multi_alias, 'verbose': -1} - if lgb.get_device_type() == 2: - params_obj_verbose["device"] = "cuda" # multiclass default metric res = get_cv_result(params_obj_class_3_verbose) self.assertEqual(len(res), 2) @@ -1768,8 +1687,6 @@ def train_booster(params=params_obj_verbose, **kwargs): self.assertRaises(lgb.basic.LightGBMError, get_cv_result, params_obj_class_3_verbose, 
metrics='binary_logloss') params_class_3_verbose = {'num_class': 3, 'verbose': -1} - if lgb.get_device_type() == 2: - params_class_3_verbose["device"] = "cuda" # non-default num_class for default objective self.assertRaises(lgb.basic.LightGBMError, get_cv_result, params_class_3_verbose) @@ -1904,8 +1821,6 @@ def metrics_combination_train_regression(valid_sets, metric_list, assumed_iterat 'verbose': -1, 'seed': 123 } - if lgb.get_device_type() == 2: - params["device"] = "cuda" gbm = lgb.train(dict(params, first_metric_only=first_metric_only), lgb_train, num_boost_round=25, valid_sets=valid_sets, feval=feval, early_stopping_rounds=5, verbose_eval=False) diff --git a/tests/python_package_test/test_plotting.py b/tests/python_package_test/test_plotting.py index 13ba9859d97..72915914fe1 100644 --- a/tests/python_package_test/test_plotting.py +++ b/tests/python_package_test/test_plotting.py @@ -24,8 +24,6 @@ def setUp(self): "verbose": -1, "num_leaves": 3 } - if lgb.get_device_type() == 2: - self.params["device"] = "cuda" @unittest.skipIf(not MATPLOTLIB_INSTALLED, 'matplotlib is not installed') def test_plot_importance(self): diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index 350f3c8f486..cd50805a70b 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -453,8 +453,6 @@ def test_evaluate_train_set(self): def test_metrics(self): X, y = load_boston(True) params = {'n_estimators': 2, 'verbose': -1} - if lgb.get_device_type() == 2: - params["device"] = "cuda" params_fit = {'X': X, 'y': y, 'eval_set': (X, y), 'verbose': False} # no custom objective, no custom metric @@ -711,8 +709,6 @@ def test_inf_handle(self): y = np.random.randn(nrows) + np.full(nrows, 1e30) weight = np.full(nrows, 1e10) params = {'n_estimators': 20, 'verbose': -1} - if lgb.get_device_type() == 2: - params["device"] = "cuda" params_fit = {'X': X, 'y': y, 'sample_weight': weight, 'eval_set': (X, y), 'verbose': False, 'early_stopping_rounds': 5} gbm = lgb.LGBMRegressor(**params).fit(**params_fit) @@ -725,8 +721,6 @@ def test_nan_handle(self): y = np.random.randn(nrows) + np.full(nrows, 1e30) weight = np.zeros(nrows) params = {'n_estimators': 20, 'verbose': -1} - if lgb.get_device_type() == 2: - params["device"] = "cuda" params_fit = {'X': X, 'y': y, 'sample_weight': weight, 'eval_set': (X, y), 'verbose': False, 'early_stopping_rounds': 5} gbm = lgb.LGBMRegressor(**params).fit(**params_fit) From 40e37e872d7c953081eea4aac51834fd0c2ed0a3 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Wed, 22 Jul 2020 16:15:06 +0000 Subject: [PATCH 101/119] Missed one change form previous check-in --- tests/python_package_test/test_engine.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 286bd2e2a8d..dc48fc9d3a3 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1215,7 +1215,6 @@ def test_refit(self): X, y = load_breast_cancer(True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) params = { - 'device': 'cpu', 'objective': 'binary', 'metric': 'binary_logloss', 'verbose': -1, From 8878ea47f875d7b7f7b83a732f07bb9c3a08b1e8 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 27 Jul 2020 11:44:12 +0000 Subject: [PATCH 102/119] Removal AdditionConfig and fix settings. 
--- src/application/application.cpp | 1 - src/c_api.cpp | 44 --------------------------------- src/io/config.cpp | 7 ++++++ src/io/dataset.cpp | 4 ++- 4 files changed, 10 insertions(+), 46 deletions(-) diff --git a/src/application/application.cpp b/src/application/application.cpp index 2d3facdb978..4d1e8b1866b 100644 --- a/src/application/application.cpp +++ b/src/application/application.cpp @@ -43,7 +43,6 @@ Application::Application(int argc, char** argv) { #ifdef USE_CUDA if (config_.device_type == std::string("cuda")) { LightGBM::LGBM_config_::current_device = lgbm_device_cuda; - config_.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ } #endif } diff --git a/src/c_api.cpp b/src/c_api.cpp index 84d0a25ab08..7a36737352a 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include @@ -40,17 +39,6 @@ inline int LGBM_APIHandleException(const std::string& ex) { return -1; } -inline void AdditionalConfig(Config *config) { -#ifdef USE_CUDA - if (config->device_type == std::string("cuda")) { - LightGBM::LGBM_config_::current_device = lgbm_device_cuda; - config->is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ - } -#else - (void)(config); // UNUSED -#endif -} - #define API_BEGIN() try { #define API_END() } \ catch(std::exception& ex) { return LGBM_APIHandleException(ex); } \ @@ -123,8 +111,6 @@ class Booster { omp_set_num_threads(config_.num_threads); } - AdditionalConfig(&config_); - // create boosting if (config_.input_model.size() > 0) { Log::Warning("Continued train from model is not supported for c_api,\n" @@ -318,8 +304,6 @@ class Booster { omp_set_num_threads(config_.num_threads); } - AdditionalConfig(&config_); - if (param.count("objective")) { // create objective function objective_fun_.reset(ObjectiveFunction::CreateObjectiveFunction(config_.objective, @@ -645,8 +629,6 @@ int LGBM_DatasetCreateFromFile(const char* filename, omp_set_num_threads(config.num_threads); } - AdditionalConfig(&config); - DatasetLoader loader(config, nullptr, 1, filename); if (reference == nullptr) { if (Network::num_machines() == 1) { @@ -678,8 +660,6 @@ int LGBM_DatasetCreateFromSampledColumn(double** sample_data, omp_set_num_threads(config.num_threads); } - AdditionalConfig(&config); - DatasetLoader loader(config, nullptr, 1, nullptr); *out = loader.CostructFromSampleData(sample_data, sample_indices, ncol, num_per_col, num_sample_row, @@ -792,8 +772,6 @@ int LGBM_DatasetCreateFromMats(int32_t nmat, omp_set_num_threads(config.num_threads); } - AdditionalConfig(&config); - std::unique_ptr ret; int32_t total_nrow = 0; for (int j = 0; j < nmat; ++j) { @@ -886,8 +864,6 @@ int LGBM_DatasetCreateFromCSR(const void* indptr, omp_set_num_threads(config.num_threads); } - AdditionalConfig(&config); - std::unique_ptr ret; auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, nelem); int32_t nrow = static_cast(nindptr - 1); @@ -956,8 +932,6 @@ int LGBM_DatasetCreateFromCSRFunc(void* get_row_funptr, omp_set_num_threads(config.num_threads); } - AdditionalConfig(&config); - std::unique_ptr ret; int32_t nrow = num_rows; if (reference == nullptr) { @@ -1030,8 +1004,6 @@ int LGBM_DatasetCreateFromCSC(const void* col_ptr, omp_set_num_threads(config.num_threads); } - AdditionalConfig(&config); - std::unique_ptr ret; int32_t nrow = static_cast(num_row); if (reference == nullptr) { @@ -1116,8 +1088,6 @@ int LGBM_DatasetGetSubset( 
omp_set_num_threads(config.num_threads); } - AdditionalConfig(&config); - auto full_dataset = reinterpret_cast(handle); CHECK_GT(num_used_row_indices, 0); const int32_t lower = 0; @@ -1514,8 +1484,6 @@ int LGBM_BoosterPredictForFile(BoosterHandle handle, omp_set_num_threads(config.num_threads); } - AdditionalConfig(&config); - Booster* ref_booster = reinterpret_cast(handle); ref_booster->Predict(num_iteration, predict_type, data_filename, data_has_header, config, result_filename); @@ -1561,8 +1529,6 @@ int LGBM_BoosterPredictForCSR(BoosterHandle handle, omp_set_num_threads(config.num_threads); } - AdditionalConfig(&config); - Booster* ref_booster = reinterpret_cast(handle); auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, nelem); int nrow = static_cast(nindptr - 1); @@ -1598,8 +1564,6 @@ int LGBM_BoosterPredictForCSRSingleRow(BoosterHandle handle, omp_set_num_threads(config.num_threads); } - AdditionalConfig(&config); - Booster* ref_booster = reinterpret_cast(handle); auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, nelem); ref_booster->PredictSingleRow(num_iteration, predict_type, static_cast(num_col), get_row_fun, config, out_result, out_len); @@ -1630,8 +1594,6 @@ int LGBM_BoosterPredictForCSC(BoosterHandle handle, omp_set_num_threads(config.num_threads); } - AdditionalConfig(&config); - int num_threads = OMP_NUM_THREADS(); int ncol = static_cast(ncol_ptr - 1); std::vector> iterators(num_threads, std::vector()); @@ -1677,8 +1639,6 @@ int LGBM_BoosterPredictForMat(BoosterHandle handle, omp_set_num_threads(config.num_threads); } - AdditionalConfig(&config); - Booster* ref_booster = reinterpret_cast(handle); auto get_row_fun = RowPairFunctionFromDenseMatric(data, nrow, ncol, data_type, is_row_major); ref_booster->Predict(num_iteration, predict_type, nrow, ncol, get_row_fun, @@ -1704,8 +1664,6 @@ int LGBM_BoosterPredictForMatSingleRow(BoosterHandle handle, omp_set_num_threads(config.num_threads); } - AdditionalConfig(&config); - Booster* ref_booster = reinterpret_cast(handle); auto get_row_fun = RowPairFunctionFromDenseMatric(data, 1, ncol, data_type, is_row_major); ref_booster->PredictSingleRow(num_iteration, predict_type, ncol, get_row_fun, config, out_result, out_len); @@ -1731,8 +1689,6 @@ int LGBM_BoosterPredictForMats(BoosterHandle handle, omp_set_num_threads(config.num_threads); } - AdditionalConfig(&config); - Booster* ref_booster = reinterpret_cast(handle); auto get_row_fun = RowPairFunctionFromDenseRows(data, ncol, data_type); ref_booster->Predict(num_iteration, predict_type, nrow, ncol, get_row_fun, config, out_result, out_len); diff --git a/src/io/config.cpp b/src/io/config.cpp index 4c65d158800..8312da591dd 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -8,6 +8,8 @@ #include #include +#include + #include namespace LightGBM { @@ -208,6 +210,11 @@ void Config::Set(const std::unordered_map& params) { GetMetricType(params, &metric); GetObjectiveType(params, &objective); GetDeviceType(params, &device_type); +#ifdef USE_CUDA + if (device_type == std::string("cuda")) { + LightGBM::LGBM_config_::current_device = lgbm_device_cuda; + } +#endif GetTreeLearnerType(params, &tree_learner); GetMembersFromString(params); diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 817480d5c50..44e7be3db92 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -336,9 +336,11 @@ void Dataset::Construct(std::vector>* bin_mappers, } auto features_in_group = NoGroup(used_features); + auto 
is_sparse = io_config.is_enable_sparse; #ifdef USE_CUDA if (io_config.device_type == std::string("cuda")) { LightGBM::LGBM_config_::current_device = lgbm_device_cuda; + is_sparse = false; } #endif @@ -349,7 +351,7 @@ void Dataset::Construct(std::vector>* bin_mappers, *bin_mappers, sample_non_zero_indices, sample_values, num_per_col, num_sample_col, static_cast(total_sample_cnt), used_features, num_data_, lgbm_is_gpu_used, - io_config.is_enable_sparse, &group_is_multi_val); + is_sparse, &group_is_multi_val); } num_features_ = 0; From 9ab44b66b1235fd458c82addb3daf5bbf996ba7e Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 27 Jul 2020 14:25:54 +0000 Subject: [PATCH 103/119] Limit number of GPUs to one for now in CUDA. --- include/LightGBM/config.h | 4 ---- src/io/config_auto.cpp | 5 ----- src/treelearner/cuda_tree_learner.cpp | 13 +++++-------- src/treelearner/cuda_tree_learner.h | 2 +- 4 files changed, 6 insertions(+), 18 deletions(-) diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index c24e9613a95..2a3335c1c0a 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -954,10 +954,6 @@ struct Config { // desc = set this to ``true`` to use double precision math on GPU (by default single precision is used) bool gpu_use_dp = false; - // check = >0 - // desc = number of gpus (CUDA implementation only) - int num_gpu = 1; - #pragma endregion #pragma endregion diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index b35e8da49ab..807cad78502 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -294,7 +294,6 @@ const std::unordered_set& Config::parameter_set() { "gpu_platform_id", "gpu_device_id", "gpu_use_dp", - "num_gpu", }); return params; } @@ -606,9 +605,6 @@ void Config::GetMembersFromString(const std::unordered_mapnum_feature_groups(); // Initialize GPU buffers and kernels & LGBM_CUDA: get device info - InitGPU(config_->num_gpu); + InitGPU(); } // some functions used for debugging the GPU histogram construction @@ -435,7 +435,7 @@ void CUDATreeLearner::copyDenseFeature() { // LGBM_CUDA: InitGPU w/ num_gpu -void CUDATreeLearner::InitGPU(int num_gpu) { +void CUDATreeLearner::InitGPU() { // Get the max bin size, used for selecting best GPU kernel max_num_bin_ = 0; @@ -481,13 +481,10 @@ void CUDATreeLearner::InitGPU(int num_gpu) { // LGBM_CUDA: get num_dense_feature_groups_ CountDenseFeatureGroups(); - if (num_gpu > num_dense_feature_groups_) num_gpu = num_dense_feature_groups_; - // LGBM_CUDA: initialize GPU - int gpu_count; - - CUDASUCCESS_OR_FATAL(cudaGetDeviceCount(&gpu_count)); - num_gpu_ = (gpu_count < num_gpu)? gpu_count : num_gpu; + CUDASUCCESS_OR_FATAL(cudaGetDeviceCount(&num_gpu_)); + if (num_gpu_ > 1) num_gpu_ = 1; + if (num_gpu_ > num_dense_feature_groups_) num_gpu_ = num_dense_feature_groups_; // LGBM_CUDA: set cpu threads cpu_threads_ = reinterpret_cast(malloc(sizeof(pthread_t *)*num_gpu_)); diff --git a/src/treelearner/cuda_tree_learner.h b/src/treelearner/cuda_tree_learner.h index cc6ad806f85..5f0111015c9 100644 --- a/src/treelearner/cuda_tree_learner.h +++ b/src/treelearner/cuda_tree_learner.h @@ -77,7 +77,7 @@ class CUDATreeLearner: public SerialTreeLearner { * \brief Initialize GPU device * \LGBM_CUDA: param num_gpu: number of maximum gpus */ - void InitGPU(int num_gpu); + void InitGPU(); /*! 
* \brief Allocate memory for GPU computation // LGBM_CUDA: alloc only From 9f8a01192fa8ead4db04cb42e8afe168cc43e846 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 27 Jul 2020 15:19:58 +0000 Subject: [PATCH 104/119] Update Parameters.rst for previous check-in --- docs/Parameters.rst | 4 ---- 1 file changed, 4 deletions(-) diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 5bd392d82f8..01362fb9af3 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -1104,10 +1104,6 @@ GPU Parameters - set this to ``true`` to use double precision math on GPU (by default single precision is used) -- ``num_gpu`` :raw-html:`🔗︎`, default = ``1``, type = int, constraints: ``num_gpu > 0`` - - - number of gpus (CUDA implementation only) - .. end params list Others From 5369a8a4738d3a7b5b80a70e44240234bc64c100 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 3 Aug 2020 15:26:22 +0000 Subject: [PATCH 105/119] Whitespace removal. --- src/c_api.cpp | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/src/c_api.cpp b/src/c_api.cpp index 3a0fb59d7a3..0b319cf828d 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -118,7 +118,6 @@ class Booster { if (config_.num_threads > 0) { omp_set_num_threads(config_.num_threads); } - // create boosting if (config_.input_model.size() > 0) { Log::Warning("Continued train from model is not supported for c_api,\n" @@ -891,7 +890,6 @@ int LGBM_DatasetCreateFromFile(const char* filename, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } - DatasetLoader loader(config, nullptr, 1, filename); if (reference == nullptr) { if (Network::num_machines() == 1) { @@ -922,7 +920,6 @@ int LGBM_DatasetCreateFromSampledColumn(double** sample_data, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } - DatasetLoader loader(config, nullptr, 1, nullptr); *out = loader.ConstructFromSampleData(sample_data, sample_indices, ncol, num_per_col, num_sample_row, @@ -1034,7 +1031,6 @@ int LGBM_DatasetCreateFromMats(int32_t nmat, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } - std::unique_ptr ret; int32_t total_nrow = 0; for (int j = 0; j < nmat; ++j) { @@ -1126,7 +1122,6 @@ int LGBM_DatasetCreateFromCSR(const void* indptr, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } - std::unique_ptr ret; auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, nelem); int32_t nrow = static_cast(nindptr - 1); @@ -1194,7 +1189,6 @@ int LGBM_DatasetCreateFromCSRFunc(void* get_row_funptr, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } - std::unique_ptr ret; int32_t nrow = num_rows; if (reference == nullptr) { @@ -1266,7 +1260,6 @@ int LGBM_DatasetCreateFromCSC(const void* col_ptr, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } - std::unique_ptr ret; int32_t nrow = static_cast(num_row); if (reference == nullptr) { @@ -1350,7 +1343,6 @@ int LGBM_DatasetGetSubset( if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } - auto full_dataset = reinterpret_cast(handle); CHECK_GT(num_used_row_indices, 0); const int32_t lower = 0; @@ -1746,7 +1738,6 @@ int LGBM_BoosterPredictForFile(BoosterHandle handle, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } - Booster* ref_booster = reinterpret_cast(handle); ref_booster->Predict(num_iteration, predict_type, data_filename, data_has_header, config, result_filename); @@ -1821,7 +1812,6 @@ int LGBM_BoosterPredictForCSR(BoosterHandle 
handle, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } - Booster* ref_booster = reinterpret_cast(handle); auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, nelem); int nrow = static_cast(nindptr - 1); @@ -1941,7 +1931,6 @@ int LGBM_BoosterPredictForCSRSingleRow(BoosterHandle handle, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } - Booster* ref_booster = reinterpret_cast(handle); auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, nelem); ref_booster->SetSingleRowPredictor(num_iteration, predict_type, config); @@ -2017,7 +2006,6 @@ int LGBM_BoosterPredictForCSC(BoosterHandle handle, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } - int num_threads = OMP_NUM_THREADS(); int ncol = static_cast(ncol_ptr - 1); std::vector> iterators(num_threads, std::vector()); @@ -2062,7 +2050,6 @@ int LGBM_BoosterPredictForMat(BoosterHandle handle, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } - Booster* ref_booster = reinterpret_cast(handle); auto get_row_fun = RowPairFunctionFromDenseMatric(data, nrow, ncol, data_type, is_row_major); ref_booster->Predict(num_iteration, predict_type, nrow, ncol, get_row_fun, @@ -2087,7 +2074,6 @@ int LGBM_BoosterPredictForMatSingleRow(BoosterHandle handle, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } - Booster* ref_booster = reinterpret_cast(handle); auto get_row_fun = RowPairFunctionFromDenseMatric(data, 1, ncol, data_type, is_row_major); ref_booster->SetSingleRowPredictor(num_iteration, predict_type, config); @@ -2149,7 +2135,6 @@ int LGBM_BoosterPredictForMats(BoosterHandle handle, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } - Booster* ref_booster = reinterpret_cast(handle); auto get_row_fun = RowPairFunctionFromDenseRows(data, ncol, data_type); ref_booster->Predict(num_iteration, predict_type, nrow, ncol, get_row_fun, config, out_result, out_len); From 51e096cf18dd74918b1cdf944a4627fed571f86b Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 3 Aug 2020 19:31:53 +0000 Subject: [PATCH 106/119] Cleanup unused code. --- include/LightGBM/application.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/LightGBM/application.h b/include/LightGBM/application.h index 3fda4a1c32e..66541ec006c 100644 --- a/include/LightGBM/application.h +++ b/include/LightGBM/application.h @@ -36,9 +36,6 @@ class Application { /*! \brief To call this function to run application*/ inline void Run(); - /*! \brief call to get configuration */ - Config GetConfig() {return config_ ;} - private: /*! \brief Load parameters from command line and config file*/ void LoadParameters(int argc, char** argv); From 9ca091b97fade540b5e188f0e331013a8c9db03f Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Wed, 5 Aug 2020 15:21:11 +0000 Subject: [PATCH 107/119] Changed uint/ushort/ulong to unsigned int/short/long to help Windows based CUDA compiler work. 
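The subject above points at a portability detail: the aliases uint, ushort and ulong are predefined by OpenCL C but not by MSVC, so the CUDA sources now spell out the built-in unsigned types. A minimal illustration of the same idea with <cstdint> fixed-width aliases follows; the alias names are examples, not identifiers from the patch. Note that unsigned long is only 32 bits on 64-bit Windows, which is why the 64-bit case has to become unsigned long long.

#include <cstdint>

// hist_count_t, bin_index_t and packed_acc_t are illustrative names for the roles
// that "uint", "ushort" and "ulong" played in the kernels before this change.
using hist_count_t = std::uint32_t;   // counter-histogram entries
using bin_index_t  = std::uint16_t;   // per-thread bin indices
using packed_acc_t = std::uint64_t;   // bit-reinterpreted 64-bit accumulators

static_assert(sizeof(hist_count_t) == 4, "counters are 32-bit on every platform");
static_assert(sizeof(bin_index_t) == 2, "bin indices are 16-bit on every platform");
static_assert(sizeof(packed_acc_t) == 8, "accumulators stay 64-bit, unlike unsigned long on Windows");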
--- src/treelearner/cuda_kernel_launcher.cu | 48 ++--- src/treelearner/cuda_tree_learner.cpp | 2 +- src/treelearner/cuda_tree_learner.h | 2 +- .../kernels/histogram_16_64_256.cu | 180 +++++++++--------- .../kernels/histogram_16_64_256.hu | 20 +- 5 files changed, 126 insertions(+), 126 deletions(-) diff --git a/src/treelearner/cuda_kernel_launcher.cu b/src/treelearner/cuda_kernel_launcher.cu index 8b243200878..218be6d72b9 100644 --- a/src/treelearner/cuda_kernel_launcher.cu +++ b/src/treelearner/cuda_kernel_launcher.cu @@ -34,20 +34,20 @@ void cuda_histogram( if (use_all_features) { if (!is_constant_hessian) histogram16<<<16*num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else histogram16<<<16*num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } else { if (!is_constant_hessian) histogram16_fulldata<<<16*num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else histogram16_fulldata<<<16*num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } } else { @@ -55,20 +55,20 @@ void cuda_histogram( // seems all features is always enabled, so this should be the same as fulldata if (!is_constant_hessian) histogram16<<<16*num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else histogram16<<<16*num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } else { if (!is_constant_hessian) histogram16<<<16*num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else histogram16<<<16*num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } } @@ -77,20 +77,20 @@ void cuda_histogram( if (use_all_features) { if (!is_constant_hessian) histogram64<<<4*num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else histogram64<<<4*num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } else { if (!is_constant_hessian) histogram64_fulldata<<<4*num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else histogram64_fulldata<<<4*num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, 
arg5, + reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } } else { @@ -98,20 +98,20 @@ void cuda_histogram( // seems all features is always enabled, so this should be the same as fulldata if (!is_constant_hessian) histogram64<<<4*num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else histogram64<<<4*num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } else { if (!is_constant_hessian) histogram64<<<4*num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else histogram64<<<4*num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } } @@ -120,20 +120,20 @@ void cuda_histogram( if (use_all_features) { if (!is_constant_hessian) histogram256<<>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else histogram256<<>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } else { if (!is_constant_hessian) histogram256_fulldata<<>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else histogram256_fulldata<<>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } } else { @@ -141,20 +141,20 @@ void cuda_histogram( // seems all features is always enabled, so this should be the same as fulldata if (!is_constant_hessian) histogram256<<>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else histogram256<<>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } else { if (!is_constant_hessian) histogram256<<>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else histogram256<<>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } } diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index b4e41acc7fc..067c06635af 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -36,7 +36,7 @@ static void *launch_cuda_histogram(void *thread_data) { td.device_features, td.device_feature_masks, td.num_data, - reinterpret_cast(td.device_data_indices), + reinterpret_cast(td.device_data_indices), td.leaf_num_data, td.device_gradients, td.device_hessians, 
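// Simplified sketch, not part of the patch: how cuda_histogram() above sizes its
// launches. The block size always equals the bin count, and the grid is scaled so
// that blocks * threads stays at 256 * num_workgroups for 16-, 64- and 256-bin
// histograms alike. histogram_sketch and launch_histogram_sketch are illustrative.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void histogram_sketch(int num_bins) {
  if (blockIdx.x == 0 && threadIdx.x == 0) {
    printf("launched with %d threads per block\n", num_bins);  // stand-in for the real kernels
  }
}

static void launch_histogram_sketch(int histogram_size, int num_workgroups, cudaStream_t stream) {
  const int block = histogram_size;                           // 16, 64 or 256 threads per block
  const int grid  = (256 / histogram_size) * num_workgroups;  // keeps the total thread count constant
  histogram_sketch<<<grid, block, 0, stream>>>(histogram_size);
}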
td.hessians_const, diff --git a/src/treelearner/cuda_tree_learner.h b/src/treelearner/cuda_tree_learner.h index 5f0111015c9..1506c5cf21b 100644 --- a/src/treelearner/cuda_tree_learner.h +++ b/src/treelearner/cuda_tree_learner.h @@ -118,7 +118,7 @@ class CUDATreeLearner: public SerialTreeLearner { td->stream = stream_[device_id]; td->device_features = device_features_[device_id]; td->device_feature_masks = reinterpret_cast(device_feature_masks_[device_id]); - td->device_data_indices = reinterpret_cast(device_data_indices_[device_id]); + td->device_data_indices = reinterpret_cast(device_data_indices_[device_id]); td->device_gradients = device_gradients_[device_id]; td->device_hessians = device_hessians_[device_id]; td->hessians_const = hessians_[0]; diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index 09d563cbaf4..d156c872ec8 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -29,29 +29,29 @@ inline __device__ void atomic_local_add_f(acc_type *addr, const acc_type val) { #define KERNEL_NAME histogram16 #endif // ENABLE_ALL_FEATURES #define NUM_BINS 16 -#define LOCAL_MEM_SIZE ((sizeof(uint) + 2 * sizeof(acc_type)) * NUM_BINS) +#define LOCAL_MEM_SIZE ((sizeof(unsigned int) + 2 * sizeof(acc_type)) * NUM_BINS) // this function will be called by histogram16 // we have one sub-histogram of one feature in local memory, and need to read others inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ feature_sub_hist, - const uint skip_id, - const uint old_val_cont_bin0, - const ushort num_sub_hist, + const unsigned int skip_id, + const unsigned int old_val_cont_bin0, + const unsigned short num_sub_hist, acc_type* __restrict__ output_buf, acc_type* __restrict__ local_hist, const size_t power_feature_workgroups) { - const ushort ltid = threadIdx.x; + const unsigned short ltid = threadIdx.x; acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; - uint* __restrict__ local_cnt = reinterpret_cast(local_hist + 2 * NUM_BINS); + unsigned int* __restrict__ local_cnt = reinterpret_cast(local_hist + 2 * NUM_BINS); - uint cont_bin; + unsigned int cont_bin; if (power_feature_workgroups != 0) { cont_bin = ltid ? 
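// Host-side sketch, not part of the patch: the per-workgroup shared-memory layout used
// by these kernels, i.e. 2 * NUM_BINS (gradient, hessian) accumulators followed by
// NUM_BINS unsigned int counters inside one float2-aligned buffer. The constants below
// assume the 256-bin kernel with acc_type == double (the gpu_use_dp build).
#include <cstddef>
#include <cstdio>

int main() {
  using acc_type = double;
  const int kNumBins = 256;
  const std::size_t local_mem_size = (sizeof(unsigned int) + 2 * sizeof(acc_type)) * kNumBins;
  const std::size_t gh_bytes  = 2 * kNumBins * sizeof(acc_type);   // gradient/hessian histogram
  const std::size_t cnt_bytes = kNumBins * sizeof(unsigned int);   // counter histogram
  std::printf("gh_hist %zu B + cnt_hist %zu B = %zu B of shared memory per block\n",
              gh_bytes, cnt_bytes, local_mem_size);
  return (gh_bytes + cnt_bytes == local_mem_size) ? 0 : 1;
}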
local_cnt[ltid] : old_val_cont_bin0; } else { cont_bin = local_cnt[ltid]; } - ushort i; + unsigned short i; if (power_feature_workgroups != 0) { // add all sub-histograms for feature @@ -113,15 +113,15 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // allocate the local memory array aligned with float2, to guarantee correct alignment on NVIDIA platforms // otherwise a "Misaligned Address" exception may occur __shared__ float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)]; - const uint gtid = blockIdx.x * blockDim.x + threadIdx.x; - const ushort ltid = threadIdx.x; - const ushort lsize = NUM_BINS; // get_local_size(0); - const ushort group_id = blockIdx.x; + const unsigned int gtid = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned short ltid = threadIdx.x; + const unsigned short lsize = NUM_BINS; // get_local_size(0); + const unsigned short group_id = blockIdx.x; // local memory per workgroup is 3 KB // clear local memory - uint *ptr = reinterpret_cast(shared_array); - for (int i = ltid; i < LOCAL_MEM_SIZE/sizeof(uint); i += lsize) { + unsigned int *ptr = reinterpret_cast(shared_array); + for (int i = ltid; i < LOCAL_MEM_SIZE/sizeof(unsigned int); i += lsize) { ptr[i] = 0; } __syncthreads(); @@ -133,25 +133,25 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type *gh_hist = reinterpret_cast(shared_array); // counter histogram - // total size: 256 * size_of(uint) = 1 KB - uint *cnt_hist = reinterpret_cast(gh_hist + 2 * NUM_BINS); + // total size: 256 * size_of(unsigned int) = 1 KB + unsigned int *cnt_hist = reinterpret_cast(gh_hist + 2 * NUM_BINS); // odd threads (1, 3, ...) compute histograms for hessians first // even thread (0, 2, ...) compute histograms for gradients first // etc. uchar is_hessian_first = ltid & 1; - ushort feature_id = group_id >> power_feature_workgroups; + unsigned short feature_id = group_id >> power_feature_workgroups; // each 2^POWER_FEATURE_WORKGROUPS workgroups process on one feature (compile-time constant) // feature_size is the number of examples per feature const uchar *feature_data = feature_data_base + feature_id * feature_size; // size of threads that process this feature4 - const uint subglobal_size = lsize * (1 << power_feature_workgroups); + const unsigned int subglobal_size = lsize * (1 << power_feature_workgroups); // equavalent thread ID in this subgroup for this feature4 - const uint subglobal_tid = gtid - feature_id * subglobal_size; + const unsigned int subglobal_tid = gtid - feature_id * subglobal_size; data_size_t ind; @@ -177,7 +177,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, uchar feature; uchar feature_next; // uint8_t bin; - ushort bin; + unsigned short bin; feature = feature_data[ind >> feature_mask]; if (feature_mask) { @@ -197,7 +197,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #endif // there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4 - for (uint i = subglobal_tid; i < num_data; i += subglobal_size) { + for (unsigned int i = subglobal_tid; i < num_data; i += subglobal_size) { // prefetch the next iteration variables // we don't need bondary check because we have made the buffer large int i_next = i + subglobal_size; @@ -280,22 +280,22 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type *__restrict__ output = reinterpret_cast(output_buf) + group_id * 3 * NUM_BINS; // write gradients and hessians acc_type *__restrict__ ptr_f = output; - for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { + for (unsigned short i = 
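// Standalone sketch, not part of the patch: the indexing scheme used in the kernel
// above. 2^power_feature_workgroups consecutive blocks cooperate on one feature, so a
// block recovers its feature id with a shift and then strides across that feature's
// rows together with its sibling blocks. The kernel below only records how many rows
// each block would visit; touched is an illustrative output array with one slot per block.
#include <cuda_runtime.h>

__global__ void feature_stride_sketch(int power_feature_workgroups,
                                      unsigned int num_data,
                                      unsigned int* touched) {
  const unsigned int gtid = blockIdx.x * blockDim.x + threadIdx.x;
  const unsigned int feature_id = blockIdx.x >> power_feature_workgroups;
  const unsigned int subglobal_size = blockDim.x * (1u << power_feature_workgroups);
  const unsigned int subglobal_tid = gtid - feature_id * subglobal_size;
  unsigned int rows = 0;
  for (unsigned int i = subglobal_tid; i < num_data; i += subglobal_size) {
    ++rows;  // the real kernels read feature_data[i] here and update the shared histogram
  }
  atomicAdd(&touched[blockIdx.x], rows);
}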
ltid; i < 2 * NUM_BINS; i += lsize) { // even threads read gradients, odd threads read hessians acc_type value = gh_hist[i]; ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; } // write counts acc_int_type *__restrict__ ptr_i = reinterpret_cast(output + 2 * NUM_BINS); - for (ushort i = ltid; i < NUM_BINS; i += lsize) { - uint value = cnt_hist[i]; + for (unsigned short i = ltid; i < NUM_BINS; i += lsize) { + unsigned int value = cnt_hist[i]; ptr_i[i] = value; } __syncthreads(); __threadfence(); - uint * counter_val = cnt_hist; + unsigned int * counter_val = cnt_hist; // backup the old value - uint old_val = *counter_val; + unsigned int old_val = *counter_val; if (ltid == 0) { // all workgroups processing the same feature add this counter *counter_val = atomicAdd(const_cast(sync_counters + feature_id), 1); @@ -313,15 +313,15 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // only 1 work group, no need to increase counter // the reduction will become a simple copy if (1) { - uint old_val; // dummy + unsigned int old_val; // dummy #endif // locate our feature's block in output memory - uint output_offset = (feature_id << power_feature_workgroups); + unsigned int output_offset = (feature_id << power_feature_workgroups); acc_type const * __restrict__ feature_subhists = reinterpret_cast(output_buf) + output_offset * 3 * NUM_BINS; // skip reading the data already in local memory - // uint skip_id = feature_id ^ output_offset; - uint skip_id = group_id - output_offset; + // unsigned int skip_id = feature_id ^ output_offset; + unsigned int skip_id = group_id - output_offset; // locate output histogram location for this feature4 acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS; @@ -347,29 +347,29 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #define KERNEL_NAME histogram64 #endif // ENABLE_ALL_FEATURES #define NUM_BINS 64 -#define LOCAL_MEM_SIZE ((sizeof(uint) + 2 * sizeof(acc_type)) * NUM_BINS) +#define LOCAL_MEM_SIZE ((sizeof(unsigned int) + 2 * sizeof(acc_type)) * NUM_BINS) // this function will be called by histogram64 // we have one sub-histogram of one feature in local memory, and need to read others inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ feature_sub_hist, - const uint skip_id, - const uint old_val_cont_bin0, - const ushort num_sub_hist, + const unsigned int skip_id, + const unsigned int old_val_cont_bin0, + const unsigned short num_sub_hist, acc_type* __restrict__ output_buf, acc_type* __restrict__ local_hist, const size_t power_feature_workgroups) { - const ushort ltid = threadIdx.x; + const unsigned short ltid = threadIdx.x; acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; - uint* __restrict__ local_cnt = reinterpret_cast(local_hist + 2 * NUM_BINS); + unsigned int* __restrict__ local_cnt = reinterpret_cast(local_hist + 2 * NUM_BINS); - uint cont_bin; + unsigned int cont_bin; if (power_feature_workgroups != 0) { cont_bin = ltid ? 
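// Condensed sketch, not part of the patch: the inter-block synchronisation pattern used
// above. Each block flushes its partial histogram to global memory, then atomically
// takes a ticket from a per-feature counter; the block that draws the last ticket is the
// one that performs the final reduction. Names below are illustrative, and the "partial
// histogram" is shrunk to a single float per block to keep the sketch short.
#include <cuda_runtime.h>

__global__ void last_block_reduces(float* partial_sums,         // one partial result per block
                                   unsigned int* sync_counter,  // zero-initialised before launch
                                   float* result) {
  if (threadIdx.x == 0) {
    partial_sums[blockIdx.x] = static_cast<float>(blockIdx.x);  // stand-in partial result
    __threadfence();                      // make it visible to every other block
    const unsigned int ticket = atomicAdd(sync_counter, 1u);
    if (ticket == gridDim.x - 1) {        // last block to finish does the reduction
      float sum = 0.0f;
      for (unsigned int b = 0; b < gridDim.x; ++b) sum += partial_sums[b];
      *result = sum;
    }
  }
}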
local_cnt[ltid] : old_val_cont_bin0; } else { cont_bin = local_cnt[ltid]; } - ushort i; + unsigned short i; if (power_feature_workgroups != 0) { // add all sub-histograms for feature @@ -431,15 +431,15 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // allocate the local memory array aligned with float2, to guarantee correct alignment on NVIDIA platforms // otherwise a "Misaligned Address" exception may occur __shared__ float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)]; - const uint gtid = blockIdx.x * blockDim.x + threadIdx.x; - const ushort ltid = threadIdx.x; - const ushort lsize = NUM_BINS; // get_local_size(0); - const ushort group_id = blockIdx.x; + const unsigned int gtid = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned short ltid = threadIdx.x; + const unsigned short lsize = NUM_BINS; // get_local_size(0); + const unsigned short group_id = blockIdx.x; // local memory per workgroup is 3 KB // clear local memory - uint *ptr = reinterpret_cast(shared_array); - for (int i = ltid; i < LOCAL_MEM_SIZE/sizeof(uint); i += lsize) { + unsigned int *ptr = reinterpret_cast(shared_array); + for (int i = ltid; i < LOCAL_MEM_SIZE/sizeof(unsigned int); i += lsize) { ptr[i] = 0; } __syncthreads(); @@ -451,25 +451,25 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type *gh_hist = reinterpret_cast(shared_array); // counter histogram - // total size: 256 * size_of(uint) = 1 KB - uint *cnt_hist = reinterpret_cast(gh_hist + 2 * NUM_BINS); + // total size: 256 * size_of(unsigned int) = 1 KB + unsigned int *cnt_hist = reinterpret_cast(gh_hist + 2 * NUM_BINS); // odd threads (1, 3, ...) compute histograms for hessians first // even thread (0, 2, ...) compute histograms for gradients first // etc. uchar is_hessian_first = ltid & 1; - ushort feature_id = group_id >> power_feature_workgroups; + unsigned short feature_id = group_id >> power_feature_workgroups; // each 2^POWER_FEATURE_WORKGROUPS workgroups process on one feature (compile-time constant) // feature_size is the number of examples per feature const uchar *feature_data = feature_data_base + feature_id * feature_size; // size of threads that process this feature4 - const uint subglobal_size = lsize * (1 << power_feature_workgroups); + const unsigned int subglobal_size = lsize * (1 << power_feature_workgroups); // equavalent thread ID in this subgroup for this feature4 - const uint subglobal_tid = gtid - feature_id * subglobal_size; + const unsigned int subglobal_tid = gtid - feature_id * subglobal_size; data_size_t ind; data_size_t ind_next; @@ -494,7 +494,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, uchar feature; uchar feature_next; // uint8_t bin; - ushort bin; + unsigned short bin; feature = feature_data[ind >> feature_mask]; if (feature_mask) { @@ -514,7 +514,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #endif // there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4 - for (uint i = subglobal_tid; i < num_data; i += subglobal_size) { + for (unsigned int i = subglobal_tid; i < num_data; i += subglobal_size) { // prefetch the next iteration variables // we don't need bondary check because we have made the buffer large int i_next = i + subglobal_size; @@ -596,22 +596,22 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type *__restrict__ output = reinterpret_cast(output_buf) + group_id * 3 * NUM_BINS; // write gradients and hessians acc_type *__restrict__ ptr_f = output; - for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { + for 
(unsigned short i = ltid; i < 2 * NUM_BINS; i += lsize) { // even threads read gradients, odd threads read hessians acc_type value = gh_hist[i]; ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; } // write counts acc_int_type *__restrict__ ptr_i = reinterpret_cast(output + 2 * NUM_BINS); - for (ushort i = ltid; i < NUM_BINS; i += lsize) { - uint value = cnt_hist[i]; + for (unsigned short i = ltid; i < NUM_BINS; i += lsize) { + unsigned int value = cnt_hist[i]; ptr_i[i] = value; } __syncthreads(); __threadfence(); - uint * counter_val = cnt_hist; + unsigned int * counter_val = cnt_hist; // backup the old value - uint old_val = *counter_val; + unsigned int old_val = *counter_val; if (ltid == 0) { // all workgroups processing the same feature add this counter *counter_val = atomicAdd(const_cast(sync_counters + feature_id), 1); @@ -629,15 +629,15 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // only 1 work group, no need to increase counter // the reduction will become a simple copy if (1) { - uint old_val; // dummy + unsigned int old_val; // dummy #endif // locate our feature's block in output memory - uint output_offset = (feature_id << power_feature_workgroups); + unsigned int output_offset = (feature_id << power_feature_workgroups); acc_type const * __restrict__ feature_subhists = reinterpret_cast(output_buf) + output_offset * 3 * NUM_BINS; // skip reading the data already in local memory - // uint skip_id = feature_id ^ output_offset; - uint skip_id = group_id - output_offset; + // unsigned int skip_id = feature_id ^ output_offset; + unsigned int skip_id = group_id - output_offset; // locate output histogram location for this feature4 acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS; @@ -663,29 +663,29 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #define KERNEL_NAME histogram256 #endif // ENABLE_ALL_FEATURES #define NUM_BINS 256 -#define LOCAL_MEM_SIZE ((sizeof(uint) + 2 * sizeof(acc_type)) * NUM_BINS) +#define LOCAL_MEM_SIZE ((sizeof(unsigned int) + 2 * sizeof(acc_type)) * NUM_BINS) // this function will be called by histogram256 // we have one sub-histogram of one feature in local memory, and need to read others inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ feature_sub_hist, - const uint skip_id, - const uint old_val_cont_bin0, - const ushort num_sub_hist, + const unsigned int skip_id, + const unsigned int old_val_cont_bin0, + const unsigned short num_sub_hist, acc_type* __restrict__ output_buf, acc_type* __restrict__ local_hist, const size_t power_feature_workgroups) { - const ushort ltid = threadIdx.x; + const unsigned short ltid = threadIdx.x; acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; - uint* __restrict__ local_cnt = reinterpret_cast(local_hist + 2 * NUM_BINS); + unsigned int* __restrict__ local_cnt = reinterpret_cast(local_hist + 2 * NUM_BINS); - uint cont_bin; + unsigned int cont_bin; if (power_feature_workgroups != 0) { cont_bin = ltid ? 
local_cnt[ltid] : old_val_cont_bin0; } else { cont_bin = local_cnt[ltid]; } - ushort i; + unsigned short i; if (power_feature_workgroups != 0) { // add all sub-histograms for feature @@ -748,15 +748,15 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // allocate the local memory array aligned with float2, to guarantee correct alignment on NVIDIA platforms // otherwise a "Misaligned Address" exception may occur __shared__ float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)]; - const uint gtid = blockIdx.x * blockDim.x + threadIdx.x; - const ushort ltid = threadIdx.x; - const ushort lsize = NUM_BINS; // get_local_size(0); - const ushort group_id = blockIdx.x; + const unsigned int gtid = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned short ltid = threadIdx.x; + const unsigned short lsize = NUM_BINS; // get_local_size(0); + const unsigned short group_id = blockIdx.x; // local memory per workgroup is 3 KB // clear local memory - uint *ptr = reinterpret_cast(shared_array); - for (int i = ltid; i < LOCAL_MEM_SIZE/sizeof(uint); i += lsize) { + unsigned int *ptr = reinterpret_cast(shared_array); + for (int i = ltid; i < LOCAL_MEM_SIZE/sizeof(unsigned int); i += lsize) { ptr[i] = 0; } __syncthreads(); @@ -768,25 +768,25 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type *gh_hist = reinterpret_cast(shared_array); // counter histogram - // total size: 256 * size_of(uint) = 1 KB - uint *cnt_hist = reinterpret_cast(gh_hist + 2 * NUM_BINS); + // total size: 256 * size_of(unsigned int) = 1 KB + unsigned int *cnt_hist = reinterpret_cast(gh_hist + 2 * NUM_BINS); // odd threads (1, 3, ...) compute histograms for hessians first // even thread (0, 2, ...) compute histograms for gradients first // etc. uchar is_hessian_first = ltid & 1; - ushort feature_id = group_id >> power_feature_workgroups; + unsigned short feature_id = group_id >> power_feature_workgroups; // each 2^POWER_FEATURE_WORKGROUPS workgroups process on one feature (compile-time constant) // feature_size is the number of examples per feature const uchar *feature_data = feature_data_base + feature_id * feature_size; // size of threads that process this feature4 - const uint subglobal_size = lsize * (1 << power_feature_workgroups); + const unsigned int subglobal_size = lsize * (1 << power_feature_workgroups); // equavalent thread ID in this subgroup for this feature4 - const uint subglobal_tid = gtid - feature_id * subglobal_size; + const unsigned int subglobal_tid = gtid - feature_id * subglobal_size; data_size_t ind; data_size_t ind_next; @@ -811,7 +811,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, uchar feature; uchar feature_next; // uint8_t bin; - ushort bin; + unsigned short bin; feature = feature_data[ind >> feature_mask]; if (feature_mask) { @@ -831,7 +831,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #endif // there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4 - for (uint i = subglobal_tid; i < num_data; i += subglobal_size) { + for (unsigned int i = subglobal_tid; i < num_data; i += subglobal_size) { // prefetch the next iteration variables // we don't need bondary check because we have made the buffer large int i_next = i + subglobal_size; @@ -913,22 +913,22 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type *__restrict__ output = reinterpret_cast(output_buf) + group_id * 3 * NUM_BINS; // write gradients and hessians acc_type *__restrict__ ptr_f = output; - for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { + for 
(unsigned short i = ltid; i < 2 * NUM_BINS; i += lsize) { // even threads read gradients, odd threads read hessians acc_type value = gh_hist[i]; ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; } // write counts acc_int_type *__restrict__ ptr_i = reinterpret_cast(output + 2 * NUM_BINS); - for (ushort i = ltid; i < NUM_BINS; i += lsize) { - uint value = cnt_hist[i]; + for (unsigned short i = ltid; i < NUM_BINS; i += lsize) { + unsigned int value = cnt_hist[i]; ptr_i[i] = value; } __syncthreads(); __threadfence(); - uint * counter_val = cnt_hist; + unsigned int * counter_val = cnt_hist; // backup the old value - uint old_val = *counter_val; + unsigned int old_val = *counter_val; if (ltid == 0) { // all workgroups processing the same feature add this counter *counter_val = atomicAdd(const_cast(sync_counters + feature_id), 1); @@ -946,15 +946,15 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // only 1 work group, no need to increase counter // the reduction will become a simple copy if (1) { - uint old_val; // dummy + unsigned int old_val; // dummy #endif // locate our feature's block in output memory - uint output_offset = (feature_id << power_feature_workgroups); + unsigned int output_offset = (feature_id << power_feature_workgroups); acc_type const * __restrict__ feature_subhists = reinterpret_cast(output_buf) + output_offset * 3 * NUM_BINS; // skip reading the data already in local memory - // uint skip_id = feature_id ^ output_offset; - uint skip_id = group_id - output_offset; + // unsigned int skip_id = feature_id ^ output_offset; + unsigned int skip_id = group_id - output_offset; // locate output histogram location for this feature4 acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS; diff --git a/src/treelearner/kernels/histogram_16_64_256.hu b/src/treelearner/kernels/histogram_16_64_256.hu index e228d3b0068..8cc464dfb32 100644 --- a/src/treelearner/kernels/histogram_16_64_256.hu +++ b/src/treelearner/kernels/histogram_16_64_256.hu @@ -28,9 +28,9 @@ __device__ double as_double(const T t) { return d; } template -__device__ ulong as_ulong(const T t) { - static_assert(sizeof(T) == sizeof(ulong), "size mismatch"); - ulong u; +__device__ unsigned long long as_ulong_ulong(const T t) { + static_assert(sizeof(T) == sizeof(unsigned long long), "size mismatch"); + unsigned long long u; memcpy(&u, &t, sizeof(T)); return u; } @@ -42,9 +42,9 @@ __device__ float as_float(const T t) { return f; } template -__device__ uint as_uint(const T t) { - static_assert(sizeof(T) == sizeof(uint), "size_mismatch"); - uint u; +__device__ unsigned int as_uint(const T t) { + static_assert(sizeof(T) == sizeof(unsigned int), "size_mismatch"); + unsigned int u; memcpy(&u, &t, sizeof(T)); return u; } @@ -58,12 +58,12 @@ __device__ uchar4 as_uchar4(const T t) { #if USE_DP_FLOAT == 1 typedef double acc_type; -typedef ulong acc_int_type; +typedef unsigned long long acc_int_type; #define as_acc_type as_double -#define as_acc_int_type as_ulong +#define as_acc_int_type as_ulong_ulong #else typedef float acc_type; -typedef uint acc_int_type; +typedef unsigned int acc_int_type; #define as_acc_type as_float #define as_acc_int_type as_uint #endif @@ -73,7 +73,7 @@ typedef uint acc_int_type; #define ENABLE_ALL_FEATURES 1 #endif -typedef uint data_size_t; +typedef unsigned int data_size_t; typedef float score_t; // define all of the different kernels From 7fcecff7db1d19525636fd3bbb591385b2b72261 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Wed, 5 Aug 2020 16:47:57 +0000 Subject: [PATCH 
108/119] Lint change from previous check-in. --- .../kernels/histogram_16_64_256.cu | 64 ++++++++++--------- 1 file changed, 33 insertions(+), 31 deletions(-) diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index d156c872ec8..f57e8f9d838 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -4,7 +4,9 @@ */ #include "histogram_16_64_256.hu" -#include "stdio.h" + +#include +#include #define PRINT(b, t, fmt, ...) \ if (b == gtid && t == ltid) { \ @@ -36,11 +38,11 @@ inline __device__ void atomic_local_add_f(acc_type *addr, const acc_type val) { inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ feature_sub_hist, const unsigned int skip_id, const unsigned int old_val_cont_bin0, - const unsigned short num_sub_hist, + const uint16_t num_sub_hist, acc_type* __restrict__ output_buf, acc_type* __restrict__ local_hist, const size_t power_feature_workgroups) { - const unsigned short ltid = threadIdx.x; + const uint16_t ltid = threadIdx.x; acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; unsigned int* __restrict__ local_cnt = reinterpret_cast(local_hist + 2 * NUM_BINS); @@ -51,7 +53,7 @@ inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ } else { cont_bin = local_cnt[ltid]; } - unsigned short i; + uint16_t i; if (power_feature_workgroups != 0) { // add all sub-histograms for feature @@ -114,9 +116,9 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // otherwise a "Misaligned Address" exception may occur __shared__ float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)]; const unsigned int gtid = blockIdx.x * blockDim.x + threadIdx.x; - const unsigned short ltid = threadIdx.x; - const unsigned short lsize = NUM_BINS; // get_local_size(0); - const unsigned short group_id = blockIdx.x; + const uint16_t ltid = threadIdx.x; + const uint16_t lsize = NUM_BINS; // get_local_size(0); + const uint16_t group_id = blockIdx.x; // local memory per workgroup is 3 KB // clear local memory @@ -141,7 +143,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // etc. 
uchar is_hessian_first = ltid & 1; - unsigned short feature_id = group_id >> power_feature_workgroups; + uint16_t feature_id = group_id >> power_feature_workgroups; // each 2^POWER_FEATURE_WORKGROUPS workgroups process on one feature (compile-time constant) // feature_size is the number of examples per feature @@ -177,7 +179,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, uchar feature; uchar feature_next; // uint8_t bin; - unsigned short bin; + uint16_t bin; feature = feature_data[ind >> feature_mask]; if (feature_mask) { @@ -280,14 +282,14 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type *__restrict__ output = reinterpret_cast(output_buf) + group_id * 3 * NUM_BINS; // write gradients and hessians acc_type *__restrict__ ptr_f = output; - for (unsigned short i = ltid; i < 2 * NUM_BINS; i += lsize) { + for (uint16_t i = ltid; i < 2 * NUM_BINS; i += lsize) { // even threads read gradients, odd threads read hessians acc_type value = gh_hist[i]; ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; } // write counts acc_int_type *__restrict__ ptr_i = reinterpret_cast(output + 2 * NUM_BINS); - for (unsigned short i = ltid; i < NUM_BINS; i += lsize) { + for (uint16_t i = ltid; i < NUM_BINS; i += lsize) { unsigned int value = cnt_hist[i]; ptr_i[i] = value; } @@ -354,11 +356,11 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ feature_sub_hist, const unsigned int skip_id, const unsigned int old_val_cont_bin0, - const unsigned short num_sub_hist, + const uint16_t num_sub_hist, acc_type* __restrict__ output_buf, acc_type* __restrict__ local_hist, const size_t power_feature_workgroups) { - const unsigned short ltid = threadIdx.x; + const uint16_t ltid = threadIdx.x; acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; unsigned int* __restrict__ local_cnt = reinterpret_cast(local_hist + 2 * NUM_BINS); @@ -369,7 +371,7 @@ inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ } else { cont_bin = local_cnt[ltid]; } - unsigned short i; + uint16_t i; if (power_feature_workgroups != 0) { // add all sub-histograms for feature @@ -432,9 +434,9 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // otherwise a "Misaligned Address" exception may occur __shared__ float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)]; const unsigned int gtid = blockIdx.x * blockDim.x + threadIdx.x; - const unsigned short ltid = threadIdx.x; - const unsigned short lsize = NUM_BINS; // get_local_size(0); - const unsigned short group_id = blockIdx.x; + const uint16_t ltid = threadIdx.x; + const uint16_t lsize = NUM_BINS; // get_local_size(0); + const uint16_t group_id = blockIdx.x; // local memory per workgroup is 3 KB // clear local memory @@ -459,7 +461,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // etc. 
uchar is_hessian_first = ltid & 1; - unsigned short feature_id = group_id >> power_feature_workgroups; + uint16_t feature_id = group_id >> power_feature_workgroups; // each 2^POWER_FEATURE_WORKGROUPS workgroups process on one feature (compile-time constant) // feature_size is the number of examples per feature @@ -494,7 +496,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, uchar feature; uchar feature_next; // uint8_t bin; - unsigned short bin; + uint16_t bin; feature = feature_data[ind >> feature_mask]; if (feature_mask) { @@ -596,14 +598,14 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type *__restrict__ output = reinterpret_cast(output_buf) + group_id * 3 * NUM_BINS; // write gradients and hessians acc_type *__restrict__ ptr_f = output; - for (unsigned short i = ltid; i < 2 * NUM_BINS; i += lsize) { + for (uint16_t i = ltid; i < 2 * NUM_BINS; i += lsize) { // even threads read gradients, odd threads read hessians acc_type value = gh_hist[i]; ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; } // write counts acc_int_type *__restrict__ ptr_i = reinterpret_cast(output + 2 * NUM_BINS); - for (unsigned short i = ltid; i < NUM_BINS; i += lsize) { + for (uint16_t i = ltid; i < NUM_BINS; i += lsize) { unsigned int value = cnt_hist[i]; ptr_i[i] = value; } @@ -670,11 +672,11 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ feature_sub_hist, const unsigned int skip_id, const unsigned int old_val_cont_bin0, - const unsigned short num_sub_hist, + const uint16_t num_sub_hist, acc_type* __restrict__ output_buf, acc_type* __restrict__ local_hist, const size_t power_feature_workgroups) { - const unsigned short ltid = threadIdx.x; + const uint16_t ltid = threadIdx.x; acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; unsigned int* __restrict__ local_cnt = reinterpret_cast(local_hist + 2 * NUM_BINS); @@ -685,7 +687,7 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ } else { cont_bin = local_cnt[ltid]; } - unsigned short i; + uint16_t i; if (power_feature_workgroups != 0) { // add all sub-histograms for feature @@ -749,9 +751,9 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // otherwise a "Misaligned Address" exception may occur __shared__ float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)]; const unsigned int gtid = blockIdx.x * blockDim.x + threadIdx.x; - const unsigned short ltid = threadIdx.x; - const unsigned short lsize = NUM_BINS; // get_local_size(0); - const unsigned short group_id = blockIdx.x; + const uint16_t ltid = threadIdx.x; + const uint16_t lsize = NUM_BINS; // get_local_size(0); + const uint16_t group_id = blockIdx.x; // local memory per workgroup is 3 KB // clear local memory @@ -776,7 +778,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // etc. 
uchar is_hessian_first = ltid & 1; - unsigned short feature_id = group_id >> power_feature_workgroups; + uint16_t feature_id = group_id >> power_feature_workgroups; // each 2^POWER_FEATURE_WORKGROUPS workgroups process on one feature (compile-time constant) // feature_size is the number of examples per feature @@ -811,7 +813,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, uchar feature; uchar feature_next; // uint8_t bin; - unsigned short bin; + uint16_t bin; feature = feature_data[ind >> feature_mask]; if (feature_mask) { @@ -913,14 +915,14 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type *__restrict__ output = reinterpret_cast(output_buf) + group_id * 3 * NUM_BINS; // write gradients and hessians acc_type *__restrict__ ptr_f = output; - for (unsigned short i = ltid; i < 2 * NUM_BINS; i += lsize) { + for (uint16_t i = ltid; i < 2 * NUM_BINS; i += lsize) { // even threads read gradients, odd threads read hessians acc_type value = gh_hist[i]; ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; } // write counts acc_int_type *__restrict__ ptr_i = reinterpret_cast(output + 2 * NUM_BINS); - for (unsigned short i = ltid; i < NUM_BINS; i += lsize) { + for (uint16_t i = ltid; i < NUM_BINS; i += lsize) { unsigned int value = cnt_hist[i]; ptr_i[i] = value; } From 05274d4af60d50d8e2680a38e8d584113d3f6228 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Fri, 14 Aug 2020 11:58:29 +0000 Subject: [PATCH 109/119] Changes based on reviewers comments. --- CMakeLists.txt | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 58e5d86632c..2c9157fb034 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,18 +1,12 @@ if(USE_GPU OR APPLE) cmake_minimum_required(VERSION 3.2) elseif(USE_CUDA) - cmake_minimum_required(VERSION 3.11) - enable_language(CUDA) + cmake_minimum_required(VERSION 3.16) + PROJECT(lightgbm LANGUAGES CUDA) else() cmake_minimum_required(VERSION 2.8) endif() -if(USE_CUDA) - PROJECT(lightgbm LANGUAGES C CXX CUDA) -else() - PROJECT(lightgbm LANGUAGES C CXX) -endif() - PROJECT(lightgbm) OPTION(USE_MPI "Enable MPI-based parallel learning" OFF) @@ -135,20 +129,19 @@ endif(USE_GPU) if(USE_CUDA) find_package(CUDA REQUIRED) + SET(USE_OPENMP ON CACHE BOOL "CUDA requires OpenMP" FORCE) include_directories(${CUDA_INCLUDE_DIRS}) - LIST(APPEND CMAKE_CUDA_FLAGS -g -Xcompiler=-fopenmp -Xcompiler=-fPIC -Xcompiler=-Wall -lineinfo) - CUDA_SELECT_NVCC_ARCH_FLAGS(CUDA_ARCH_FLAGS 7.0) + LIST(APPEND CMAKE_CUDA_FLAGS -Xcompiler=${OpenMP_CXX_FLAGS} -Xcompiler=-fPIC -Xcompiler=-Wall) + CUDA_SELECT_NVCC_ARCH_FLAGS(CUDA_ARCH_FLAGS 6.0 6.1 6.2 7.0 7.5+PTX) LIST(APPEND CMAKE_CUDA_FLAGS ${CUDA_ARCH_FLAGS}) - if(CMAKE_BUILD_TYPE MATCHES Release) - LIST(APPEND CMAKE_CUDA_FLAGS -03) + if(USE_DEBUG) + SET(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -g") + else() + SET(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3 -lineinfo") endif() - - message(STATUS "CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}") string(REPLACE ";" " " CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}") message(STATUS "CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}") - set(CMAKE_CUDA_FLAGS_DEBUG -G) - set(CMAKE_CUDA_FLAGS_RELEASE -lineinfo) ADD_DEFINITIONS(-DUSE_CUDA) if (NOT DEFINED CMAKE_CUDA_STANDARD) @@ -381,12 +374,12 @@ if(USE_GPU) endif(USE_GPU) if(USE_CUDA) - set_property(TARGET lightgbm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) + set_target_properties(lightgbm PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON) TARGET_LINK_LIBRARIES( lightgbm ${histograms} ) - set_property(TARGET 
_lightgbm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) + set_target_properties(_lightgbm PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON) TARGET_LINK_LIBRARIES( _lightgbm ${histograms} From 8bb20d3433c2b3e672cdd842c6ec64749819b850 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 17 Aug 2020 13:38:23 +0000 Subject: [PATCH 110/119] More reviewer comment changes. --- CMakeLists.txt | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2c9157fb034..79870e8c54d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,12 +2,15 @@ if(USE_GPU OR APPLE) cmake_minimum_required(VERSION 3.2) elseif(USE_CUDA) cmake_minimum_required(VERSION 3.16) - PROJECT(lightgbm LANGUAGES CUDA) else() cmake_minimum_required(VERSION 2.8) endif() -PROJECT(lightgbm) +if(USE_CUDA) + PROJECT(lightgbm LANGUAGES C CXX CUDA) +else() + PROJECT(lightgbm LANGUAGES C CXX) +endif() OPTION(USE_MPI "Enable MPI-based parallel learning" OFF) OPTION(USE_OPENMP "Enable OpenMP" ON) @@ -98,6 +101,10 @@ else() ADD_DEFINITIONS(-DUSE_SOCKET) endif(USE_MPI) +if(USE_CUDA) + SET(USE_OPENMP ON CACHE BOOL "CUDA requires OpenMP" FORCE) +endif(USE_CUDA) + if(USE_OPENMP) find_package(OpenMP REQUIRED) SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") @@ -129,7 +136,6 @@ endif(USE_GPU) if(USE_CUDA) find_package(CUDA REQUIRED) - SET(USE_OPENMP ON CACHE BOOL "CUDA requires OpenMP" FORCE) include_directories(${CUDA_INCLUDE_DIRS}) LIST(APPEND CMAKE_CUDA_FLAGS -Xcompiler=${OpenMP_CXX_FLAGS} -Xcompiler=-fPIC -Xcompiler=-Wall) CUDA_SELECT_NVCC_ARCH_FLAGS(CUDA_ARCH_FLAGS 6.0 6.1 6.2 7.0 7.5+PTX) @@ -187,7 +193,6 @@ if(USE_CUDA) add_histogram("${hsize}" "-fulldata_sp_const" "True" "1" "${FULLDATA_DEFINES}") add_histogram("${hsize}" "-fulldata_sp" "True" "0" "${FULLDATA_DEFINES}") endforeach() - endif(USE_CUDA) if(USE_HDFS) From cc6d348cf77323a1ab09341f692d89ace1d5484f Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Fri, 21 Aug 2020 18:44:21 +0000 Subject: [PATCH 111/119] Adding warning for is_sparse. Revert tmp_subset code. 
Only return FeatureGroupData if not is_multi_val_ --- include/LightGBM/feature_group.h | 3 +++ src/boosting/gbdt.cpp | 15 +++------------ src/io/dataset.cpp | 3 +++ 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/include/LightGBM/feature_group.h b/include/LightGBM/feature_group.h index 2e0db94f19c..3ba5c143f85 100644 --- a/include/LightGBM/feature_group.h +++ b/include/LightGBM/feature_group.h @@ -233,6 +233,9 @@ class FeatureGroup { } inline void* FeatureGroupData() { + if (is_multi_val_) { + return nullptr; + } return bin_data_->get_data(); } diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index b63bdc1ec0f..2c9fb3b734e 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -801,18 +801,9 @@ void GBDT::ResetBaggingConfig(const Config* config, bool is_change_dataset) { } } else { bag_data_cnt_ = num_data_; - if (config_->device_type == std::string("cuda")) { - if (tmp_subset_ == nullptr) { - tmp_subset_.reset(new Dataset(bag_data_cnt_)); - tmp_subset_->CopyFeatureMapperFrom(train_data_); - is_use_subset_ = false; - bag_data_indices_.clear(); - } - } else { - bag_data_indices_.clear(); - bagging_runner_.ReSize(0); - is_use_subset_ = false; - } + bag_data_indices_.clear(); + bagging_runner_.ReSize(0); + is_use_subset_ = false; } } diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 44e7be3db92..b7689288874 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -340,6 +340,9 @@ void Dataset::Construct(std::vector>* bin_mappers, #ifdef USE_CUDA if (io_config.device_type == std::string("cuda")) { LightGBM::LGBM_config_::current_device = lgbm_device_cuda; + if (is_sparse) { + Log::Warning("Using sparse features with CUDA is currently not supported."); + } is_sparse = false; } #endif From 5f3f1e023dce5abaa0c4e13761662bd7da7d1888 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 24 Aug 2020 11:48:58 +0000 Subject: [PATCH 112/119] Fix so that CUDA code will compile even if you enable the SCORE_T_USE_DOUBLE define. 
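A minimal sketch of the warning-suppression pattern the next diff applies in LGBM_BoosterUpdateOneIterCustom: when SCORE_T_USE_DOUBLE compiles the custom-objective path out, the otherwise-unused parameters are cast to void so that -Wall builds stay warning-free. The function below is a hypothetical stand-in, not the LightGBM C API.

#include <cstdio>

// #define SCORE_T_USE_DOUBLE   // flip this to check that both branches compile cleanly
int update_with_custom_loss(void* handle, const float* grad, const float* hess, int* is_finished) {
#ifdef SCORE_T_USE_DOUBLE
  (void) handle;        // UNUSED VARIABLE: this configuration rejects custom losses,
  (void) grad;          // so the parameters are consumed by explicit void casts
  (void) hess;          // instead of triggering -Wunused-parameter
  (void) is_finished;
  std::fprintf(stderr, "custom loss is not supported when score_t is double\n");
  return -1;
#else
  if (handle == nullptr || grad == nullptr || hess == nullptr || is_finished == nullptr) return -1;
  *is_finished = 0;     // a real implementation would run one boosting iteration here
  return 0;
#endif
}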
--- src/c_api.cpp | 6 +- src/treelearner/cuda_kernel_launcher.cu | 76 ++++++++++--------- src/treelearner/cuda_kernel_launcher.h | 3 + src/treelearner/cuda_tree_learner.cpp | 6 +- src/treelearner/cuda_tree_learner.h | 2 +- .../kernels/histogram_16_64_256.cu | 7 +- .../kernels/histogram_16_64_256.hu | 12 +-- 7 files changed, 65 insertions(+), 47 deletions(-) diff --git a/src/c_api.cpp b/src/c_api.cpp index 61b3038e660..a389e8e47b1 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -1611,10 +1611,14 @@ int LGBM_BoosterUpdateOneIterCustom(BoosterHandle handle, const float* hess, int* is_finished) { API_BEGIN(); - Booster* ref_booster = reinterpret_cast(handle); #ifdef SCORE_T_USE_DOUBLE + (void) handle; // UNUSED VARIABLE + (void) grad; // UNUSED VARIABLE + (void) hess; // UNUSED VARIABLE + (void) is_finished; // UNUSED VARIABLE Log::Fatal("Don't support custom loss function when SCORE_T_USE_DOUBLE is enabled"); #else + Booster* ref_booster = reinterpret_cast(handle); if (ref_booster->TrainOneIter(grad, hess)) { *is_finished = 1; } else { diff --git a/src/treelearner/cuda_kernel_launcher.cu b/src/treelearner/cuda_kernel_launcher.cu index 218be6d72b9..87265ffd956 100644 --- a/src/treelearner/cuda_kernel_launcher.cu +++ b/src/treelearner/cuda_kernel_launcher.cu @@ -9,6 +9,8 @@ #include #include +namespace LightGBM { + void cuda_histogram( int histogram_size, data_size_t leaf_num_data, @@ -34,20 +36,20 @@ void cuda_histogram( if (use_all_features) { if (!is_constant_hessian) histogram16<<<16*num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, - static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + arg3, arg4, arg5, + arg6, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else histogram16<<<16*num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + arg3, arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } else { if (!is_constant_hessian) histogram16_fulldata<<<16*num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, - static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + arg3, arg4, arg5, + arg6, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else histogram16_fulldata<<<16*num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + arg3, arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } } else { @@ -55,20 +57,20 @@ void cuda_histogram( // seems all features is always enabled, so this should be the same as fulldata if (!is_constant_hessian) histogram16<<<16*num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, - static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + arg3, arg4, arg5, + arg6, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else histogram16<<<16*num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + arg3, arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } else { if (!is_constant_hessian) histogram16<<<16*num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, - static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + arg3, arg4, arg5, + arg6, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else histogram16<<<16*num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, 
arg5, + arg3, arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } } @@ -77,20 +79,20 @@ void cuda_histogram( if (use_all_features) { if (!is_constant_hessian) histogram64<<<4*num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, - static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + arg3, arg4, arg5, + arg6, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else histogram64<<<4*num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + arg3, arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } else { if (!is_constant_hessian) histogram64_fulldata<<<4*num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, - static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + arg3, arg4, arg5, + arg6, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else histogram64_fulldata<<<4*num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + arg3, arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } } else { @@ -98,20 +100,20 @@ void cuda_histogram( // seems all features is always enabled, so this should be the same as fulldata if (!is_constant_hessian) histogram64<<<4*num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, - static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + arg3, arg4, arg5, + arg6, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else histogram64<<<4*num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + arg3, arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } else { if (!is_constant_hessian) histogram64<<<4*num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, - static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + arg3, arg4, arg5, + arg6, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else histogram64<<<4*num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + arg3, arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } } @@ -120,20 +122,20 @@ void cuda_histogram( if (use_all_features) { if (!is_constant_hessian) histogram256<<>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, - static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + arg3, arg4, arg5, + arg6, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else histogram256<<>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + arg3, arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } else { if (!is_constant_hessian) histogram256_fulldata<<>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, - static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + arg3, arg4, arg5, + arg6, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else histogram256_fulldata<<>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + arg3, arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } } else { @@ -141,24 +143,26 @@ void cuda_histogram( // seems all features is always enabled, so this should be the same as fulldata if (!is_constant_hessian) histogram256<<>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, - 
static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + arg3, arg4, arg5, + arg6, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else histogram256<<>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + arg3, arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } else { if (!is_constant_hessian) histogram256<<>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, - static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + arg3, arg4, arg5, + arg6, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else histogram256<<>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + arg3, arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } } } } +} // namespace LightGBM + #endif // USE_CUDA diff --git a/src/treelearner/cuda_kernel_launcher.h b/src/treelearner/cuda_kernel_launcher.h index 6b6b7cb4b01..faa2b436de2 100644 --- a/src/treelearner/cuda_kernel_launcher.h +++ b/src/treelearner/cuda_kernel_launcher.h @@ -9,6 +9,8 @@ #include #include "kernels/histogram_16_64_256.hu" // kernel, acc_type, data_size_t, uchar, score_t +namespace LightGBM { + struct ThreadData { // device id int device_id; @@ -64,6 +66,7 @@ void cuda_histogram( void* arg9, size_t exp_workgroups_per_feature); +} // namespace LightGBM #endif // USE_CUDA #endif // LIGHTGBM_TREELEARNER_CUDA_KERNEL_LAUNCHER_H_ diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 067c06635af..306d6700c22 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -19,6 +19,8 @@ #include "../io/dense_bin.hpp" +namespace LightGBM { + #define cudaMemcpy_DEBUG 0 // 1: DEBUG cudaMemcpy #define ResetTrainingData_DEBUG 0 // 1: Debug ResetTrainingData @@ -36,7 +38,7 @@ static void *launch_cuda_histogram(void *thread_data) { td.device_features, td.device_feature_masks, td.num_data, - reinterpret_cast(td.device_data_indices), + td.device_data_indices, td.leaf_num_data, td.device_gradients, td.device_hessians, td.hessians_const, @@ -49,8 +51,6 @@ static void *launch_cuda_histogram(void *thread_data) { return NULL; } -namespace LightGBM { - CUDATreeLearner::CUDATreeLearner(const Config* config) :SerialTreeLearner(config) { use_bagging_ = false; diff --git a/src/treelearner/cuda_tree_learner.h b/src/treelearner/cuda_tree_learner.h index 1506c5cf21b..e8bc9d331f7 100644 --- a/src/treelearner/cuda_tree_learner.h +++ b/src/treelearner/cuda_tree_learner.h @@ -118,7 +118,7 @@ class CUDATreeLearner: public SerialTreeLearner { td->stream = stream_[device_id]; td->device_features = device_features_[device_id]; td->device_feature_masks = reinterpret_cast(device_feature_masks_[device_id]); - td->device_data_indices = reinterpret_cast(device_data_indices_[device_id]); + td->device_data_indices = device_data_indices_[device_id]; td->device_gradients = device_gradients_[device_id]; td->device_hessians = device_hessians_[device_id]; td->hessians_const = hessians_[0]; diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index f57e8f9d838..5c7cfdb4a9e 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -3,11 +3,15 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. 
*/ -#include "histogram_16_64_256.hu" +#include #include #include +#include "histogram_16_64_256.hu" + +namespace LightGBM { + #define PRINT(b, t, fmt, ...) \ if (b == gtid && t == ltid) { \ printf(fmt, __VA_ARGS__); \ @@ -966,3 +970,4 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // end of histogram256 stuff +} // namespace LightGBM diff --git a/src/treelearner/kernels/histogram_16_64_256.hu b/src/treelearner/kernels/histogram_16_64_256.hu index 8cc464dfb32..8e3d3a5ec78 100644 --- a/src/treelearner/kernels/histogram_16_64_256.hu +++ b/src/treelearner/kernels/histogram_16_64_256.hu @@ -6,7 +6,9 @@ #ifndef LIGHTGBM_TREELEARNER_KERNELS_HISTOGRAM_16_64_256_HU_ #define LIGHTGBM_TREELEARNER_KERNELS_HISTOGRAM_16_64_256_HU_ -//#pragma once +#include "LightGBM/meta.h" + +namespace LightGBM { // use double precision or not #ifndef USE_DP_FLOAT @@ -73,9 +75,6 @@ typedef unsigned int acc_int_type; #define ENABLE_ALL_FEATURES 1 #endif -typedef unsigned int data_size_t; -typedef float score_t; - // define all of the different kernels #define DECLARE_CONST_BUF(name) \ @@ -156,4 +155,7 @@ DECLARE(histogram256_allfeats); DECLARE(histogram256_fulldata); DECLARE(histogram256); -#endif // _HITOGRAM_256_KERNEL_ +} // namespace LightGBM + +#endif // LIGHTGBM_TREELEARNER_KERNELS_HISTOGRAM_16_64_256_HU_ + From 676807a0da6ca2da2f5b34294f6d5c5b5b61138c Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 24 Aug 2020 15:03:08 +0000 Subject: [PATCH 113/119] Reviewer comment cleanup. --- include/LightGBM/cuda/cuda_utils.h | 8 +- include/LightGBM/cuda/vector_cudahost.h | 32 +++-- src/application/application.cpp | 2 +- src/boosting/gbdt.h | 2 +- src/io/config.cpp | 16 +-- src/io/dataset.cpp | 2 +- src/io/dense_bin.hpp | 1 - src/treelearner/cuda_kernel_launcher.cu | 3 + src/treelearner/cuda_kernel_launcher.h | 2 - src/treelearner/cuda_tree_learner.cpp | 133 +++++++++--------- src/treelearner/cuda_tree_learner.h | 65 ++++----- .../kernels/histogram_16_64_256.cu | 45 +++--- src/treelearner/parallel_tree_learner.h | 2 +- src/treelearner/serial_tree_learner.h | 2 +- src/treelearner/tree_learner.cpp | 2 +- 15 files changed, 144 insertions(+), 173 deletions(-) diff --git a/include/LightGBM/cuda/cuda_utils.h b/include/LightGBM/cuda/cuda_utils.h index 3c0264cb396..b94b12d1c92 100644 --- a/include/LightGBM/cuda/cuda_utils.h +++ b/include/LightGBM/cuda/cuda_utils.h @@ -2,8 +2,8 @@ * Copyright (c) 2020 IBM Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. 
*/ -#ifndef LIGHTGBM_CUDA_UTILS_H_ -#define LIGHTGBM_CUDA_UTILS_H_ +#ifndef LIGHTGBM_CUDA_CUDA_UTILS_H_ +#define LIGHTGBM_CUDA_CUDA_UTILS_H_ #ifdef USE_CUDA @@ -19,6 +19,6 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = } } -#endif /* USE_CUDA */ +#endif // USE_CUDA -#endif +#endif // LIGHTGBM_CUDA_CUDA_UTILS_H_ diff --git a/include/LightGBM/cuda/vector_cudahost.h b/include/LightGBM/cuda/vector_cudahost.h index a5d97370261..60a82cc8391 100644 --- a/include/LightGBM/cuda/vector_cudahost.h +++ b/include/LightGBM/cuda/vector_cudahost.h @@ -5,21 +5,27 @@ #ifndef LIGHTGBM_CUDA_VECTOR_CUDAHOST_H_ #define LIGHTGBM_CUDA_VECTOR_CUDAHOST_H_ +#include + #ifdef USE_CUDA #include #include #endif #include -namespace LightGBM { +enum LGBM_Device { + lgbm_device_cpu, + lgbm_device_gpu, + lgbm_device_cuda +}; -#define lgbm_device_cpu 0 -#define lgbm_device_gpu 1 -#define lgbm_device_cuda 2 +enum Use_Learner { + use_cpu_learner, + use_gpu_learner, + use_cuda_learner +}; -#define use_cpu_learner 0 -#define use_gpu_learner 1 -#define use_cuda_learner 2 +namespace LightGBM { class LGBM_config_ { public: @@ -43,13 +49,13 @@ struct CHAllocator { cudaError_t ret = cudaHostAlloc(&ptr, n*sizeof(T), cudaHostAllocPortable); if (ret != cudaSuccess) { fprintf(stderr, " TROUBLE: defaulting to malloc in CHAllocator!!!\n"); fflush(stderr); - ptr = reinterpret_cast(malloc(n*sizeof(T))); + ptr = reinterpret_cast(_mm_malloc(n*sizeof(T), 16)); } } else { - ptr = reinterpret_cast(malloc(n*sizeof(T))); + ptr = reinterpret_cast(_mm_malloc(n*sizeof(T), 16)); } #else - ptr = reinterpret_cast(malloc(n*sizeof(T))); + ptr = reinterpret_cast(_mm_malloc(n*sizeof(T), 16)); #endif return ptr; } @@ -65,10 +71,10 @@ struct CHAllocator { cudaFreeHost(p); } } else { - free(p); + _mm_free(p); } #else - free(p); + _mm_free(p); #endif } }; @@ -77,4 +83,4 @@ bool operator==(const CHAllocator&, const CHAllocator&); template bool operator!=(const CHAllocator&, const CHAllocator&); -#endif +#endif // LIGHTGBM_CUDA_VECTOR_CUDAHOST_H_ diff --git a/src/application/application.cpp b/src/application/application.cpp index bd8f103acbe..c62cdd711e0 100644 --- a/src/application/application.cpp +++ b/src/application/application.cpp @@ -11,10 +11,10 @@ #include #include #include +#include #include #include #include -#include #include #include diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index 865a64dfe3b..0d38385d5f0 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -8,9 +8,9 @@ #include #include #include +#include #include #include -#include #include #include diff --git a/src/io/config.cpp b/src/io/config.cpp index 8312da591dd..f0a9544e3b9 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -4,12 +4,11 @@ */ #include +#include #include #include #include -#include - #include namespace LightGBM { @@ -328,20 +327,15 @@ void Config::CheckParamConflict() { num_leaves = static_cast(full_num_leaves); } } - // force col-wise for gpu - if (device_type == std::string("gpu")) { - force_col_wise = true; - force_row_wise = false; - } - - // force col-wise for CUDA - if (device_type == std::string("cuda")) { + // force col-wise for gpu & CUDA + if (device_type == std::string("gpu") || device_type == std::string("cuda")) { force_col_wise = true; force_row_wise = false; } // force gpu_use_dp for CUDA - if (device_type == std::string("cuda")) { + if (device_type == std::string("cuda") && !gpu_use_dp) { + Log::Warning("CUDA currently requires double precision calculations."); gpu_use_dp = true; } diff --git 
a/src/io/dataset.cpp b/src/io/dataset.cpp index b7689288874..2d9693bc695 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -6,10 +6,10 @@ #include #include +#include #include #include #include -#include #include #include diff --git a/src/io/dense_bin.hpp b/src/io/dense_bin.hpp index c5a95d6af79..4a1cc43fa79 100644 --- a/src/io/dense_bin.hpp +++ b/src/io/dense_bin.hpp @@ -8,7 +8,6 @@ #include #include -#include #include #include diff --git a/src/treelearner/cuda_kernel_launcher.cu b/src/treelearner/cuda_kernel_launcher.cu index 87265ffd956..8ceb5b813c9 100644 --- a/src/treelearner/cuda_kernel_launcher.cu +++ b/src/treelearner/cuda_kernel_launcher.cu @@ -5,8 +5,11 @@ #ifdef USE_CUDA #include "cuda_kernel_launcher.h" + #include + #include + #include namespace LightGBM { diff --git a/src/treelearner/cuda_kernel_launcher.h b/src/treelearner/cuda_kernel_launcher.h index faa2b436de2..0714e05b2f2 100644 --- a/src/treelearner/cuda_kernel_launcher.h +++ b/src/treelearner/cuda_kernel_launcher.h @@ -24,9 +24,7 @@ struct ThreadData { cudaStream_t stream; uint8_t* device_features; uint8_t* device_feature_masks; - // data_size_t num_data; data_size_t* device_data_indices; - // data_size_t leaf_num_data; score_t* device_gradients; score_t* device_hessians; score_t hessians_const; diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 306d6700c22..813f99d4ba8 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -5,17 +5,17 @@ #ifdef USE_CUDA #include "cuda_tree_learner.h" -#include -#include #include +#include +#include +#include +#include #include -#include - #include -#include #include +#include #include "../io/dense_bin.hpp" @@ -24,7 +24,7 @@ namespace LightGBM { #define cudaMemcpy_DEBUG 0 // 1: DEBUG cudaMemcpy #define ResetTrainingData_DEBUG 0 // 1: Debug ResetTrainingData -#define GPU_DEBUG 0 +#define CUDA_DEBUG 0 static void *launch_cuda_histogram(void *thread_data) { ThreadData td = *(reinterpret_cast(thread_data)); @@ -33,18 +33,18 @@ static void *launch_cuda_histogram(void *thread_data) { // launch cuda kernel cuda_histogram(td.histogram_size, - td.leaf_num_data, td.num_data, td.use_all_features, - td.is_constant_hessian, td.num_workgroups, td.stream, - td.device_features, - td.device_feature_masks, - td.num_data, - td.device_data_indices, - td.leaf_num_data, - td.device_gradients, - td.device_hessians, td.hessians_const, - td.device_subhistograms, td.sync_counters, - td.device_histogram_outputs, - td.exp_workgroups_per_feature); + td.leaf_num_data, td.num_data, td.use_all_features, + td.is_constant_hessian, td.num_workgroups, td.stream, + td.device_features, + td.device_feature_masks, + td.num_data, + td.device_data_indices, + td.leaf_num_data, + td.device_gradients, + td.device_hessians, td.hessians_const, + td.device_subhistograms, td.sync_counters, + td.device_histogram_outputs, + td.exp_workgroups_per_feature); CUDASUCCESS_OR_FATAL(cudaGetLastError()); @@ -73,12 +73,12 @@ void CUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) // some additional variables needed for GPU trainer num_feature_groups_ = train_data_->num_feature_groups(); - // Initialize GPU buffers and kernels & LGBM_CUDA: get device info + // Initialize GPU buffers and kernels: get device info InitGPU(); } // some functions used for debugging the GPU histogram construction -#if GPU_DEBUG > 0 +#if CUDA_DEBUG > 0 void PrintHistograms(hist_t* h, size_t size) { double total_hess = 0; @@ -169,7 +169,7 @@ int 
CUDATreeLearner::GetNumWorkgroupsPerFeature(data_size_t leaf_num_data) { int exp_workgroups_per_feature = static_cast(ceil(log2(x))); double t = leaf_num_data / 1024.0; - Log::Debug("We can have at most %d workgroups per feature4 for efficiency reasons" + Log::Debug("We can have at most %d workgroups per feature4 for efficiency reasons\n" "Best workgroup size per feature for full utilization is %d\n", static_cast(ceil(t)), (1 << exp_workgroups_per_feature)); exp_workgroups_per_feature = std::min(exp_workgroups_per_feature, static_cast(ceil(log(static_cast(t))/log(2.0)))); @@ -188,7 +188,7 @@ void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featu // each 2^exp_workgroups_per_feature workgroups work on a feature4 tuple int exp_workgroups_per_feature = GetNumWorkgroupsPerFeature(leaf_num_data); std::vector num_gpu_workgroups; - ThreadData *thread_data = reinterpret_cast(malloc(sizeof(ThreadData) * num_gpu_)); + ThreadData *thread_data = reinterpret_cast(_mm_malloc(sizeof(ThreadData) * num_gpu_, 16)); for (int device_id = 0; device_id < num_gpu_; ++device_id) { int num_gpu_feature_groups = num_gpu_feature_groups_[device_id]; @@ -197,7 +197,7 @@ void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featu if (num_workgroups > preallocd_max_num_wg_[device_id]) { preallocd_max_num_wg_.at(device_id) = num_workgroups; CUDASUCCESS_OR_FATAL(cudaFree(device_subhistograms_[device_id])); - CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_subhistograms_[device_id]), (size_t) num_workgroups * dword_features_ * device_bin_size_ * (3 * hist_bin_entry_sz_ / 2))); + CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_subhistograms_[device_id]), static_cast(num_workgroups * dword_features_ * device_bin_size_ * (3 * hist_bin_entry_sz_ / 2)))); } // set thread_data SetThreadData(thread_data, device_id, histogram_size_, leaf_num_data, use_all_features, @@ -206,16 +206,14 @@ void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featu for (int device_id = 0; device_id < num_gpu_; ++device_id) { if (pthread_create(cpu_threads_[device_id], NULL, launch_cuda_histogram, reinterpret_cast(&thread_data[device_id]))) { - fprintf(stderr, "Error in creating threads. Exiting\n"); - exit(0); + Log::Fatal("Error in creating threads."); } } /* Wait for the threads to finish */ for (int device_id = 0; device_id < num_gpu_; ++device_id) { if (pthread_join(*(cpu_threads_[device_id]), NULL)) { - fprintf(stderr, "Error in joining threads. Exiting\n"); - exit(0); + Log::Fatal("Error in joining threads."); } } @@ -287,7 +285,7 @@ void CUDATreeLearner::prevAllocateGPUMemory() { return; } - // LGBM_CUDA: calculate number of feature groups per gpu + // calculate number of feature groups per gpu num_gpu_feature_groups_.resize(num_gpu_); offset_gpu_feature_groups_.resize(num_gpu_); int num_features_per_gpu = num_dense_feature_groups_ / num_gpu_; @@ -297,7 +295,7 @@ void CUDATreeLearner::prevAllocateGPUMemory() { for (int i = 0; i < num_gpu_; ++i) { offset_gpu_feature_groups_.at(i) = offset; - num_gpu_feature_groups_.at(i) = (i < remain_features)? num_features_per_gpu + 1 : num_features_per_gpu; + num_gpu_feature_groups_.at(i) = (i < remain_features) ? 
num_features_per_gpu + 1 : num_features_per_gpu; offset += num_gpu_feature_groups_.at(i); } @@ -317,7 +315,7 @@ void CUDATreeLearner::prevAllocateGPUMemory() { nthreads_ = std::max(nthreads_, 1); } -// LGBM_CUDA: allocate GPU memory for each GPU +// allocate GPU memory for each GPU void CUDATreeLearner::AllocateGPUMemory() { #pragma omp parallel for schedule(static, num_gpu_) @@ -328,11 +326,11 @@ void CUDATreeLearner::AllocateGPUMemory() { CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); // allocate memory for all features - if ( device_features_[device_id] != NULL ) { - CUDASUCCESS_OR_FATAL(cudaFree(device_features_[device_id])); + if (device_features_[device_id] != NULL) { + CUDASUCCESS_OR_FATAL(cudaFree(device_features_[device_id])); } - CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_features_[device_id]), (size_t)num_gpu_feature_groups * num_data_ * sizeof(uint8_t))); + CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_features_[device_id]), static_cast(num_gpu_feature_groups * num_data_ * sizeof(uint8_t)))); Log::Debug("Allocated device_features_ addr=%p sz=%lu", device_features_[device_id], num_gpu_feature_groups * num_data_); // allocate space for gradients and hessians on device @@ -349,17 +347,17 @@ void CUDATreeLearner::AllocateGPUMemory() { CUDASUCCESS_OR_FATAL(cudaFree(device_feature_masks_[device_id])); } - CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_gradients_[device_id]), (size_t) allocated_num_data_ * sizeof(score_t))); - CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_hessians_[device_id]), (size_t) allocated_num_data_ * sizeof(score_t))); + CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_gradients_[device_id]), static_cast(allocated_num_data_ * sizeof(score_t)))); + CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_hessians_[device_id]), static_cast(allocated_num_data_ * sizeof(score_t)))); - CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_feature_masks_[device_id]), (size_t) num_gpu_feature_groups)); + CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_feature_masks_[device_id]), static_cast(num_gpu_feature_groups))); // copy indices to the device if (device_data_indices_[device_id] != NULL) { CUDASUCCESS_OR_FATAL(cudaFree(device_data_indices_[device_id])); } - CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_data_indices_[device_id]), (size_t) allocated_num_data_ * sizeof(data_size_t))); + CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_data_indices_[device_id]), static_cast(allocated_num_data_ * sizeof(data_size_t)))); CUDASUCCESS_OR_FATAL(cudaMemsetAsync(device_data_indices_[device_id], 0, allocated_num_data_ * sizeof(data_size_t), stream_[device_id])); Log::Debug("Memset device_data_indices_"); @@ -368,19 +366,19 @@ void CUDATreeLearner::AllocateGPUMemory() { // each work group generates a sub-histogram of dword_features_ features. 
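         // note: each sub-histogram bin stores a grad/hess pair (hist_bin_entry_sz_, two elements) plus a
         // same-width per-bin counter used by the kernels, which is where the 3 * hist_bin_entry_sz_ / 2
         // sizing below comes from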
if (!device_subhistograms_[device_id]) { // only initialize once here, as this will not need to change when ResetTrainingData() is called - CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_subhistograms_[device_id]), (size_t) preallocd_max_num_wg_[device_id] * dword_features_ * device_bin_size_ * (3 * hist_bin_entry_sz_ / 2))); + CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_subhistograms_[device_id]), static_cast(preallocd_max_num_wg_[device_id] * dword_features_ * device_bin_size_ * (3 * hist_bin_entry_sz_ / 2)))); Log::Debug("created device_subhistograms_: %p", device_subhistograms_[device_id]); } // create atomic counters for inter-group coordination CUDASUCCESS_OR_FATAL(cudaFree(sync_counters_[device_id])); - CUDASUCCESS_OR_FATAL(cudaMalloc(&(sync_counters_[device_id]), (size_t) num_gpu_feature_groups * sizeof(int))); + CUDASUCCESS_OR_FATAL(cudaMalloc(&(sync_counters_[device_id]), static_cast(num_gpu_feature_groups * sizeof(int)))); CUDASUCCESS_OR_FATAL(cudaMemsetAsync(sync_counters_[device_id], 0, num_gpu_feature_groups * sizeof(int), stream_[device_id])); // The output buffer is allocated to host directly, to overlap compute and data transfer CUDASUCCESS_OR_FATAL(cudaFree(device_histogram_outputs_[device_id])); - CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_histogram_outputs_[device_id]), (size_t) num_gpu_feature_groups * device_bin_size_ * hist_bin_entry_sz_)); + CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_histogram_outputs_[device_id]), static_cast(num_gpu_feature_groups * device_bin_size_ * hist_bin_entry_sz_))); } } } @@ -399,7 +397,7 @@ void CUDATreeLearner::copyDenseFeature() { Log::Debug("Started copying dense features from CPU to GPU"); // find the dense feature-groups and group then into Feature4 data structure (several feature-groups packed into 4 bytes) - size_t copied_feature = 0; + size_t copied_feature = 0; // set device info int device_id = 0; uint8_t* device_features = device_features_[device_id]; @@ -412,12 +410,12 @@ void CUDATreeLearner::copyDenseFeature() { dense_feature_group_map_.push_back(i); auto sizes_in_byte = train_data_->FeatureGroupSizesInByte(i); void* tmp_data = train_data_->FeatureGroupData(i); - Log::Debug("Started copying dense features from CPU to GPU - 2"); + Log::Debug("Started copying dense features from CPU to GPU - 2"); CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(&device_features[copied_feature * num_data_], tmp_data, sizes_in_byte, cudaMemcpyHostToDevice, stream_[device_id])); - Log::Debug("Started copying dense features from CPU to GPU - 3"); + Log::Debug("Started copying dense features from CPU to GPU - 3"); copied_feature++; // reset device info - if (copied_feature == (size_t) num_gpu_feature_groups_[device_id]) { + if (copied_feature == static_cast(num_gpu_feature_groups_[device_id])) { CUDASUCCESS_OR_FATAL(cudaEventRecord(features_future_[device_id], stream_[device_id])); device_id += 1; copied_feature = 0; @@ -434,24 +432,24 @@ void CUDATreeLearner::copyDenseFeature() { -// LGBM_CUDA: InitGPU w/ num_gpu +// InitGPU w/ num_gpu void CUDATreeLearner::InitGPU() { // Get the max bin size, used for selecting best GPU kernel max_num_bin_ = 0; - #if GPU_DEBUG >= 1 + #if CUDA_DEBUG >= 1 printf("bin_size: "); #endif for (int i = 0; i < num_feature_groups_; ++i) { if (train_data_->IsMultiGroup(i)) { continue; } - #if GPU_DEBUG >= 1 + #if CUDA_DEBUG >= 1 printf("%d, ", train_data_->FeatureGroupNumBin(i)); #endif max_num_bin_ = std::max(max_num_bin_, train_data_->FeatureGroupNumBin(i)); } - #if GPU_DEBUG >= 1 + #if CUDA_DEBUG >= 1 printf("\n"); #endif @@ -478,21 +476,24 @@ 
void CUDATreeLearner::InitGPU() { Log::Warning("Setting max_bin to 15 is sugguested for best performance"); } - // LGBM_CUDA: get num_dense_feature_groups_ + // get num_dense_feature_groups_ CountDenseFeatureGroups(); - // LGBM_CUDA: initialize GPU + // initialize GPU CUDASUCCESS_OR_FATAL(cudaGetDeviceCount(&num_gpu_)); - if (num_gpu_ > 1) num_gpu_ = 1; + if (num_gpu_ > 1) { + Log::Warning("CUDA doesn't support more than one GPU currently."); + num_gpu_ = 1; + } if (num_gpu_ > num_dense_feature_groups_) num_gpu_ = num_dense_feature_groups_; - // LGBM_CUDA: set cpu threads - cpu_threads_ = reinterpret_cast(malloc(sizeof(pthread_t *)*num_gpu_)); + // set cpu threads + cpu_threads_ = reinterpret_cast(_mm_malloc(sizeof(pthread_t *)*num_gpu_, 16)); for (int device_id = 0; device_id < num_gpu_; ++device_id) { - cpu_threads_[device_id] = reinterpret_cast(malloc(sizeof(pthread_t))); + cpu_threads_[device_id] = reinterpret_cast(_mm_malloc(sizeof(pthread_t), 16)); } - // LGBM_CUDA: resize device memory pointers + // resize device memory pointers device_features_.resize(num_gpu_); device_gradients_.resize(num_gpu_); device_hessians_.resize(num_gpu_); @@ -502,7 +503,7 @@ void CUDATreeLearner::InitGPU() { device_subhistograms_.resize(num_gpu_); device_histogram_outputs_.resize(num_gpu_); - // LGBM_CUDA: create stream & events to handle multiple GPUs + // create stream & events to handle multiple GPUs preallocd_max_num_wg_.resize(num_gpu_, 1024); stream_.resize(num_gpu_); hessians_future_.resize(num_gpu_); @@ -539,7 +540,7 @@ Tree* CUDATreeLearner::Train(const score_t* gradients, const score_t *hessians) } void CUDATreeLearner::ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) { - // LGBM_CUDA: check data size + // check data size data_size_t old_allocated_num_data = allocated_num_data_; SerialTreeLearner::ResetTrainingDataInner(train_data, is_constant_hessian, reset_multi_val_bin); @@ -555,7 +556,7 @@ void CUDATreeLearner::ResetTrainingDataInner(const Dataset* train_data, bool is_ auto start_alloc_gpu_time = std::chrono::steady_clock::now(); #endif - // LGBM_CUDA: AllocateGPUMemory only when the number of data increased + // AllocateGPUMemory only when the number of data increased int old_num_feature_groups = num_dense_feature_groups_; CountDenseFeatureGroups(); if ((old_allocated_num_data < (num_data_ + 256 * (1 << kMaxLogWorkgroupsPerFeature))) || (old_num_feature_groups < num_dense_feature_groups_)) { @@ -594,7 +595,7 @@ void CUDATreeLearner::BeforeTrain() { SerialTreeLearner::BeforeTrain(); - #if GPU_DEBUG >= 2 + #if CUDA_DEBUG >= 2 printf("CUDATreeLearner::BeforeTrain() Copying initial full gradients and hessians to device\n"); #endif @@ -750,7 +751,7 @@ bool CUDATreeLearner::ConstructGPUHistogramsAsync( // if not all feature groups are used, we need to transfer the feature mask to GPU // otherwise, we will use a specialized GPU kernel with all feature groups enabled - // LGBM_CUDA We now copy even if all features are used. + // We now copy even if all features are used. 
#pragma omp parallel for schedule(static, num_gpu_) for (int device_id = 0; device_id < num_gpu_; ++device_id) { int offset = offset_gpu_feature_groups_[device_id]; @@ -819,8 +820,8 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ } // Compare GPU histogram with CPU histogram, useful for debuggin GPU code problem - // #define GPU_DEBUG_COMPARE -#ifdef GPU_DEBUG_COMPARE + // #define CUDA_DEBUG_COMPARE +#ifdef CUDA_DEBUG_COMPARE printf("Start Comparing_Histogram between GPU and CPU, num_dense_feature_groups_ = %d\n", num_dense_feature_groups_); bool compare = true; for (int i = 0; i < num_dense_feature_groups_; ++i) { @@ -927,7 +928,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ void CUDATreeLearner::FindBestSplits(const Tree* tree) { SerialTreeLearner::FindBestSplits(tree); -#if GPU_DEBUG >= 3 +#if CUDA_DEBUG >= 3 for (int feature_index = 0; feature_index < num_features_; ++feature_index) { if (!col_sampler_.is_feature_used_bytree()[feature_index]) continue; if (parent_leaf_histogram_array_ != nullptr @@ -948,7 +949,7 @@ void CUDATreeLearner::FindBestSplits(const Tree* tree) { void CUDATreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) { const SplitInfo& best_split_info = best_split_per_leaf_[best_Leaf]; -#if GPU_DEBUG >= 2 +#if CUDA_DEBUG >= 2 printf("Splitting leaf %d with feature %d thresh %d gain %f stat %f %f %f %f\n", best_Leaf, best_split_info.feature, best_split_info.threshold, best_split_info.gain, best_split_info.left_sum_gradient, best_split_info.right_sum_gradient, best_split_info.left_sum_hessian, best_split_info.right_sum_hessian); #endif SerialTreeLearner::Split(tree, best_Leaf, left_leaf, right_leaf); @@ -957,12 +958,12 @@ void CUDATreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* righ if (best_split_info.left_count < best_split_info.right_count) { if ((best_split_info.left_count != smaller_leaf_splits_->num_data_in_leaf()) || (best_split_info.right_count!= larger_leaf_splits_->num_data_in_leaf())) { - Log::Fatal("1 Bug in GPU histogram! split %d: %d, smaller_leaf: %d, larger_leaf: %d\n", best_split_info.left_count, best_split_info.right_count, smaller_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf()); + Log::Fatal("Bug in GPU histogram! split %d: %d, smaller_leaf: %d, larger_leaf: %d\n", best_split_info.left_count, best_split_info.right_count, smaller_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf()); } } else { if ((best_split_info.left_count != larger_leaf_splits_->num_data_in_leaf()) || (best_split_info.right_count!= smaller_leaf_splits_->num_data_in_leaf())) { - Log::Fatal("2 Bug in GPU histogram! split %d: %d, smaller_leaf: %d, larger_leaf: %d\n", best_split_info.left_count, best_split_info.right_count, smaller_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf()); + Log::Fatal("Bug in GPU histogram! 
split %d: %d, smaller_leaf: %d, larger_leaf: %d\n", best_split_info.left_count, best_split_info.right_count, smaller_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf()); } } } diff --git a/src/treelearner/cuda_tree_learner.h b/src/treelearner/cuda_tree_learner.h index e8bc9d331f7..009e2471bb4 100644 --- a/src/treelearner/cuda_tree_learner.h +++ b/src/treelearner/cuda_tree_learner.h @@ -6,20 +6,22 @@ #ifndef LIGHTGBM_TREELEARNER_CUDA_TREE_LEARNER_H_ #define LIGHTGBM_TREELEARNET_CUDA_TREE_LEARNER_H_ -#include -#include -#include +#include +#include +#include +#include +#include + +#include #include +#include #include +#include +#include #ifdef USE_CUDA #include #endif -#include -#include -#include -#include -#include #include "feature_histogram.hpp" #include "serial_tree_learner.h" #include "data_partition.hpp" @@ -28,7 +30,7 @@ #ifdef USE_CUDA #include -#include "cuda_kernel_launcher.h" // LGBM_CUDA +#include "cuda_kernel_launcher.h" using json11::Json; @@ -75,24 +77,24 @@ class CUDATreeLearner: public SerialTreeLearner { /*! * \brief Initialize GPU device - * \LGBM_CUDA: param num_gpu: number of maximum gpus + * \param num_gpu: number of maximum gpus */ void InitGPU(); /*! - * \brief Allocate memory for GPU computation // LGBM_CUDA: alloc only + * \brief Allocate memory for GPU computation // alloc only */ void CountDenseFeatureGroups(); // compute num_dense_feature_group void prevAllocateGPUMemory(); // compute CPU-side param calculation & Pin HostMemory void AllocateGPUMemory(); /*! - * \ LGBM_CUDA: ResetGPUMemory + * \ ResetGPUMemory */ void ResetGPUMemory(); /*! - * \ LGBM_CUDA: copy dense feature from CPU to GPU + * \ copy dense feature from CPU to GPU */ void copyDenseFeature(); @@ -160,7 +162,6 @@ class CUDATreeLearner: public SerialTreeLearner { * Set hessians to nullptr to skip copy to GPU. * \return true if GPU kernel is launched, false if GPU is not used */ - // LGBM_CUDA v5.2 bool ConstructGPUHistogramsAsync( const std::vector& is_feature_used, const data_size_t* data_indices, data_size_t num_data); @@ -181,8 +182,8 @@ class CUDATreeLearner: public SerialTreeLearner { int num_feature_groups_; /*! \brief total number of dense feature-groups, which will be processed on GPU */ int num_dense_feature_groups_; - std::vector num_gpu_feature_groups_; // LGBM_CUDA - std::vector offset_gpu_feature_groups_; // LGBM_CUDA + std::vector num_gpu_feature_groups_; + std::vector offset_gpu_feature_groups_; /*! \brief On GPU we read one DWORD (4-byte) of features of one example once. * With bin size > 16, there are 4 features per DWORD. * With bin size <=16, there are 8 features per DWORD. @@ -203,66 +204,48 @@ class CUDATreeLearner: public SerialTreeLearner { std::vector dense_feature_group_map_; /*! \brief Indices of all sparse feature-groups */ std::vector sparse_feature_group_map_; - /*! \brief Multipliers of all dense feature-groups, used for redistributing bins */ - // std::vector device_bin_mults_; /*! \brief GPU memory object holding the training data */ - // uint8_t *device_features_; std::vector device_features_; /*! \brief GPU memory object holding the ordered gradient */ - // score_t *device_gradients_; std::vector device_gradients_; /*! \brief Pointer to pinned memory of ordered gradient */ void * ptr_pinned_gradients_ = nullptr; /*! \brief GPU memory object holding the ordered hessian */ - // score_t *device_hessians_; std::vector device_hessians_; /*! \brief Pointer to pinned memory of ordered hessian */ void * ptr_pinned_hessians_ = nullptr; /*! 
\brief A vector of feature mask. 1 = feature used, 0 = feature not used */ - // std::vector> feature_masks_; std::vector feature_masks_; /*! \brief GPU memory object holding the feature masks */ - // void *device_feature_masks_; std::vector device_feature_masks_; /*! \brief Pointer to pinned memory of feature masks */ char* ptr_pinned_feature_masks_ = nullptr; /*! \brief GPU memory object holding indices of the leaf being processed */ - // data_size_t *device_data_indices_; std::vector device_data_indices_; /*! \brief GPU memory object holding counters for workgroup coordination */ - // int *sync_counters_; std::vector sync_counters_; /*! \brief GPU memory object holding temporary sub-histograms per workgroup */ - // char *device_subhistograms_; std::vector device_subhistograms_; /*! \brief Host memory object for histogram output (GPU will write to Host memory directly) */ - // void *device_histogram_outputs_; std::vector device_histogram_outputs_; /*! \brief Host memory pointer for histogram outputs */ void *host_histogram_outputs_; - /*! \LGBM_CUDA: CUDA waitlist object for waiting for data transfer before kernel execution */ + /*! CUDA waitlist object for waiting for data transfer before kernel execution */ // cudaEvent_t kernel_wait_obj_; std::vector kernel_wait_obj_; - /*! \LGBM_CUDA: CUDA waitlist object for reading output histograms after kernel execution */ - // cudaEvent_t histograms_wait_obj_; + /*! CUDA waitlist object for reading output histograms after kernel execution */ std::vector histograms_wait_obj_; - /*! \LGBM_CUDA: CUDA Asynchronous waiting object for copying indices */ - // cudaEvent_t indices_future_; + /*! CUDA Asynchronous waiting object for copying indices */ std::vector indices_future_; - /*! \LGBM_CUDA: Asynchronous waiting object for copying gradients */ - // cudaEvent_t gradients_future_; + /*! Asynchronous waiting object for copying gradients */ std::vector gradients_future_; - /*! \LGBM_CUDA: Asynchronous waiting object for copying hessians */ - // cudaEvent_t hessians_future_; + /*! Asynchronous waiting object for copying hessians */ std::vector hessians_future_; - // LGBM_CUDA:\brief Asynchronous waiting object for copying dense features - // cudaEvent_t features_future_; + /*! 
Asynchronous waiting object for copying dense features */ std::vector features_future_; - // LGBM_CUDA: host-side buffer for converting feature data into featre4 data - // std::vector host_vecs_; + // host-side buffer for converting feature data into featre4 data int nthreads_; // number of Feature4* vector on host4_vecs_ - // cudaEvent_t kernel_start_; // event for kernel start std::vector kernel_start_; std::vector kernel_time_; // measure histogram kernel time std::vector> kernel_input_wait_time_; diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index 5c7cfdb4a9e..7ee72e5cda2 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -27,8 +27,7 @@ inline __device__ void atomic_local_add_f(acc_type *addr, const acc_type val) { #ifdef IGNORE_INDICES #define KERNEL_NAME histogram16_fulldata #else // IGNORE_INDICES -#define KERNEL_NAME histogram16 // seems like ENABLE_ALL_FEATURES is set to 1 in the header if its disabled -// #define KERNEL_NAME histogram16_allfeats +#define KERNEL_NAME histogram16 #endif // IGNORE_INDICES #else // ENABLE_ALL_FEATURES #error "ENABLE_ALL_FEATURES should always be 1" @@ -132,7 +131,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, } __syncthreads(); // gradient/hessian histograms - // assume this starts at 32 * 4 = 128-byte boundary // LGBM_CUDA: What does it mean? boundary?? + // assume this starts at 32 * 4 = 128-byte boundary // What does it mean? boundary?? // total size: 2 * 256 * size_of(float) = 2 KB // organization: each feature/grad/hessian is at a different bank, // as indepedent of the feature value as possible @@ -174,7 +173,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, if (!feature_mask) { return; } else { - feature_mask = feature_mask - 1; // LGBM_CUDA: feature_mask is used for get feature (1: 4bit feature, 0: 8bit feature) + feature_mask = feature_mask - 1; // feature_mask is used for get feature (1: 4bit feature, 0: 8bit feature) } // STAGE 1: read feature data, and gradient and hessian @@ -182,7 +181,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // We will prefetch data into the "next" variable at the beginning of each iteration uchar feature; uchar feature_next; - // uint8_t bin; uint16_t bin; feature = feature_data[ind >> feature_mask]; @@ -196,7 +194,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // store gradient and hessian score_t grad, hess; score_t grad_next, hess_next; - // LGBM_CUDA v5.1 grad = ordered_gradients[ind]; #if CONST_HESSIAN == 0 hess = ordered_hessians[ind]; @@ -214,7 +211,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, ind_next = data_indices[i_next]; #endif - // imbGBT v5.1 grad_next = ordered_gradients[ind_next]; #if CONST_HESSIAN == 0 hess_next = ordered_hessians[ind_next]; @@ -252,7 +248,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // STAGE 4: update next stat grad = grad_next; hess = hess_next; - // LGBM_CUDA: v4.2 if (!feature_mask) { feature = feature_next; } else { @@ -278,7 +273,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #if CONST_HESSIAN == 1 // make a final reduction gh_hist[ltid * 2] += gh_hist[ltid * 2 + 1]; - gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // LGBM_CUDA: counter move to this position + gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // counter move to this position __syncthreads(); #endif @@ -308,7 +303,7 @@ __global__ void 
KERNEL_NAME(const uchar* feature_data_base, } // make sure everyone in this workgroup is here __syncthreads(); - // everyone in this wrokgroup: if we are the last workgroup, then do reduction! + // everyone in this workgroup: if we are the last workgroup, then do reduction! if (*counter_val == (1 << power_feature_workgroups) - 1) { if (ltid == 0) { sync_counters[feature_id] = 0; @@ -318,7 +313,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, } // only 1 work group, no need to increase counter // the reduction will become a simple copy - if (1) { + { unsigned int old_val; // dummy #endif // locate our feature's block in output memory @@ -450,7 +445,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, } __syncthreads(); // gradient/hessian histograms - // assume this starts at 32 * 4 = 128-byte boundary // LGBM_CUDA: What does it mean? boundary?? + // assume this starts at 32 * 4 = 128-byte boundary // What does it mean? boundary?? // total size: 2 * 256 * size_of(float) = 2 KB // organization: each feature/grad/hessian is at a different bank, // as indepedent of the feature value as possible @@ -491,7 +486,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, if (!feature_mask) { return; } else { - feature_mask = feature_mask - 1; // LGBM_CUDA: feature_mask is used for get feature (1: 4bit feature, 0: 8bit feature) + feature_mask = feature_mask - 1; // feature_mask is used for get feature (1: 4bit feature, 0: 8bit feature) } // STAGE 1: read feature data, and gradient and hessian @@ -499,7 +494,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // We will prefetch data into the "next" variable at the beginning of each iteration uchar feature; uchar feature_next; - // uint8_t bin; uint16_t bin; feature = feature_data[ind >> feature_mask]; @@ -513,7 +507,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // store gradient and hessian score_t grad, hess; score_t grad_next, hess_next; - // LGBM_CUDA v5.1 grad = ordered_gradients[ind]; #if CONST_HESSIAN == 0 hess = ordered_hessians[ind]; @@ -531,7 +524,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, ind_next = data_indices[i_next]; #endif - // imbGBT v5.1 grad_next = ordered_gradients[ind_next]; #if CONST_HESSIAN == 0 hess_next = ordered_hessians[ind_next]; @@ -569,7 +561,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // STAGE 4: update next stat grad = grad_next; hess = hess_next; - // LGBM_CUDA: v4.2 if (!feature_mask) { feature = feature_next; } else { @@ -594,7 +585,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #if CONST_HESSIAN == 1 // make a final reduction gh_hist[ltid * 2] += gh_hist[ltid * 2 + 1]; - gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // LGBM_CUDA: counter move to this position + gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // counter move to this position __syncthreads(); #endif @@ -624,7 +615,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, } // make sure everyone in this workgroup is here __syncthreads(); - // everyone in this wrokgroup: if we are the last workgroup, then do reduction! + // everyone in this workgroup: if we are the last workgroup, then do reduction! 
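     // note: counter_val reflects how many workgroups for this feature have already finished; the workgroup
     // that observes (1 << power_feature_workgroups) - 1 is the last one and performs the cross-workgroup
     // reduction below before resetting sync_counters[feature_id]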
if (*counter_val == (1 << power_feature_workgroups) - 1) { if (ltid == 0) { sync_counters[feature_id] = 0; @@ -634,7 +625,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, } // only 1 work group, no need to increase counter // the reduction will become a simple copy - if (1) { + { unsigned int old_val; // dummy #endif // locate our feature's block in output memory @@ -767,7 +758,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, } __syncthreads(); // gradient/hessian histograms - // assume this starts at 32 * 4 = 128-byte boundary // LGBM_CUDA: What does it mean? boundary?? + // assume this starts at 32 * 4 = 128-byte boundary // What does it mean? boundary?? // total size: 2 * 256 * size_of(float) = 2 KB // organization: each feature/grad/hessian is at a different bank, // as indepedent of the feature value as possible @@ -808,7 +799,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, if (!feature_mask) { return; } else { - feature_mask = feature_mask - 1; // LGBM_CUDA: feature_mask is used for get feature (1: 4bit feature, 0: 8bit feature) + feature_mask = feature_mask - 1; // feature_mask is used for get feature (1: 4bit feature, 0: 8bit feature) } // STAGE 1: read feature data, and gradient and hessian @@ -816,7 +807,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // We will prefetch data into the "next" variable at the beginning of each iteration uchar feature; uchar feature_next; - // uint8_t bin; uint16_t bin; feature = feature_data[ind >> feature_mask]; @@ -830,7 +820,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // store gradient and hessian score_t grad, hess; score_t grad_next, hess_next; - // LGBM_CUDA v5.1 grad = ordered_gradients[ind]; #if CONST_HESSIAN == 0 hess = ordered_hessians[ind]; @@ -848,7 +837,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, ind_next = data_indices[i_next]; #endif - // imbGBT v5.1 grad_next = ordered_gradients[ind_next]; #if CONST_HESSIAN == 0 hess_next = ordered_hessians[ind_next]; @@ -885,7 +873,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // STAGE 4: update next stat grad = grad_next; hess = hess_next; - // LGBM_CUDA: v4.2 if (!feature_mask) { feature = feature_next; } else { @@ -911,7 +898,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #if CONST_HESSIAN == 1 // make a final reduction gh_hist[ltid * 2] += gh_hist[ltid * 2 + 1]; - gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // LGBM_CUDA: counter move to this position + gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // counter move to this position __syncthreads(); #endif @@ -941,7 +928,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, } // make sure everyone in this workgroup is here __syncthreads(); - // everyone in this wrokgroup: if we are the last workgroup, then do reduction! + // everyone in this workgroup: if we are the last workgroup, then do reduction! 
if (*counter_val == (1 << power_feature_workgroups) - 1) { if (ltid == 0) { sync_counters[feature_id] = 0; @@ -951,7 +938,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, } // only 1 work group, no need to increase counter // the reduction will become a simple copy - if (1) { + { unsigned int old_val; // dummy #endif // locate our feature's block in output memory diff --git a/src/treelearner/parallel_tree_learner.h b/src/treelearner/parallel_tree_learner.h index 4bb62d203a9..2001f2e0dfe 100644 --- a/src/treelearner/parallel_tree_learner.h +++ b/src/treelearner/parallel_tree_learner.h @@ -12,9 +12,9 @@ #include #include +#include "cuda_tree_learner.h" #include "gpu_tree_learner.h" #include "serial_tree_learner.h" -#include "cuda_tree_learner.h" namespace LightGBM { diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index fc1de33e365..59ba770fb95 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -8,10 +8,10 @@ #include #include #include +#include #include #include #include -#include #include #include diff --git a/src/treelearner/tree_learner.cpp b/src/treelearner/tree_learner.cpp index 63ca1b2de83..ab009a0b100 100644 --- a/src/treelearner/tree_learner.cpp +++ b/src/treelearner/tree_learner.cpp @@ -4,8 +4,8 @@ */ #include -#include "gpu_tree_learner.h" #include "cuda_tree_learner.h" +#include "gpu_tree_learner.h" #include "parallel_tree_learner.h" #include "serial_tree_learner.h" From a751bea68c3b40fedef3397f4e6942c01679744a Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Thu, 27 Aug 2020 12:32:39 +0000 Subject: [PATCH 114/119] Replace warning with Log message. Removal of some of the USE_CUDA. Fix typo and removal of pragma once. --- include/LightGBM/cuda/vector_cudahost.h | 6 +++--- src/application/application.cpp | 2 -- src/boosting/gbdt.cpp | 4 ---- src/io/config.cpp | 2 -- src/io/dataset.cpp | 2 -- src/treelearner/cuda_tree_learner.h | 3 +-- 6 files changed, 4 insertions(+), 15 deletions(-) diff --git a/include/LightGBM/cuda/vector_cudahost.h b/include/LightGBM/cuda/vector_cudahost.h index 60a82cc8391..0f2cf8081e5 100644 --- a/include/LightGBM/cuda/vector_cudahost.h +++ b/include/LightGBM/cuda/vector_cudahost.h @@ -33,8 +33,6 @@ class LGBM_config_ { static int current_learner; // Default: use_cpu_learner }; -} // namespace LightGBM - template struct CHAllocator { @@ -48,7 +46,7 @@ struct CHAllocator { if (LightGBM::LGBM_config_::current_device == lgbm_device_cuda) { cudaError_t ret = cudaHostAlloc(&ptr, n*sizeof(T), cudaHostAllocPortable); if (ret != cudaSuccess) { - fprintf(stderr, " TROUBLE: defaulting to malloc in CHAllocator!!!\n"); fflush(stderr); + Log::Warning("Defaulting to malloc in CHAllocator!!!"); ptr = reinterpret_cast(_mm_malloc(n*sizeof(T), 16)); } } else { @@ -83,4 +81,6 @@ bool operator==(const CHAllocator&, const CHAllocator&); template bool operator!=(const CHAllocator&, const CHAllocator&); +} // namespace LightGBM + #endif // LIGHTGBM_CUDA_VECTOR_CUDAHOST_H_ diff --git a/src/application/application.cpp b/src/application/application.cpp index c62cdd711e0..43ba033881e 100644 --- a/src/application/application.cpp +++ b/src/application/application.cpp @@ -40,11 +40,9 @@ Application::Application(int argc, char** argv) { Log::Fatal("No training/prediction data, application quit"); } -#ifdef USE_CUDA if (config_.device_type == std::string("cuda")) { LightGBM::LGBM_config_::current_device = lgbm_device_cuda; } -#endif } Application::~Application() { diff --git 
a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 2c9fb3b734e..fcb7185a151 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -17,10 +17,8 @@ namespace LightGBM { -#ifdef USE_CUDA int LGBM_config_::current_device = lgbm_device_cpu; int LGBM_config_::current_learner = use_cpu_learner; -#endif GBDT::GBDT() : iter_(0), @@ -63,11 +61,9 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective es_first_metric_only_ = config_->first_metric_only; shrinkage_rate_ = config_->learning_rate; -#ifdef USE_CUDA if (config_->device_type == std::string("cuda")) { LGBM_config_::current_learner = use_cuda_learner; } -#endif // load forced_splits file if (!config->forcedsplits_filename.empty()) { diff --git a/src/io/config.cpp b/src/io/config.cpp index f0a9544e3b9..6e1872d0e76 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -209,11 +209,9 @@ void Config::Set(const std::unordered_map& params) { GetMetricType(params, &metric); GetObjectiveType(params, &objective); GetDeviceType(params, &device_type); -#ifdef USE_CUDA if (device_type == std::string("cuda")) { LightGBM::LGBM_config_::current_device = lgbm_device_cuda; } -#endif GetTreeLearnerType(params, &tree_learner); GetMembersFromString(params); diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 2d9693bc695..90ba6a0eb58 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -337,7 +337,6 @@ void Dataset::Construct(std::vector>* bin_mappers, auto features_in_group = NoGroup(used_features); auto is_sparse = io_config.is_enable_sparse; -#ifdef USE_CUDA if (io_config.device_type == std::string("cuda")) { LightGBM::LGBM_config_::current_device = lgbm_device_cuda; if (is_sparse) { @@ -345,7 +344,6 @@ void Dataset::Construct(std::vector>* bin_mappers, } is_sparse = false; } -#endif std::vector group_is_multi_val(used_features.size(), 0); if (io_config.enable_bundle && !used_features.empty()) { diff --git a/src/treelearner/cuda_tree_learner.h b/src/treelearner/cuda_tree_learner.h index 009e2471bb4..b1fae65f5d1 100644 --- a/src/treelearner/cuda_tree_learner.h +++ b/src/treelearner/cuda_tree_learner.h @@ -2,9 +2,8 @@ * Copyright (c) 2020 IBM Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ -#pragma once #ifndef LIGHTGBM_TREELEARNER_CUDA_TREE_LEARNER_H_ -#define LIGHTGBM_TREELEARNET_CUDA_TREE_LEARNER_H_ +#define LIGHTGBM_TREELEARNER_CUDA_TREE_LEARNER_H_ #include #include From 15eec67a3024786a59fbe6fea29c9283314aa81f Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Thu, 27 Aug 2020 13:03:13 +0000 Subject: [PATCH 115/119] Remove PRINT debug for CUDA code. --- src/treelearner/kernels/histogram_16_64_256.cu | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index 7ee72e5cda2..ccb399f4ecb 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -12,11 +12,6 @@ namespace LightGBM { -#define PRINT(b, t, fmt, ...) \ -if (b == gtid && t == ltid) { \ - printf(fmt, __VA_ARGS__); \ -} - // atomic add for float number in local memory inline __device__ void atomic_local_add_f(acc_type *addr, const acc_type val) { atomicAdd(addr, static_cast(val)); From 1884dc20d31fbacf263aba3d6002d7d0f1974d0f Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 31 Aug 2020 11:41:48 +0000 Subject: [PATCH 116/119] Allow to use of multiple GPUs for CUDA. 
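
This drops the hard cap that forced num_gpu_ back to a single device. The per-device split of dense
feature groups is already handled in prevAllocateGPUMemory(); roughly (a sketch of the existing logic,
not new code):

    int num_features_per_gpu = num_dense_feature_groups_ / num_gpu_;
    // the first (num_dense_feature_groups_ % num_gpu_) devices each take one extra feature group;
    // each device is then selected with cudaSetDevice(device_id) before its allocations and copies

With the cap removed, that partitioning can spread histogram construction across every visible device
reported by cudaGetDeviceCount().
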
--- src/treelearner/cuda_tree_learner.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 813f99d4ba8..12aa722e1c9 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -481,10 +481,6 @@ void CUDATreeLearner::InitGPU() { // initialize GPU CUDASUCCESS_OR_FATAL(cudaGetDeviceCount(&num_gpu_)); - if (num_gpu_ > 1) { - Log::Warning("CUDA doesn't support more than one GPU currently."); - num_gpu_ = 1; - } if (num_gpu_ > num_dense_feature_groups_) num_gpu_ = num_dense_feature_groups_; // set cpu threads From 32f3a8dcea5efcc9e009918db1d5c75392e254ae Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Wed, 2 Sep 2020 13:52:41 +0000 Subject: [PATCH 117/119] More multi-GPUs enablement for CUDA. --- docs/Parameters.rst | 4 ++++ include/LightGBM/config.h | 4 ++++ src/io/config_auto.cpp | 5 +++++ src/treelearner/cuda_tree_learner.cpp | 12 ++++++++---- src/treelearner/cuda_tree_learner.h | 2 +- 5 files changed, 22 insertions(+), 5 deletions(-) diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 14d7a8098cf..4cbfed49104 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -1122,6 +1122,10 @@ GPU Parameters - set this to ``true`` to use double precision math on GPU (by default single precision is used) +- ``num_gpu`` :raw-html:`🔗︎`, default = ``1``, type = int, constraints: ``num_gpu > 0`` + + - number of gpus (CUDA implementation only) + .. end params list Others diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index bfcb09a4004..25447abaf1a 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -968,6 +968,10 @@ struct Config { // desc = set this to ``true`` to use double precision math on GPU (by default single precision is used) bool gpu_use_dp = false; + // check = >0 + // desc = number of gpus (CUDA implementation only) + int num_gpu = 1; + #pragma endregion #pragma endregion diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index b14af67fd30..ad102020322 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -296,6 +296,7 @@ const std::unordered_set& Config::parameter_set() { "gpu_platform_id", "gpu_device_id", "gpu_use_dp", + "num_gpu", }); return params; } @@ -611,6 +612,9 @@ void Config::GetMembersFromString(const std::unordered_mapnum_feature_groups(); // Initialize GPU buffers and kernels: get device info - InitGPU(); + InitGPU(config_->num_gpu); } // some functions used for debugging the GPU histogram construction @@ -433,7 +433,7 @@ void CUDATreeLearner::copyDenseFeature() { // InitGPU w/ num_gpu -void CUDATreeLearner::InitGPU() { +void CUDATreeLearner::InitGPU(int num_gpu) { // Get the max bin size, used for selecting best GPU kernel max_num_bin_ = 0; @@ -479,9 +479,13 @@ void CUDATreeLearner::InitGPU() { // get num_dense_feature_groups_ CountDenseFeatureGroups(); + if (num_gpu > num_dense_feature_groups_) num_gpu = num_dense_feature_groups_; + // initialize GPU - CUDASUCCESS_OR_FATAL(cudaGetDeviceCount(&num_gpu_)); - if (num_gpu_ > num_dense_feature_groups_) num_gpu_ = num_dense_feature_groups_; + int gpu_count; + + CUDASUCCESS_OR_FATAL(cudaGetDeviceCount(&gpu_count)); + num_gpu_ = (gpu_count < num_gpu)? 
gpu_count : num_gpu; // set cpu threads cpu_threads_ = reinterpret_cast(_mm_malloc(sizeof(pthread_t *)*num_gpu_, 16)); diff --git a/src/treelearner/cuda_tree_learner.h b/src/treelearner/cuda_tree_learner.h index b1fae65f5d1..17c7d13d8a0 100644 --- a/src/treelearner/cuda_tree_learner.h +++ b/src/treelearner/cuda_tree_learner.h @@ -78,7 +78,7 @@ class CUDATreeLearner: public SerialTreeLearner { * \brief Initialize GPU device * \param num_gpu: number of maximum gpus */ - void InitGPU(); + void InitGPU(int num_gpu); /*! * \brief Allocate memory for GPU computation // alloc only From ea537f88e1c2f0f3245c8fc31e34ae62e2621c7e Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 14 Sep 2020 12:39:58 +0000 Subject: [PATCH 118/119] More code cleanup based on reviews comments. --- CMakeLists.txt | 4 ++-- include/LightGBM/config.h | 5 +++-- include/LightGBM/cuda/cuda_utils.h | 2 +- include/LightGBM/cuda/vector_cudahost.h | 4 ++-- src/application/application.cpp | 2 +- src/io/config.cpp | 2 +- src/io/dataset.cpp | 2 +- src/treelearner/cuda_tree_learner.cpp | 4 ++-- src/treelearner/cuda_tree_learner.h | 10 ---------- src/treelearner/kernels/histogram_16_64_256.cu | 6 ------ 10 files changed, 13 insertions(+), 28 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 79870e8c54d..b2e206fe5fd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -168,8 +168,8 @@ if(USE_CUDA) -DIGNORE_INDICES ) - message(STATUS ALLFEATS_DEFINES: ${ALLFEATS_DEFINES}) - message(STATUS FULLDATA_DEFINES: ${FULLDATA_DEFINES}) + message(STATUS "ALLFEATS_DEFINES: ${ALLFEATS_DEFINES}") + message(STATUS "FULLDATA_DEFINES: ${FULLDATA_DEFINES}") function(add_histogram hsize hname hadd hconst hdir) add_library(histo${hsize}${hname} OBJECT src/treelearner/kernels/histogram${hsize}.cu) diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 25447abaf1a..5e190261390 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -965,11 +965,12 @@ struct Config { // desc = **Note**: refer to `GPU Targets <./GPU-Targets.rst#query-opencl-devices-in-your-system>`__ for more details int gpu_device_id = -1; - // desc = set this to ``true`` to use double precision math on GPU (by default single precision is used) + // desc = set this to ``true`` to use double precision math on GPU (by default single precision is used in OpenCL implementation and double precision is used in CUDA implementation) bool gpu_use_dp = false; // check = >0 - // desc = number of gpus (CUDA implementation only) + // desc = number of GPUs + // desc = **Note**: can be used only in CUDA implementation int num_gpu = 1; #pragma endregion diff --git a/include/LightGBM/cuda/cuda_utils.h b/include/LightGBM/cuda/cuda_utils.h index b94b12d1c92..1054e09daf1 100644 --- a/include/LightGBM/cuda/cuda_utils.h +++ b/include/LightGBM/cuda/cuda_utils.h @@ -14,7 +14,7 @@ #define CUDASUCCESS_OR_FATAL(ans) { gpuAssert((ans), __FILE__, __LINE__); } inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) { if (code != cudaSuccess) { - LightGBM::Log::Fatal("CUDA_RUNTIME: %s %s %d\n", cudaGetErrorString(code), file, line); + LightGBM::Log::Fatal("[CUDA] %s %s %d\n", cudaGetErrorString(code), file, line); if (abort) exit(code); } } diff --git a/include/LightGBM/cuda/vector_cudahost.h b/include/LightGBM/cuda/vector_cudahost.h index 0f2cf8081e5..f81cc4dd905 100644 --- a/include/LightGBM/cuda/vector_cudahost.h +++ b/include/LightGBM/cuda/vector_cudahost.h @@ -43,7 +43,7 @@ struct CHAllocator { T* ptr; if (n == 0) return NULL; #ifdef 
USE_CUDA - if (LightGBM::LGBM_config_::current_device == lgbm_device_cuda) { + if (LGBM_config_::current_device == lgbm_device_cuda) { cudaError_t ret = cudaHostAlloc(&ptr, n*sizeof(T), cudaHostAllocPortable); if (ret != cudaSuccess) { Log::Warning("Defaulting to malloc in CHAllocator!!!"); @@ -62,7 +62,7 @@ struct CHAllocator { (void)n; // UNUSED if (p == NULL) return; #ifdef USE_CUDA - if (LightGBM::LGBM_config_::current_device == lgbm_device_cuda) { + if (LGBM_config_::current_device == lgbm_device_cuda) { cudaPointerAttributes attributes; cudaPointerGetAttributes(&attributes, p); if ((attributes.type == cudaMemoryTypeHost) && (attributes.devicePointer != NULL)) { diff --git a/src/application/application.cpp b/src/application/application.cpp index 43ba033881e..d9be76d67c9 100644 --- a/src/application/application.cpp +++ b/src/application/application.cpp @@ -41,7 +41,7 @@ Application::Application(int argc, char** argv) { } if (config_.device_type == std::string("cuda")) { - LightGBM::LGBM_config_::current_device = lgbm_device_cuda; + LGBM_config_::current_device = lgbm_device_cuda; } } diff --git a/src/io/config.cpp b/src/io/config.cpp index 6e1872d0e76..6878896deb5 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -210,7 +210,7 @@ void Config::Set(const std::unordered_map& params) { GetObjectiveType(params, &objective); GetDeviceType(params, &device_type); if (device_type == std::string("cuda")) { - LightGBM::LGBM_config_::current_device = lgbm_device_cuda; + LGBM_config_::current_device = lgbm_device_cuda; } GetTreeLearnerType(params, &tree_learner); diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 90ba6a0eb58..fce7cfa2bb2 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -338,7 +338,7 @@ void Dataset::Construct(std::vector>* bin_mappers, auto is_sparse = io_config.is_enable_sparse; if (io_config.device_type == std::string("cuda")) { - LightGBM::LGBM_config_::current_device = lgbm_device_cuda; + LGBM_config_::current_device = lgbm_device_cuda; if (is_sparse) { Log::Warning("Using sparse features with CUDA is currently not supported."); } diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 314494f4ef3..16569eef257 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -309,7 +309,7 @@ void CUDATreeLearner::prevAllocateGPUMemory() { // histogram bin entry size depends on the precision (single/double) hist_bin_entry_sz_ = 2 * (config_->gpu_use_dp ? sizeof(hist_t) : sizeof(gpu_hist_t)); // two elements in this "size" - CUDASUCCESS_OR_FATAL(cudaHostAlloc(reinterpret_cast(&host_histogram_outputs_), (size_t)(num_dense_feature_groups_ * device_bin_size_ * hist_bin_entry_sz_), cudaHostAllocPortable)); + CUDASUCCESS_OR_FATAL(cudaHostAlloc(reinterpret_cast(&host_histogram_outputs_), static_cast(num_dense_feature_groups_ * device_bin_size_ * hist_bin_entry_sz_), cudaHostAllocPortable)); nthreads_ = std::min(omp_get_max_threads(), num_dense_feature_groups_ / dword_features_); nthreads_ = std::max(nthreads_, 1); @@ -485,7 +485,7 @@ void CUDATreeLearner::InitGPU(int num_gpu) { int gpu_count; CUDASUCCESS_OR_FATAL(cudaGetDeviceCount(&gpu_count)); - num_gpu_ = (gpu_count < num_gpu)? gpu_count : num_gpu; + num_gpu_ = (gpu_count < num_gpu) ? 
diff --git a/src/treelearner/cuda_tree_learner.h b/src/treelearner/cuda_tree_learner.h
index 17c7d13d8a0..442c2f53ea0 100644
--- a/src/treelearner/cuda_tree_learner.h
+++ b/src/treelearner/cuda_tree_learner.h
@@ -153,12 +153,6 @@ class CUDATreeLearner: public SerialTreeLearner {
    * \param data_indices Array of data example IDs to be included in histogram, will be copied to GPU.
    *        Set to nullptr to skip copy to GPU.
    * \param num_data Number of data examples to be included in histogram
-   * \param gradients Array of gradients for all examples.
-   * \param hessians Array of hessians for all examples.
-   * \param ordered_gradients Ordered gradients will be generated and copied to GPU when gradients is not nullptr,
-   *        Set gradients to nullptr to skip copy to GPU.
-   * \param ordered_hessians Ordered hessians will be generated and copied to GPU when hessians is not nullptr,
-   *        Set hessians to nullptr to skip copy to GPU.
    * \return true if GPU kernel is launched, false if GPU is not used
    */
   bool ConstructGPUHistogramsAsync(
@@ -188,9 +182,6 @@ class CUDATreeLearner: public SerialTreeLearner {
    *  With bin size <=16, there are 8 features per DWORD.
    */
   int dword_features_;
-  /*! \brief total number of dense feature-group tuples on GPU.
-   *  Each feature tuple is 4-byte (4 features if each feature takes a byte) */
-  // int num_dense_feature4_;
   /*! \brief Max number of bins of training data, used to determine
    *  which GPU kernel to use */
   int max_num_bin_;
@@ -230,7 +221,6 @@ class CUDATreeLearner: public SerialTreeLearner {
   /*! \brief Host memory pointer for histogram outputs */
   void *host_histogram_outputs_;
   /*! CUDA waitlist object for waiting for data transfer before kernel execution */
-  // cudaEvent_t kernel_wait_obj_;
   std::vector kernel_wait_obj_;
   /*! CUDA waitlist object for reading output histograms after kernel execution */
   std::vector histograms_wait_obj_;
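
The kernel_wait_obj_ and histograms_wait_obj_ members kept above are, per their comments, per-device CUDA event lists: one set gates the histogram kernel on the input transfer, the other gates the read-back on kernel completion. A rough sketch of that ordering pattern with plain CUDA events and two streams follows; every name in it is a placeholder rather than a LightGBM symbol, and the kernel launch itself is elided.

    #include <cstddef>
    #include <cuda_runtime.h>

    // Order an async H2D copy, a kernel, and an async D2H read-back with events.
    void OrderCopyKernelReadback(cudaStream_t copy_stream, cudaStream_t exec_stream,
                                 void* d_in, const void* h_in, std::size_t in_bytes,
                                 void* h_out, const void* d_out, std::size_t out_bytes) {
      cudaEvent_t input_ready, kernel_done;
      cudaEventCreateWithFlags(&input_ready, cudaEventDisableTiming);
      cudaEventCreateWithFlags(&kernel_done, cudaEventDisableTiming);

      // Stage the input asynchronously and mark the moment it becomes visible.
      cudaMemcpyAsync(d_in, h_in, in_bytes, cudaMemcpyHostToDevice, copy_stream);
      cudaEventRecord(input_ready, copy_stream);

      // The execution stream waits for the transfer before the kernel runs.
      cudaStreamWaitEvent(exec_stream, input_ready, 0);
      // histogram_kernel<<<grid, block, 0, exec_stream>>>(...);  // elided

      // Read the histograms back only after the kernel has finished.
      cudaEventRecord(kernel_done, exec_stream);
      cudaStreamWaitEvent(copy_stream, kernel_done, 0);
      cudaMemcpyAsync(h_out, d_out, out_bytes, cudaMemcpyDeviceToHost, copy_stream);

      cudaEventDestroy(input_ready);
      cudaEventDestroy(kernel_done);
    }
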
diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu
index ccb399f4ecb..105ccbb6203 100644
--- a/src/treelearner/kernels/histogram_16_64_256.cu
+++ b/src/treelearner/kernels/histogram_16_64_256.cu
@@ -303,7 +303,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base,
       if (ltid == 0) {
         sync_counters[feature_id] = 0;
       }
-      // }
     #else
     }
     // only 1 work group, no need to increase counter
@@ -316,7 +315,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base,
   acc_type const * __restrict__ feature_subhists =
            reinterpret_cast(output_buf) + output_offset * 3 * NUM_BINS;
   // skip reading the data already in local memory
-  // unsigned int skip_id = feature_id ^ output_offset;
   unsigned int skip_id = group_id - output_offset;
   // locate output histogram location for this feature4
   acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS;
@@ -615,7 +613,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base,
       if (ltid == 0) {
         sync_counters[feature_id] = 0;
       }
-      // }
     #else
     }
     // only 1 work group, no need to increase counter
@@ -628,7 +625,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base,
   acc_type const * __restrict__ feature_subhists =
            reinterpret_cast(output_buf) + output_offset * 3 * NUM_BINS;
   // skip reading the data already in local memory
-  // unsigned int skip_id = feature_id ^ output_offset;
   unsigned int skip_id = group_id - output_offset;
   // locate output histogram location for this feature4
   acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS;
@@ -928,7 +924,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base,
       if (ltid == 0) {
         sync_counters[feature_id] = 0;
       }
-      // }
     #else
     }
     // only 1 work group, no need to increase counter
@@ -941,7 +936,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base,
   acc_type const * __restrict__ feature_subhists =
            reinterpret_cast(output_buf) + output_offset * 3 * NUM_BINS;
   // skip reading the data already in local memory
-  // unsigned int skip_id = feature_id ^ output_offset;
   unsigned int skip_id = group_id - output_offset;
   // locate output histogram location for this feature4
   acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS;

From d9e9d2e0df9f75aed79af0b88bf24a9300479dd6 Mon Sep 17 00:00:00 2001
From: Chip-Kerchner
Date: Mon, 14 Sep 2020 13:54:26 +0000
Subject: [PATCH 119/119] Update docs with latest config changes.

---
 docs/Parameters.rst | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/docs/Parameters.rst b/docs/Parameters.rst
index 4cbfed49104..dcd1353e152 100644
--- a/docs/Parameters.rst
+++ b/docs/Parameters.rst
@@ -1120,11 +1120,13 @@ GPU Parameters
 
 -  ``gpu_use_dp`` :raw-html:`🔗︎`, default = ``false``, type = bool
 
-   - set this to ``true`` to use double precision math on GPU (by default single precision is used)
+   - set this to ``true`` to use double precision math on GPU (by default single precision is used in OpenCL implementation and double precision is used in CUDA implementation)
 
 -  ``num_gpu`` :raw-html:`🔗︎`, default = ``1``, type = int, constraints: ``num_gpu > 0``
 
-   - number of gpus (CUDA implementation only)
+   - number of GPUs
+
+   - **Note**: can be used only in CUDA implementation
 
 .. end params list
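
Putting the documented parameters together: assuming a binary built with the new USE_CUDA CMake option, a CLI training config exercising the CUDA tree learner might look like the sketch below. The task, objective, and data values are placeholders for illustration, not part of this patch.

    # sketch of a LightGBM CLI config using the CUDA device (placeholder task/objective/data)
    task = train
    objective = regression
    data = train.txt
    device_type = cuda   # requires a build configured with USE_CUDA
    num_gpu = 2          # must be > 0; clamped to the number of CUDA devices present
    gpu_use_dp = false   # see the precision note above for OpenCL vs. CUDA defaults
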