From 328a9f0746a5a91aa0747d55bc5758a4efa29d12 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Mon, 30 Mar 2020 16:11:27 +0000 Subject: [PATCH 001/119] Initial CUDA work --- CMakeLists.txt | 130 ++++++++++- include/LightGBM/application.h | 4 + include/LightGBM/bin.h | 4 + include/LightGBM/config.h | 4 + include/LightGBM/dataset.h | 15 +- include/LightGBM/feature_group.h | 10 + include/LightGBM/tree_learner.h | 3 +- python-package/setup.py | 2 +- src/application/application.cpp | 16 ++ src/boosting/gbdt.cpp | 206 +++++++++++++++++- src/boosting/gbdt.h | 27 ++- src/c_api.cpp | 199 +++++++++++++++++ src/io/config.cpp | 4 +- src/io/config_auto.cpp | 12 + src/io/dataset.cpp | 49 ++++- src/io/dense_bin.hpp | 13 ++ src/io/sparse_bin.hpp | 3 + src/main.cpp | 7 + .../data_parallel_tree_learner.cpp | 5 +- src/treelearner/data_partition.hpp | 2 + .../feature_parallel_tree_learner.cpp | 7 +- src/treelearner/parallel_tree_learner.h | 6 +- src/treelearner/serial_tree_learner.cpp | 14 +- src/treelearner/serial_tree_learner.h | 13 +- src/treelearner/tree_learner.cpp | 11 + .../voting_parallel_tree_learner.cpp | 5 +- 26 files changed, 731 insertions(+), 40 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a07c3fe79b6..4d81828b640 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,9 +1,20 @@ +#LGBM_CUDA Added USE_CUDA flag if(USE_GPU OR APPLE) cmake_minimum_required(VERSION 3.2) +elseif(USE_CUDA) + cmake_minimum_required(VERSION 3.11) + enable_language(CUDA) else() cmake_minimum_required(VERSION 2.8) endif() +#LGBM_CUDA +if(USE_CUDA) + PROJECT(lightgbm LANGUAGES C CXX CUDA) +else() + PROJECT(lightgbm LANGUAGES C CXX) +endif() + PROJECT(lightgbm) OPTION(USE_MPI "Enable MPI-based parallel learning" OFF) @@ -12,7 +23,7 @@ OPTION(USE_GPU "Enable GPU-accelerated training" OFF) OPTION(USE_SWIG "Enable SWIG to generate Java API" OFF) OPTION(USE_HDFS "Enable HDFS support (EXPERIMENTAL)" OFF) OPTION(USE_R35 "Set to ON if your R version is not earlier than 3.5" OFF) -OPTION(USE_TIMETAG "Set to ON to output time costs" OFF) +OPTION(USE_CUDA "Enable CUDA-accelerated training" OFF) # LGBM_CUDA OPTION(USE_DEBUG "Set to ON for Debug mode" OFF) OPTION(BUILD_FOR_R "Set to ON if building lib_lightgbm for use with the R package" OFF) @@ -127,6 +138,101 @@ if(USE_GPU) ADD_DEFINITIONS(-DUSE_GPU) endif(USE_GPU) +#LGBM_CUDA CUDA-specific code +if(USE_CUDA) + find_package(CUDA REQUIRED) + include_directories(${CUDA_INCLUDE_DIRS}) + LIST(APPEND CMAKE_CUDA_FLAGS -g -Xcompiler=-fopenmp -Xcompiler=-fPIC -Xcompiler=-Wall -lineinfo) + CUDA_SELECT_NVCC_ARCH_FLAGS(CUDA_ARCH_FLAGS 7.0) + + LIST(APPEND CMAKE_CUDA_FLAGS ${CUDA_ARCH_FLAGS}) + if(CMAKE_BUILD_TYPE MATCHES Release) + LIST(APPEND CMAKE_CUDA_FLAGS -03) + endif() + + message(STATUS "CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}") + string(REPLACE ";" " " CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}") + message(STATUS "CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}") + set(CMAKE_CUDA_FLAGS_DEBUG -G) + set(CMAKE_CUDA_FLAGS_RELEASE -lineinfo) + + ADD_DEFINITIONS(-DUSE_CUDA) + if (NOT DEFINED CMAKE_CUDA_STANDARD) + set(CMAKE_CUDA_STANDARD 11) + set(CMAKE_CUDA_STANDARD_REQUIRED ON) + endif() + + set(BASE_DEFINES + -DPOWER_FEATURE_WORKGROUPS=12 + -DUSE_CONSTANT_BUF=0 + ) + + set(ALLFEATS_DEFINES + ${BASE_DEFINES} + -DENABLE_ALL_FEATURES + ) + + set(FULLDATA_DEFINES + ${ALLFEATS_DEFINES} + -DIGNORE_INDICES + ) + + #string(REPLACE ";" " " BASE_DEFINES "${BASE_DEFINES}") + #string(REPLACE ";" " " ALLFEATS_DEFINES "${ALLFEATS_DEFINES}") + #string(REPLACE ";" " " FULLDATA_DEFINES 
"${FULLDATA_DEFINES}") + + message(STATUS ALLFEATS_DEFINES: ${ALLFEATS_DEFINES}) + message(STATUS FULLDATA_DEFINES: ${FULLDATA_DEFINES}) + + add_library(histo256_sp_const OBJECT src/treelearner/kernels/histogram256.cu) + set_target_properties(histo256_sp_const PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + target_compile_definitions( + histo256_sp_const PRIVATE + -DCONST_HESSIAN=1 + ${BASE_DEFINES} + ) + + add_library(histo256_sp OBJECT src/treelearner/kernels/histogram256.cu) + set_target_properties(histo256_sp PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + target_compile_definitions( + histo256_sp PRIVATE + -DCONST_HESSIAN=0 + ${BASE_DEFINES} + ) + + add_library(histo256-allfeats_sp_const OBJECT src/treelearner/kernels/histogram256.cu) + set_target_properties(histo256-allfeats_sp_const PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + target_compile_definitions( + histo256-allfeats_sp_const PRIVATE + -DCONST_HESSIAN=1 + ${ALLFEATS_DEFINES} + ) + + add_library(histo256-allfeats_sp OBJECT src/treelearner/kernels/histogram256.cu) + set_target_properties(histo256-allfeats_sp PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + target_compile_definitions( + histo256-allfeats_sp PRIVATE + -DCONST_HESSIAN=0 + ${ALLFEATS_DEFINES} + ) + + add_library(histo256-fulldata_sp_const OBJECT src/treelearner/kernels/histogram256.cu) + set_target_properties(histo256-fulldata_sp_const PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + target_compile_definitions( + histo256-fulldata_sp_const PRIVATE + -DCONST_HESSIAN=1 + ${FULLDATA_DEFINES} + ) + + add_library(histo256-fulldata_sp OBJECT src/treelearner/kernels/histogram256.cu) + set_target_properties(histo256-fulldata_sp PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + target_compile_definitions( + histo256-fulldata_sp PRIVATE + -DCONST_HESSIAN=0 + ${FULLDATA_DEFINES} + ) +endif(USE_CUDA) + if(USE_HDFS) find_package(JNI REQUIRED) find_path(HDFS_INCLUDE_DIR hdfs.h REQUIRED) @@ -224,6 +330,7 @@ if(USE_MPI) include_directories(${MPI_CXX_INCLUDE_PATH}) endif(USE_MPI) +#LGBM_CUDA file(GLOB SOURCES src/application/*.cpp src/boosting/*.cpp @@ -232,6 +339,9 @@ file(GLOB SOURCES src/objective/*.cpp src/network/*.cpp src/treelearner/*.cpp +#ifdef USE_CUDA + src/treelearner/*cu +#endif ) add_executable(lightgbm src/main.cpp ${SOURCES}) @@ -303,6 +413,24 @@ if(USE_GPU) TARGET_LINK_LIBRARIES(_lightgbm ${OpenCL_LIBRARY} ${Boost_LIBRARIES}) endif(USE_GPU) +#LGBM_CUDA +if(USE_CUDA) + TARGET_LINK_LIBRARIES( + lightgbm + histo256_sp_const + histo256_sp + histo256-fulldata_sp_const + histo256-fulldata_sp + ) + TARGET_LINK_LIBRARIES( + _lightgbm + histo256_sp_const + histo256_sp + histo256-fulldata_sp_const + histo256-fulldata_sp + ) +endif(USE_CUDA) + if(USE_HDFS) TARGET_LINK_LIBRARIES(lightgbm ${HDFS_CXX_LIBRARIES}) TARGET_LINK_LIBRARIES(_lightgbm ${HDFS_CXX_LIBRARIES}) diff --git a/include/LightGBM/application.h b/include/LightGBM/application.h index 66541ec006c..911dedd7d94 100644 --- a/include/LightGBM/application.h +++ b/include/LightGBM/application.h @@ -36,6 +36,10 @@ class Application { /*! \brief To call this function to run application*/ inline void Run(); + // LGBM_CUDA + /*! \brief call to get configuration */ + Config GetConfig() {return config_ ;} ; + private: /*! \brief Load parameters from command line and config file*/ void LoadParameters(int argc, char** argv); diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index f817dfabaa8..e541e7039e9 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -288,6 +288,10 @@ class Bin { /*! 
\brief Number of all data */ virtual data_size_t num_data() const = 0; + // LGBM_CUDA + /*! \brief Get data pointer */ + virtual void* get_data() = 0; + virtual void ReSize(data_size_t num_data) = 0; /*! diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 2a3335c1c0a..9622814832b 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -954,6 +954,10 @@ struct Config { // desc = set this to ``true`` to use double precision math on GPU (by default single precision is used) bool gpu_use_dp = false; + // desc = number of gpus (CUDA implementation only) LGBM_CUDA + // desc = default value is 1 + int num_gpu = 1; + #pragma endregion #pragma endregion diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index 802b44b9fc2..e4c5dc56511 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -440,7 +440,10 @@ class Dataset { } return ret; } - void ReSize(data_size_t num_data); + + /* LGBM_CUDA void ReSize(data_size_t num_data); */ + // LGBM_CUDA ReSize() returns true if resized + bool ReSize(data_size_t num_data); void CopySubrow(const Dataset* fullset, const data_size_t* used_indices, data_size_t num_used_indices, bool need_meta_data); @@ -589,6 +592,16 @@ class Dataset { return feature_groups_[i]->is_multi_val_; } + // LGBM_CUDA + inline size_t FeatureGroupSizesInByte(int group) const { + return feature_groups_[group]->FeatureGroupSizesInByte(); + } + + // LGBM_CUDA + inline void* FeatureGroupData(int group) const { + return feature_groups_[group]->FeatureGroupData(); + } + inline double RealThreshold(int i, uint32_t threshold) const { const int group = feature2group_[i]; const int sub_feature = feature2subfeature_[i]; diff --git a/include/LightGBM/feature_group.h b/include/LightGBM/feature_group.h index 2b17e98bb9c..d949beec20e 100644 --- a/include/LightGBM/feature_group.h +++ b/include/LightGBM/feature_group.h @@ -228,6 +228,16 @@ class FeatureGroup { return bin_data_->GetIterator(min_bin, max_bin, most_freq_bin); } + // LGBM_CUDA + inline size_t FeatureGroupSizesInByte() { + return bin_data_->SizesInByte(); + } + + // LGBM_CUDA + inline void* FeatureGroupData() { + return bin_data_->get_data(); + } + inline data_size_t Split(int sub_feature, const uint32_t* threshold, int num_threshold, bool default_left, const data_size_t* data_indices, data_size_t cnt, diff --git a/include/LightGBM/tree_learner.h b/include/LightGBM/tree_learner.h index e0fb3489057..3bc246e8426 100644 --- a/include/LightGBM/tree_learner.h +++ b/include/LightGBM/tree_learner.h @@ -34,7 +34,8 @@ class TreeLearner { * \param train_data The used training data * \param is_constant_hessian True if all hessians share the same value */ - virtual void Init(const Dataset* train_data, bool is_constant_hessian) = 0; + // LGBM_CUDA is_use_subset_ for CUDA + virtual void Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) = 0; virtual void ResetIsConstantHessian(bool is_constant_hessian) = 0; diff --git a/python-package/setup.py b/python-package/setup.py index 73f123baf42..9d8853ddf94 100644 --- a/python-package/setup.py +++ b/python-package/setup.py @@ -212,7 +212,7 @@ def initialize_options(self): self.opencl_library = None self.mpi = 0 self.hdfs = 0 - self.precompile = 0 + self.precompile = 1 self.nomp = 0 self.bit32 = 0 diff --git a/src/application/application.cpp b/src/application/application.cpp index 72e7e489f9b..1b9eabf8a12 100644 --- a/src/application/application.cpp +++ b/src/application/application.cpp @@ -25,6 +25,10 @@ #include 
"predictor.hpp" +#ifdef USE_CUDA +#include +#endif + namespace LightGBM { Common::Timer global_timer; @@ -38,6 +42,18 @@ Application::Application(int argc, char** argv) { if (config_.data.size() == 0 && config_.task != TaskType::kConvertModel) { Log::Fatal("No training/prediction data, application quit"); } + +//LGBM_CUDA +#ifdef USE_CUDA + if (config_.device_type == std::string("cuda")){ + LightGBM::LGBM_config_::current_device=lgbm_device_cuda; + + config_.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ + if (config_.bagging_fraction == 1.0){config_.bagging_fraction = 0.8;} + if (config_.bagging_freq == 0) {config_.bagging_freq = 1;} + } +#endif + } Application::~Application() { diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 7871bbfb086..5f7aac08640 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -17,6 +17,11 @@ namespace LightGBM { +#ifdef USE_CUDA +int LGBM_config_::current_device=lgbm_device_cpu; +int LGBM_config_::current_learner=use_cpu_learner; +#endif + GBDT::GBDT() : iter_(0), train_data_(nullptr), @@ -58,6 +63,19 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective es_first_metric_only_ = config_->first_metric_only; shrinkage_rate_ = config_->learning_rate; +// LGBM_CUDA +#ifdef USE_CUDA + if (config_->device_type == std::string("cuda")) { + // LGBM_config_::current_device=lgbm_device_cuda; moved to application.cpp + LGBM_config_::current_learner=use_cuda_learner; + + /* Following are needed to ensure bagging required by the CUDA implementation */ +// if (config_->bagging_fraction == 1.0){config_->bagging_fraction = 0.8;} moved to application.cpp +// if (config_->bagging_freq == 0) {config_->bagging_freq = 1;} moved to application.cpp + + } +#endif + // load forced_splits file if (!config->forcedsplits_filename.empty()) { std::ifstream forced_splits_file(config->forcedsplits_filename.c_str()); @@ -107,8 +125,23 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective monotone_constraints_ = config->monotone_constraints; // if need bagging, create buffer + // LGBM_CUDA: for CUDA implementation, this function sets up the is_use_subset_ flag as TRUE and tmp_subset_ is allocated. 
ResetBaggingConfig(config_.get(), true); + // LGBM_CUDA + // Two key changes: position of the initializer is moved from the original code, and init() uses is_use_subset_ flag + tree_learner_ = std::unique_ptr(TreeLearner::CreateTreeLearner(config_->tree_learner, config_->device_type, config_.get())); + + // init tree learner + // LGBM_CUDA do not copy feature is is_use_subset for initialization + // LGBM_CUDA initialize training data info with bagging data size (tmp_subset_) + + if (config_->device_type == std::string("cuda")) { + tree_learner_->Init(tmp_subset_.get(), is_constant_hessian_, is_use_subset_); + } else { + tree_learner_->Init(train_data_, is_constant_hessian_, is_use_subset_); + } + class_need_train_ = std::vector(num_tree_per_iteration_, true); if (objective_function_ != nullptr && objective_function_->SkipEmptyClass()) { CHECK_EQ(num_tree_per_iteration_, num_class_); @@ -231,13 +264,19 @@ void GBDT::Bagging(int iter) { // set bagging data to tree learner if (!is_use_subset_) { tree_learner_->SetBaggingData(nullptr, bag_data_indices_.data(), bag_data_cnt_); - } else { - // get subset - tmp_subset_->ReSize(bag_data_cnt_); - tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(), - bag_data_cnt_, false); - tree_learner_->SetBaggingData(tmp_subset_.get(), bag_data_indices_.data(), - bag_data_cnt_); + } else { // LGBM_CUDA + // NEW get subset + bool resized= tmp_subset_->ReSize(bag_data_cnt_); + + if (resized && (config_->device_type == std::string("cuda"))) { + size_t bag_gh_size = static_cast(bag_data_cnt_) * num_tree_per_iteration_; + tmp_gradients_.resize(bag_gh_size); + tmp_hessians_.resize(bag_gh_size); + } + + tmp_subset_->CopySubset(train_data_, bag_data_indices_.data(), bag_data_cnt_, false); + + tree_learner_->ResetTrainingData(tmp_subset_.get()); } } } @@ -245,13 +284,18 @@ void GBDT::Bagging(int iter) { void GBDT::Train(int snapshot_freq, const std::string& model_output_path) { Common::FunctionTimer fun_timer("GBDT::Train", global_timer); bool is_finished = false; + + //LGBM_CUDA auto start_time = std::chrono::steady_clock::now(); + for (int iter = 0; iter < config_->num_iterations && !is_finished; ++iter) { is_finished = TrainOneIter(nullptr, nullptr); if (!is_finished) { is_finished = EvalAndCheckEarlyStopping(); } + auto end_time = std::chrono::steady_clock::now(); + // output used time per iteration Log::Info("%f seconds elapsed, finished iteration %d", std::chrono::duration(end_time - start_time) * 1e-3, iter + 1); @@ -334,7 +378,136 @@ double GBDT::BoostFromAverage(int class_id, bool update_scorer) { return 0.0f; } +// LGBM_CUDA +bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { + + // LGBM_CUDA invoke baggging during the first iteration + if ((config_->device_type == std::string("cuda")) && (iter_ == 0)) { + + auto start_time = std::chrono::steady_clock::now(); + + Bagging(0); + } + + std::vector init_scores(num_tree_per_iteration_, 0.0); + + // boosting first + if (gradients == nullptr || hessians == nullptr) { + for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { + init_scores[cur_tree_id] = BoostFromAverage(cur_tree_id, true); + } + + // LGBM_CUDA + auto start_time = std::chrono::steady_clock::now(); + + Boosting(); + + gradients = gradients_.data(); + hessians = hessians_.data(); + } + + // LGBM_CUDA bagging logic + // Bagging(iter_); + + bool should_continue = false; + for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { + + // LGBM_CUDA + auto start_time = 
std::chrono::steady_clock::now(); + + const size_t offset = static_cast(cur_tree_id) * num_data_; + std::unique_ptr new_tree(new Tree(2)); + + if (class_need_train_[cur_tree_id] && train_data_->num_features() > 0) { + + auto grad = gradients + offset; + auto hess = hessians + offset; + + // LGBM_CUDA + auto tmp_grad = tmp_gradients_.data(); + auto tmp_hess = tmp_hessians_.data(); + + // need to copy gradients for bagging subset. + if (is_use_subset_ && bag_data_cnt_ < num_data_) { + + #pragma omp parallel for schedule(static) // LGBM_CUDA + for (int i = 0; i < bag_data_cnt_; ++i) { + tmp_grad[i] = grad[bag_data_indices_[i]]; // LGBM_CUDA + tmp_hess[i] = hess[bag_data_indices_[i]]; // LGBM_CUDA + } + } + + // LGBM_CUDA + new_tree.reset(tree_learner_->Train(tmp_grad, tmp_hess, is_constant_hessian_, forced_splits_json_)); + } + + if (new_tree->num_leaves() > 1) { + should_continue = true; + auto score_ptr = train_score_updater_->score() + offset; + auto residual_getter = [score_ptr](const label_t* label, int i) {return static_cast(label[i]) - score_ptr[i]; }; + tree_learner_->RenewTreeOutput(new_tree.get(), objective_function_, residual_getter, + num_data_, bag_data_indices_.data(), bag_data_cnt_); + // shrinkage by learning rate + new_tree->Shrinkage(shrinkage_rate_); + // update score + UpdateScore(new_tree.get(), cur_tree_id); + if (std::fabs(init_scores[cur_tree_id]) > kEpsilon) { + new_tree->AddBias(init_scores[cur_tree_id]); + } + } else { + // only add default score one-time + if (models_.size() < static_cast(num_tree_per_iteration_)) { + double output = 0.0; + if (!class_need_train_[cur_tree_id]) { + if (objective_function_ != nullptr) { + output = objective_function_->BoostFromScore(cur_tree_id); + } + } else { + output = init_scores[cur_tree_id]; + } + new_tree->AsConstantTree(output); + // updates scores + train_score_updater_->AddScore(output, cur_tree_id); + for (auto& score_updater : valid_score_updater_) { + score_updater->AddScore(output, cur_tree_id); + } + } + + // LGBM_CUDA: moved for overlapping data copy w/ other operations + int iter_next = iter_ + 1; + if (iter_next < config_->num_iterations) { + + auto start_time = std::chrono::steady_clock::now(); + + // bagging logic + Bagging(iter_next); + + } + } + // add model + models_.push_back(std::move(new_tree)); + } + + if (!should_continue) { + Log::Warning("Stopped training because there are no more leaves that meet the split requirements"); + if (models_.size() > static_cast(num_tree_per_iteration_)) { + for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { + models_.pop_back(); + } + } + return true; + } + + ++iter_; + return false; +} + bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { + + if (config_->device_type == std::string("cuda")){ //LGBM_CUDA + return TrainOneIterCUDA(gradients, hessians); + } + Common::FunctionTimer fun_timer("GBDT::TrainOneIter", global_timer); std::vector init_scores(num_tree_per_iteration_, 0.0); // boosting first @@ -786,9 +959,22 @@ void GBDT::ResetBaggingConfig(const Config* config, bool is_change_dataset) { } } else { bag_data_cnt_ = num_data_; - bag_data_indices_.clear(); - bagging_runner_.ReSize(0); - is_use_subset_ = false; + if (config_->device_type == std::string("cuda")){ // LGBM_CUDA + if (tmp_subset_ == nullptr){ + tmp_subset_.reset(new Dataset(bag_data_cnt_)); + tmp_subset_->CopyFeatureMapperFrom(train_data_); + size_t bag_gh_size = static_cast(bag_data_cnt_) * num_tree_per_iteration_; + tmp_gradients_.resize(bag_gh_size); + 
tmp_hessians_.resize(bag_gh_size); + is_use_subset_ = false; + bag_data_indices_.clear(); + } + } + else { + bag_data_indices_.clear(); + bagging_runner_.ReSize(0); + is_use_subset_ = false; + } } } diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index f3ece67fec0..d460894d44b 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -22,6 +22,10 @@ #include #include +#ifdef USE_CUDA +#include //LGBM_CUDA +#endif + #include "score_updater.hpp" namespace LightGBM { @@ -144,6 +148,14 @@ class GBDT : public GBDTBase { */ bool TrainOneIter(const score_t* gradients, const score_t* hessians) override; + /*! + * \brief Training logic + * \param gradients nullptr for using default objective, otherwise use self-defined boosting + * \param hessians nullptr for using default objective, otherwise use self-defined boosting + * \return True if cannot train any more + */ + bool TrainOneIterCUDA(const score_t* gradients, const score_t* hessians); // LGBM_CUDA + /*! * \brief Rollback one iteration */ @@ -463,10 +475,23 @@ class GBDT : public GBDTBase { std::vector> models_; /*! \brief Max feature index of training data*/ int max_feature_idx_; + +#ifdef USE_CUDA + /*! \brief First order derivative of training data */ + std::vector> gradients_; // LGBM_CUDA + std::vector> tmp_gradients_; // LGBM_CUDA + /*! \brief Second order derivative of training data */ + std::vector> hessians_; // LGBM_CUDA + std::vector> tmp_hessians_; // LGBM_CUDA +#else /*! \brief First order derivative of training data */ std::vector> gradients_; - /*! \brief Secend order derivative of training data */ + std::vector> tmp_gradients_; + /*! \brief Second order derivative of training data */ std::vector> hessians_; + std::vector> tmp_hessians_; +#endif + /*! \brief Store the indices of in-bag data */ std::vector> bag_data_indices_; /*! 
\brief Number of in-bag data */ diff --git a/src/c_api.cpp b/src/c_api.cpp index 290f219fa63..f785bc74f19 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -18,6 +18,10 @@ #include #include +#ifdef USE_CUDA +#include +#endif + #include #include #include @@ -110,6 +114,22 @@ class Booster { if (config_.num_threads > 0) { omp_set_num_threads(config_.num_threads); } + +#ifdef USE_CUDA + // Only use CUDA when the data is large (2048 == 256 bins each with at least 8 elements) + if (train_data->num_data() < 2048){ + config_.device_type = std::string("cpu"); + } + + if (config_.device_type == std::string("cuda")){ + LightGBM::LGBM_config_::current_device=lgbm_device_cuda; + + config_.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ + if (config_.bagging_fraction == 1.0){config_.bagging_fraction = 0.8;} + if (config_.bagging_freq == 0) {config_.bagging_freq = 1;} + } +#endif + // create boosting if (config_.input_model.size() > 0) { Log::Warning("Continued train from model is not supported for c_api,\n" @@ -303,6 +323,17 @@ class Booster { omp_set_num_threads(config_.num_threads); } +//LGBM_CUDA +#ifdef USE_CUDA + if (config_.device_type == std::string("cuda")){ + LightGBM::LGBM_config_::current_device=lgbm_device_cuda; + + config_.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ + if (config_.bagging_fraction == 1.0){config_.bagging_fraction = 0.8;} + if (config_.bagging_freq == 0) {config_.bagging_freq = 1;} + } +#endif + if (param.count("objective")) { // create objective function objective_fun_.reset(ObjectiveFunction::CreateObjectiveFunction(config_.objective, @@ -627,6 +658,18 @@ int LGBM_DatasetCreateFromFile(const char* filename, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } + +//LGBM_CUDA +#ifdef USE_CUDA + if (config.device_type == std::string("cuda")){ + LightGBM::LGBM_config_::current_device=lgbm_device_cuda; + + config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ + if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} + if (config.bagging_freq == 0) {config.bagging_freq = 1;} + } +#endif + DatasetLoader loader(config, nullptr, 1, filename); if (reference == nullptr) { if (Network::num_machines() == 1) { @@ -657,6 +700,18 @@ int LGBM_DatasetCreateFromSampledColumn(double** sample_data, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } + +//LGBM_CUDA +#ifdef USE_CUDA + if (config.device_type == std::string("cuda")){ + LightGBM::LGBM_config_::current_device=lgbm_device_cuda; + + config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ + if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} + if (config.bagging_freq == 0) {config.bagging_freq = 1;} + } +#endif + DatasetLoader loader(config, nullptr, 1, nullptr); *out = loader.CostructFromSampleData(sample_data, sample_indices, ncol, num_per_col, num_sample_row, @@ -768,6 +823,18 @@ int LGBM_DatasetCreateFromMats(int32_t nmat, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } + +//LGBM_CUDA +#ifdef USE_CUDA + if (config.device_type == std::string("cuda")){ + LightGBM::LGBM_config_::current_device=lgbm_device_cuda; + + config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ + if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} + if (config.bagging_freq == 0) {config.bagging_freq = 1;} + } +#endif + 
std::unique_ptr ret; int32_t total_nrow = 0; for (int j = 0; j < nmat; ++j) { @@ -859,6 +926,18 @@ int LGBM_DatasetCreateFromCSR(const void* indptr, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } + +//LGBM_CUDA +#ifdef USE_CUDA + if (config.device_type == std::string("cuda")){ + LightGBM::LGBM_config_::current_device=lgbm_device_cuda; + + config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ + if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} + if (config.bagging_freq == 0) {config.bagging_freq = 1;} + } +#endif + std::unique_ptr ret; auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, nelem); int32_t nrow = static_cast(nindptr - 1); @@ -926,6 +1005,18 @@ int LGBM_DatasetCreateFromCSRFunc(void* get_row_funptr, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } + +//LGBM_CUDA +#ifdef USE_CUDA + if (config.device_type == std::string("cuda")){ + LightGBM::LGBM_config_::current_device=lgbm_device_cuda; + + config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ + if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} + if (config.bagging_freq == 0) {config.bagging_freq = 1;} + } +#endif + std::unique_ptr ret; int32_t nrow = num_rows; if (reference == nullptr) { @@ -997,6 +1088,18 @@ int LGBM_DatasetCreateFromCSC(const void* col_ptr, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } + +//LGBM_CUDA +#ifdef USE_CUDA + if (config.device_type == std::string("cuda")){ + LightGBM::LGBM_config_::current_device=lgbm_device_cuda; + + config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ + if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} + if (config.bagging_freq == 0) {config.bagging_freq = 1;} + } +#endif + std::unique_ptr ret; int32_t nrow = static_cast(num_row); if (reference == nullptr) { @@ -1080,6 +1183,18 @@ int LGBM_DatasetGetSubset( if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } + +//LGBM_CUDA +#ifdef USE_CUDA + if (config.device_type == std::string("cuda")){ + LightGBM::LGBM_config_::current_device=lgbm_device_cuda; + + config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ + if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} + if (config.bagging_freq == 0) {config.bagging_freq = 1;} + } +#endif + auto full_dataset = reinterpret_cast(handle); CHECK_GT(num_used_row_indices, 0); const int32_t lower = 0; @@ -1475,6 +1590,18 @@ int LGBM_BoosterPredictForFile(BoosterHandle handle, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } + +//LGBM_CUDA +#ifdef USE_CUDA + if (config.device_type == std::string("cuda")){ + LightGBM::LGBM_config_::current_device=lgbm_device_cuda; + + config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ + if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} + if (config.bagging_freq == 0) {config.bagging_freq = 1;} + } +#endif + Booster* ref_booster = reinterpret_cast(handle); ref_booster->Predict(num_iteration, predict_type, data_filename, data_has_header, config, result_filename); @@ -1519,6 +1646,18 @@ int LGBM_BoosterPredictForCSR(BoosterHandle handle, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } + +//LGBM_CUDA +#ifdef USE_CUDA + if (config.device_type == std::string("cuda")){ + 
LightGBM::LGBM_config_::current_device=lgbm_device_cuda; + + config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ + if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} + if (config.bagging_freq == 0) {config.bagging_freq = 1;} + } +#endif + Booster* ref_booster = reinterpret_cast(handle); auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, nelem); int nrow = static_cast(nindptr - 1); @@ -1553,6 +1692,18 @@ int LGBM_BoosterPredictForCSRSingleRow(BoosterHandle handle, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } + +//LGBM_CUDA +#ifdef USE_CUDA + if (config.device_type == std::string("cuda")){ + LightGBM::LGBM_config_::current_device=lgbm_device_cuda; + + config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ + if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} + if (config.bagging_freq == 0) {config.bagging_freq = 1;} + } +#endif + Booster* ref_booster = reinterpret_cast(handle); auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, nelem); ref_booster->PredictSingleRow(num_iteration, predict_type, static_cast(num_col), get_row_fun, config, out_result, out_len); @@ -1582,6 +1733,18 @@ int LGBM_BoosterPredictForCSC(BoosterHandle handle, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } + +//LGBM_CUDA +#ifdef USE_CUDA + if (config.device_type == std::string("cuda")){ + LightGBM::LGBM_config_::current_device=lgbm_device_cuda; + + config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ + if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} + if (config.bagging_freq == 0) {config.bagging_freq = 1;} + } +#endif + int num_threads = OMP_NUM_THREADS(); int ncol = static_cast(ncol_ptr - 1); std::vector> iterators(num_threads, std::vector()); @@ -1626,6 +1789,18 @@ int LGBM_BoosterPredictForMat(BoosterHandle handle, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } + +//LGBM_CUDA +#ifdef USE_CUDA + if (config.device_type == std::string("cuda")){ + LightGBM::LGBM_config_::current_device=lgbm_device_cuda; + + config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ + if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} + if (config.bagging_freq == 0) {config.bagging_freq = 1;} + } +#endif + Booster* ref_booster = reinterpret_cast(handle); auto get_row_fun = RowPairFunctionFromDenseMatric(data, nrow, ncol, data_type, is_row_major); ref_booster->Predict(num_iteration, predict_type, nrow, ncol, get_row_fun, @@ -1650,6 +1825,18 @@ int LGBM_BoosterPredictForMatSingleRow(BoosterHandle handle, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } + +//LGBM_CUDA +#ifdef USE_CUDA + if (config.device_type == std::string("cuda")){ + LightGBM::LGBM_config_::current_device=lgbm_device_cuda; + + config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ + if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} + if (config.bagging_freq == 0) {config.bagging_freq = 1;} + } +#endif + Booster* ref_booster = reinterpret_cast(handle); auto get_row_fun = RowPairFunctionFromDenseMatric(data, 1, ncol, data_type, is_row_major); ref_booster->PredictSingleRow(num_iteration, predict_type, ncol, get_row_fun, config, out_result, out_len); @@ -1674,6 +1861,18 @@ int 
LGBM_BoosterPredictForMats(BoosterHandle handle, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } + +//LGBM_CUDA +#ifdef USE_CUDA + if (config.device_type == std::string("cuda")){ + LightGBM::LGBM_config_::current_device=lgbm_device_cuda; + + config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ + if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} + if (config.bagging_freq == 0) {config.bagging_freq = 1;} + } +#endif + Booster* ref_booster = reinterpret_cast(handle); auto get_row_fun = RowPairFunctionFromDenseRows(data, ncol, data_type); ref_booster->Predict(num_iteration, predict_type, nrow, ncol, get_row_fun, config, out_result, out_len); diff --git a/src/io/config.cpp b/src/io/config.cpp index d569a7401e1..5d2faba6133 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -126,6 +126,8 @@ void GetDeviceType(const std::unordered_map& params, s *device_type = "cpu"; } else if (value == std::string("gpu")) { *device_type = "gpu"; + } else if (value == std::string("cuda")) { // LGBM_CUDA + *device_type = "cuda"; } else { Log::Fatal("Unknown device type %s", value.c_str()); } @@ -320,7 +322,7 @@ void Config::CheckParamConflict() { } } // force col-wise for gpu - if (device_type == std::string("gpu")) { + if (device_type == std::string("gpu")) { // GCF maybe need to add some cuda here? force_col_wise = true; force_row_wise = false; } diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index 807cad78502..46d95b0df8f 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -294,6 +294,7 @@ const std::unordered_set& Config::parameter_set() { "gpu_platform_id", "gpu_device_id", "gpu_use_dp", + "num_gpu", /* LGBM_CUDA */ }); return params; } @@ -482,6 +483,11 @@ void Config::GetMembersFromString(const std::unordered_map 0); + } std::string Config::SaveMembersToString() const { diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 6e17eeb8917..a020f425f3a 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -10,6 +10,10 @@ #include #include +#ifdef USE_CUDA +#include +#endif + #include #include #include @@ -233,12 +237,17 @@ std::vector> FindGroups( return features_in_group; } -std::vector> FastFeatureBundling( - const std::vector>& bin_mappers, - int** sample_indices, double** sample_values, const int* num_per_col, - int num_sample_col, data_size_t total_sample_cnt, - const std::vector& used_features, data_size_t num_data, - bool is_use_gpu, bool is_sparse, std::vector* multi_val_group) { +std::vector> FastFeatureBundling(const std::vector>& bin_mappers, + int** sample_indices, + double** sample_values, + const int* num_per_col, + int num_sample_col, + data_size_t total_sample_cnt, + const std::vector& used_features, + data_size_t num_data, + bool is_sparse, + std::vector* multi_val_group, + bool is_use_gpu) { Common::FunctionTimer fun_timer("Dataset::FastFeatureBundling", global_timer); std::vector feature_non_zero_cnt; feature_non_zero_cnt.reserve(used_features.size()); @@ -334,13 +343,28 @@ void Dataset::Construct(std::vector>* bin_mappers, "constant."); } auto features_in_group = NoGroup(used_features); + +//LGBM_CUDA +#ifdef USE_CUDA + if (io_config.device_type == std::string("cuda")){ + LightGBM::LGBM_config_::current_device=lgbm_device_cuda; + } +#endif + std::vector group_is_multi_val(used_features.size(), 0); if (io_config.enable_bundle && !used_features.empty()) { - features_in_group = FastFeatureBundling( - *bin_mappers, sample_non_zero_indices, sample_values, 
num_per_col, - num_sample_col, static_cast(total_sample_cnt), - used_features, num_data_, io_config.device_type == std::string("gpu"), - io_config.is_enable_sparse, &group_is_multi_val); + bool lgbm_is_gpu_used = io_config.device_type == std::string("gpu") || io_config.device_type == std::string("cuda"); // LGBM_CUDA + features_in_group = FastFeatureBundling(*bin_mappers, + sample_non_zero_indices, + sample_values, + num_per_col, + num_sample_col, + static_cast(total_sample_cnt), + used_features, + num_data_, + io_config.is_enable_sparse, + &group_is_multi_val, + lgbm_is_gpu_used); } num_features_ = 0; @@ -758,7 +782,8 @@ void Dataset::CreateValid(const Dataset* dataset) { forced_bin_bounds_ = dataset->forced_bin_bounds_; } -void Dataset::ReSize(data_size_t num_data) { +// LGBM_CUDA Resize() returns boolean +bool Dataset::ReSize(data_size_t num_data) { if (num_data_ != num_data) { num_data_ = num_data; OMP_INIT_EX(); diff --git a/src/io/dense_bin.hpp b/src/io/dense_bin.hpp index d61f7e6489e..99feadf9f7f 100644 --- a/src/io/dense_bin.hpp +++ b/src/io/dense_bin.hpp @@ -12,6 +12,12 @@ #include #include +#ifdef USE_CUDA +#include // LGBM_CUDA +#endif + +#include // LGBM_CUDA + namespace LightGBM { template @@ -362,6 +368,9 @@ class DenseBin : public Bin { data_size_t num_data() const override { return num_data_; } + // LGBM_CUDA + void* get_data() override { return data_.data(); } + void FinishLoad() override { if (IS_4BIT) { if (buf_.empty()) { @@ -456,7 +465,11 @@ class DenseBin : public Bin { private: data_size_t num_data_; +#ifdef USE_CUDA + std::vector> data_; // LGBM_CUDA +#else std::vector> data_; +#endif std::vector buf_; DenseBin(const DenseBin& other) diff --git a/src/io/sparse_bin.hpp b/src/io/sparse_bin.hpp index aa3ed929713..c56cd6da99d 100644 --- a/src/io/sparse_bin.hpp +++ b/src/io/sparse_bin.hpp @@ -408,6 +408,9 @@ class SparseBin : public Bin { data_size_t num_data() const override { return num_data_; } + // LGBM_CUDA + void* get_data() override { return nullptr; } + void FinishLoad() override { // get total non zero size size_t pair_cnt = 0; diff --git a/src/main.cpp b/src/main.cpp index 8034da82681..ef277ac0c1f 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -11,6 +11,10 @@ int main(int argc, char** argv) { bool success = false; try { + // LGBM_CUDA + std::chrono::duration main_time; + auto start_main_time = std::chrono::steady_clock::now(); + LightGBM::Application app(argc, argv); app.Run(); @@ -18,6 +22,9 @@ int main(int argc, char** argv) { LightGBM::Linkers::MpiFinalizeIfIsParallel(); #endif + // LGBM_CUDA + main_time = std::chrono::steady_clock::now() - start_main_time; + LightGBM::Log::Info("main::main time: %f sec", main_time * 1e-3); success = true; } catch (const std::exception& ex) { diff --git a/src/treelearner/data_parallel_tree_learner.cpp b/src/treelearner/data_parallel_tree_learner.cpp index 0d6f9df251b..0624bb96249 100644 --- a/src/treelearner/data_parallel_tree_learner.cpp +++ b/src/treelearner/data_parallel_tree_learner.cpp @@ -20,9 +20,9 @@ DataParallelTreeLearner::~DataParallelTreeLearner() { } template -void DataParallelTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) { +void DataParallelTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) { //LGBM_CUDA // initialize SerialTreeLearner - TREELEARNER_T::Init(train_data, is_constant_hessian); + TREELEARNER_T::Init(train_data, is_constant_hessian, is_use_subset); //LGBM_CUDA // Get local rank and global machine size rank_ = Network::rank(); 
num_machines_ = Network::num_machines(); @@ -256,6 +256,7 @@ void DataParallelTreeLearner::Split(Tree* tree, int best_Leaf, in } // instantiate template classes, otherwise linker cannot find the code +template class DataParallelTreeLearner; // LGBM_CUDA template class DataParallelTreeLearner; template class DataParallelTreeLearner; diff --git a/src/treelearner/data_partition.hpp b/src/treelearner/data_partition.hpp index 7a6ac031e62..01c5d2606e7 100644 --- a/src/treelearner/data_partition.hpp +++ b/src/treelearner/data_partition.hpp @@ -164,6 +164,8 @@ class DataPartition { /*! \brief used data count, used for bagging */ data_size_t used_data_count_; ParallelPartitionRunner runner_; + // LGBM_CUDA + // bool is_cuda_; }; } // namespace LightGBM diff --git a/src/treelearner/feature_parallel_tree_learner.cpp b/src/treelearner/feature_parallel_tree_learner.cpp index c5202f3d706..5cf660ab9c9 100644 --- a/src/treelearner/feature_parallel_tree_learner.cpp +++ b/src/treelearner/feature_parallel_tree_learner.cpp @@ -19,9 +19,9 @@ template FeatureParallelTreeLearner::~FeatureParallelTreeLearner() { } -template -void FeatureParallelTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) { - TREELEARNER_T::Init(train_data, is_constant_hessian); +template //LGBM_CUDA +void FeatureParallelTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) { + TREELEARNER_T::Init(train_data, is_constant_hessian, is_use_subset); //LGBM_CUDA rank_ = Network::rank(); num_machines_ = Network::num_machines(); @@ -77,6 +77,7 @@ void FeatureParallelTreeLearner::FindBestSplitsFromHistograms( } // instantiate template classes, otherwise linker cannot find the code +template class FeatureParallelTreeLearner; // LGBM_CUDA template class FeatureParallelTreeLearner; template class FeatureParallelTreeLearner; } // namespace LightGBM diff --git a/src/treelearner/parallel_tree_learner.h b/src/treelearner/parallel_tree_learner.h index 137697408e8..35ac432eba3 100644 --- a/src/treelearner/parallel_tree_learner.h +++ b/src/treelearner/parallel_tree_learner.h @@ -27,7 +27,7 @@ class FeatureParallelTreeLearner: public TREELEARNER_T { public: explicit FeatureParallelTreeLearner(const Config* config); ~FeatureParallelTreeLearner(); - void Init(const Dataset* train_data, bool is_constant_hessian) override; + void Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) override; // LGBM_CUDA protected: void BeforeTrain() override; @@ -54,7 +54,7 @@ class DataParallelTreeLearner: public TREELEARNER_T { public: explicit DataParallelTreeLearner(const Config* config); ~DataParallelTreeLearner(); - void Init(const Dataset* train_data, bool is_constant_hessian) override; + void Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) override; // LGBM_CUDA void ResetConfig(const Config* config) override; protected: @@ -108,7 +108,7 @@ class VotingParallelTreeLearner: public TREELEARNER_T { public: explicit VotingParallelTreeLearner(const Config* config); ~VotingParallelTreeLearner() { } - void Init(const Dataset* train_data, bool is_constant_hessian) override; + void Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) override; //LGBM_CUDA void ResetConfig(const Config* config) override; protected: diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index db5cd0b4395..6b02411127a 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -25,7 
+25,8 @@ SerialTreeLearner::SerialTreeLearner(const Config* config) SerialTreeLearner::~SerialTreeLearner() { } -void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) { +//LGBM_CUDA +void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) { train_data_ = train_data; num_data_ = train_data_->num_data(); num_features_ = train_data_->num_features(); @@ -324,7 +325,18 @@ void SerialTreeLearner::FindBestSplits(const Tree* tree) { is_feature_used[feature_index] = 1; } bool use_subtract = parent_leaf_histogram_array_ != nullptr; + +#ifdef USE_CUDA + if (LGBM_config_::current_learner == use_cpu_learner){ + Log::Info("LightGBM-CUDA using CPU ConstructHistograms()"); + SerialTreeLearner::ConstructHistograms(is_feature_used, use_subtract); + } + else{ + ConstructHistograms(is_feature_used, use_subtract); + } +#else ConstructHistograms(is_feature_used, use_subtract); +#endif FindBestSplitsFromHistograms(is_feature_used, use_subtract, tree); } diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index e6ac8e3ad09..fab28542e03 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -26,6 +26,11 @@ #include "monotone_constraints.hpp" #include "split_info.hpp" +// LGBM_CUDA +#ifdef USE_CUDA +#include +#endif + #ifdef USE_GPU // Use 4KBytes aligned allocator for ordered gradients and ordered hessians when GPU is enabled. // This is necessary to pin the two arrays in memory and make transferring faster. @@ -48,7 +53,8 @@ class SerialTreeLearner: public TreeLearner { ~SerialTreeLearner(); - void Init(const Dataset* train_data, bool is_constant_hessian) override; + // LGBM_CUDA is_use_subset is used by CUDA only + void Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) override; void ResetTrainingData(const Dataset* train_data, bool is_constant_hessian) override { @@ -201,6 +207,11 @@ class SerialTreeLearner: public TreeLearner { std::vector> ordered_gradients_; /*! \brief hessians of current iteration, ordered for cache optimized, aligned to 4K page */ std::vector> ordered_hessians_; +#elif USE_CUDA //LGBM_CUDA + /*! \brief gradients of current iteration, ordered for cache optimized */ + std::vector> ordered_gradients_; + /*! \brief hessians of current iteration, ordered for cache optimized */ + std::vector> ordered_hessians_; #else /*! 
\brief gradients of current iteration, ordered for cache optimized */ std::vector> ordered_gradients_; diff --git a/src/treelearner/tree_learner.cpp b/src/treelearner/tree_learner.cpp index 7172f6b655c..df7231e91df 100644 --- a/src/treelearner/tree_learner.cpp +++ b/src/treelearner/tree_learner.cpp @@ -5,6 +5,7 @@ #include #include "gpu_tree_learner.h" +#include "cuda_tree_learner.h" // LGBM_CUDA #include "parallel_tree_learner.h" #include "serial_tree_learner.h" @@ -31,6 +32,16 @@ TreeLearner* TreeLearner::CreateTreeLearner(const std::string& learner_type, con } else if (learner_type == std::string("voting")) { return new VotingParallelTreeLearner(config); } + } else if (device_type == std::string("cuda")) { // LGBM_CUDA + if (learner_type == std::string("serial")) { + return new CUDATreeLearner(config); + } else if (learner_type == std::string("feature")) { + return new FeatureParallelTreeLearner(config); + } else if (learner_type == std::string("data")) { + return new DataParallelTreeLearner(config); + } else if (learner_type == std::string("voting")) { + return new VotingParallelTreeLearner(config); + } } return nullptr; } diff --git a/src/treelearner/voting_parallel_tree_learner.cpp b/src/treelearner/voting_parallel_tree_learner.cpp index 1c9c36ba8bb..58f5b88d6b0 100644 --- a/src/treelearner/voting_parallel_tree_learner.cpp +++ b/src/treelearner/voting_parallel_tree_learner.cpp @@ -19,8 +19,8 @@ VotingParallelTreeLearner::VotingParallelTreeLearner(const Config } template -void VotingParallelTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) { - TREELEARNER_T::Init(train_data, is_constant_hessian); +void VotingParallelTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) { // LGBM_CUDA + TREELEARNER_T::Init(train_data, is_constant_hessian, is_use_subset); // LGBM_CUDA rank_ = Network::rank(); num_machines_ = Network::num_machines(); @@ -454,6 +454,7 @@ void VotingParallelTreeLearner::Split(Tree* tree, int best_Leaf, } // instantiate template classes, otherwise linker cannot find the code +template class VotingParallelTreeLearner; // LGBM_CUDA template class VotingParallelTreeLearner; template class VotingParallelTreeLearner; } // namespace LightGBM From 895d6e43c847a7a9e748b310fcefe2c32957ff9d Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Mon, 30 Mar 2020 16:16:02 +0000 Subject: [PATCH 002/119] Initial CUDA work --- build_LGBM.232.sh | 7 + include/LightGBM/cuda/cuda_utils.h | 38 + include/LightGBM/cuda/vector_cudahost.h | 93 ++ install_LGBM.232.sh | 7 + src/io/dense_nbits_bin.hpp | 405 +++++++++ src/treelearner/cuda_kernel_launcher.cu | 166 ++++ src/treelearner/cuda_kernel_launcher.h | 64 ++ src/treelearner/cuda_tree_learner.cpp | 1085 +++++++++++++++++++++++ src/treelearner/cuda_tree_learner.h | 315 +++++++ src/treelearner/kernels/histogram256.cu | 372 ++++++++ src/treelearner/kernels/histogram256.hu | 179 ++++ 11 files changed, 2731 insertions(+) create mode 100755 build_LGBM.232.sh create mode 100644 include/LightGBM/cuda/cuda_utils.h create mode 100644 include/LightGBM/cuda/vector_cudahost.h create mode 100755 install_LGBM.232.sh create mode 100644 src/io/dense_nbits_bin.hpp create mode 100644 src/treelearner/cuda_kernel_launcher.cu create mode 100644 src/treelearner/cuda_kernel_launcher.h create mode 100644 src/treelearner/cuda_tree_learner.cpp create mode 100644 src/treelearner/cuda_tree_learner.h create mode 100644 src/treelearner/kernels/histogram256.cu create mode 100644 src/treelearner/kernels/histogram256.hu diff 
--git a/build_LGBM.232.sh b/build_LGBM.232.sh new file mode 100755 index 00000000000..24b50c7dfda --- /dev/null +++ b/build_LGBM.232.sh @@ -0,0 +1,7 @@ +#!/usr/bin/bash +rm -rf build +mkdir build +cd build +#cmake -DUSE_CUDA=1 .. +cmake .. +make -j40 diff --git a/include/LightGBM/cuda/cuda_utils.h b/include/LightGBM/cuda/cuda_utils.h new file mode 100644 index 00000000000..6d9407613f6 --- /dev/null +++ b/include/LightGBM/cuda/cuda_utils.h @@ -0,0 +1,38 @@ +/* + * ibmGBT: IBM CUDA Accelerated LightGBM + * + * IBM Confidential + * (C) Copyright IBM Corp. 2019. All Rights Reserved. + * + * The source code for this program is not published or otherwise + * divested of its trade secrets, irrespective of what has been + * deposited with the U.S. Copyright Office. + * + * US Government Users Restricted Rights - Use, duplication or + * disclosure restricted by GSA ADP Schedule Contract with IBM Corp. + */ + +#ifndef LGBM_CUDA_UTILS_H +#define LGBM_CUDA_UTILS_H + +//LGBM_CUDA + +#ifdef USE_CUDA + +#include +#include +#include + +#define CUDASUCCESS_OR_FATAL(ans) { gpuAssert((ans), __FILE__, __LINE__); } +inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) +{ + if (code != cudaSuccess) + { + LightGBM::Log::Fatal("CUDA_RUNTIME: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } +} + +#endif /* USE_CUDA */ + +#endif diff --git a/include/LightGBM/cuda/vector_cudahost.h b/include/LightGBM/cuda/vector_cudahost.h new file mode 100644 index 00000000000..b1a235e8a22 --- /dev/null +++ b/include/LightGBM/cuda/vector_cudahost.h @@ -0,0 +1,93 @@ +/* + * ibmGBT: IBM CUDA Accelerated LightGBM + * + * IBM Confidential + * (C) Copyright IBM Corp. 2019. All Rights Reserved. + * + * The source code for this program is not published or otherwise + * divested of its trade secrets, irrespective of what has been + * deposited with the U.S. Copyright Office. + * + * US Government Users Restricted Rights - Use, duplication or + * disclosure restricted by GSA ADP Schedule Contract with IBM Corp. 
+ */ + +#ifndef LGBM_CUDA_VECTOR_CH_H +#define LGBM_CUDA_VECTOR_CH_H + +#include +#include +#include + +//LGBM_CUDA + +namespace LightGBM { + +#define lgbm_device_cpu 0 +#define lgbm_device_gpu 1 +#define lgbm_device_cuda 2 + +#define use_cpu_learner 0 +#define use_gpu_learner 1 +#define use_cuda_learner 2 + +class LGBM_config_ { + public: + static int current_device; // Default: lgbm_device_cpu + static int current_learner; // Default: use_cpu_learner +}; + +} // namespace LightGBM + + +template +struct CHAllocator { + typedef T value_type; + CHAllocator() {} + template CHAllocator(const CHAllocator& other); + T* allocate(std::size_t n) + { + T* ptr; + if (n == 0) return NULL; + #ifdef USE_CUDA + if (LightGBM::LGBM_config_::current_device == lgbm_device_cuda){ + cudaError_t ret= cudaHostAlloc(&ptr, n*sizeof(T), cudaHostAllocPortable); + if (ret != cudaSuccess){ + ptr = (T*) malloc(n*sizeof(T)); + } + } + else{ + ptr = (T*) malloc(n*sizeof(T)); + } + #else + ptr = (T*) malloc(n*sizeof(T)); + #endif + return ptr; + } + + void deallocate(T* p, std::size_t n) + { + if (p==NULL) return; + #ifdef USE_CUDA + if (LightGBM::LGBM_config_::current_device == lgbm_device_cuda){ + cudaPointerAttributes attributes; + cudaPointerGetAttributes (&attributes, p); + if ((attributes.type == cudaMemoryTypeHost) && (attributes.devicePointer != NULL)){ + cudaFreeHost(p); + } + } + else{ + free(p); + } + #else + free(p); + #endif + } + +}; +template +bool operator==(const CHAllocator&, const CHAllocator&); +template +bool operator!=(const CHAllocator&, const CHAllocator&); + +#endif diff --git a/install_LGBM.232.sh b/install_LGBM.232.sh new file mode 100755 index 00000000000..7af586f4722 --- /dev/null +++ b/install_LGBM.232.sh @@ -0,0 +1,7 @@ +#!/usr/bin/bash +cd python-package +python setup.py bdist_wheel +pip uninstall -y lightgbm +cd dist +pip install lightgbm-*.whl +cd ../.. diff --git a/src/io/dense_nbits_bin.hpp b/src/io/dense_nbits_bin.hpp new file mode 100644 index 00000000000..adf99115626 --- /dev/null +++ b/src/io/dense_nbits_bin.hpp @@ -0,0 +1,405 @@ +/*! + * Copyright (c) 2017 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. 
+ */ +#ifndef LIGHTGBM_IO_DENSE_NBITS_BIN_HPP_ +#define LIGHTGBM_IO_DENSE_NBITS_BIN_HPP_ + +#include + +#include +#include +#include + +namespace LightGBM { + +class Dense4bitsBin; + +class Dense4bitsBinIterator : public BinIterator { + public: + explicit Dense4bitsBinIterator(const Dense4bitsBin* bin_data, uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) + : bin_data_(bin_data), min_bin_(static_cast(min_bin)), + max_bin_(static_cast(max_bin)), + default_bin_(static_cast(default_bin)) { + if (default_bin_ == 0) { + bias_ = 1; + } else { + bias_ = 0; + } + } + inline uint32_t RawGet(data_size_t idx) override; + inline uint32_t Get(data_size_t idx) override; + inline void Reset(data_size_t) override {} + + private: + const Dense4bitsBin* bin_data_; + uint8_t min_bin_; + uint8_t max_bin_; + uint8_t default_bin_; + uint8_t bias_; +}; + +class Dense4bitsBin : public Bin { + public: + friend Dense4bitsBinIterator; + Dense4bitsBin(data_size_t num_data) + : num_data_(num_data) { + int len = (num_data_ + 1) / 2; + data_ = std::vector(len, static_cast(0)); + buf_ = std::vector(len, static_cast(0)); + } + + ~Dense4bitsBin() { + } + + void Push(int, data_size_t idx, uint32_t value) override { + const int i1 = idx >> 1; + const int i2 = (idx & 1) << 2; + const uint8_t val = static_cast(value) << i2; + if (i2 == 0) { + data_[i1] = val; + } else { + buf_[i1] = val; + } + } + + void ReSize(data_size_t num_data) override { + if (num_data_ != num_data) { + num_data_ = num_data; + const int len = (num_data_ + 1) / 2; + data_.resize(len); + } + } + + inline BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const override; + + void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data, + const score_t* ordered_gradients, const score_t* ordered_hessians, + HistogramBinEntry* out) const override { + const data_size_t rest = num_data & 0x3; + data_size_t i = 0; + for (; i < num_data - rest; i += 4) { + const data_size_t idx0 = data_indices[i]; + const auto bin0 = (data_[idx0 >> 1] >> ((idx0 & 1) << 2)) & 0xf; + + const data_size_t idx1 = data_indices[i + 1]; + const auto bin1 = (data_[idx1 >> 1] >> ((idx1 & 1) << 2)) & 0xf; + + const data_size_t idx2 = data_indices[i + 2]; + const auto bin2 = (data_[idx2 >> 1] >> ((idx2 & 1) << 2)) & 0xf; + + const data_size_t idx3 = data_indices[i + 3]; + const auto bin3 = (data_[idx3 >> 1] >> ((idx3 & 1) << 2)) & 0xf; + + out[bin0].sum_gradients += ordered_gradients[i]; + out[bin1].sum_gradients += ordered_gradients[i + 1]; + out[bin2].sum_gradients += ordered_gradients[i + 2]; + out[bin3].sum_gradients += ordered_gradients[i + 3]; + + out[bin0].sum_hessians += ordered_hessians[i]; + out[bin1].sum_hessians += ordered_hessians[i + 1]; + out[bin2].sum_hessians += ordered_hessians[i + 2]; + out[bin3].sum_hessians += ordered_hessians[i + 3]; + + ++out[bin0].cnt; + ++out[bin1].cnt; + ++out[bin2].cnt; + ++out[bin3].cnt; + } + + for (; i < num_data; ++i) { + const data_size_t idx = data_indices[i]; + const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; + out[bin].sum_gradients += ordered_gradients[i]; + out[bin].sum_hessians += ordered_hessians[i]; + ++out[bin].cnt; + } + } + + void ConstructHistogram(data_size_t num_data, + const score_t* ordered_gradients, const score_t* ordered_hessians, + HistogramBinEntry* out) const override { + const data_size_t rest = num_data & 0x3; + data_size_t i = 0; + + for (; i < num_data - rest; i += 4) { + const auto bin0 = (data_[i >> 1]) & 0xf; + const auto bin1 = (data_[i >> 1] >> 
4) & 0xf; + const auto bin2 = (data_[(i >> 1) + 1]) & 0xf; + const auto bin3 = (data_[(i >> 1) + 1] >> 4) & 0xf; + + out[bin0].sum_gradients += ordered_gradients[i]; + out[bin1].sum_gradients += ordered_gradients[i + 1]; + out[bin2].sum_gradients += ordered_gradients[i + 2]; + out[bin3].sum_gradients += ordered_gradients[i + 3]; + + out[bin0].sum_hessians += ordered_hessians[i]; + out[bin1].sum_hessians += ordered_hessians[i + 1]; + out[bin2].sum_hessians += ordered_hessians[i + 2]; + out[bin3].sum_hessians += ordered_hessians[i + 3]; + + ++out[bin0].cnt; + ++out[bin1].cnt; + ++out[bin2].cnt; + ++out[bin3].cnt; + } + for (; i < num_data; ++i) { + const auto bin = (data_[i >> 1] >> ((i & 1) << 2)) & 0xf; + out[bin].sum_gradients += ordered_gradients[i]; + out[bin].sum_hessians += ordered_hessians[i]; + ++out[bin].cnt; + } + } + + void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data, + const score_t* ordered_gradients, + HistogramBinEntry* out) const override { + const data_size_t rest = num_data & 0x3; + data_size_t i = 0; + for (; i < num_data - rest; i += 4) { + const data_size_t idx0 = data_indices[i]; + const auto bin0 = (data_[idx0 >> 1] >> ((idx0 & 1) << 2)) & 0xf; + + const data_size_t idx1 = data_indices[i + 1]; + const auto bin1 = (data_[idx1 >> 1] >> ((idx1 & 1) << 2)) & 0xf; + + const data_size_t idx2 = data_indices[i + 2]; + const auto bin2 = (data_[idx2 >> 1] >> ((idx2 & 1) << 2)) & 0xf; + + const data_size_t idx3 = data_indices[i + 3]; + const auto bin3 = (data_[idx3 >> 1] >> ((idx3 & 1) << 2)) & 0xf; + + out[bin0].sum_gradients += ordered_gradients[i]; + out[bin1].sum_gradients += ordered_gradients[i + 1]; + out[bin2].sum_gradients += ordered_gradients[i + 2]; + out[bin3].sum_gradients += ordered_gradients[i + 3]; + + ++out[bin0].cnt; + ++out[bin1].cnt; + ++out[bin2].cnt; + ++out[bin3].cnt; + } + + for (; i < num_data; ++i) { + const data_size_t idx = data_indices[i]; + const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; + out[bin].sum_gradients += ordered_gradients[i]; + ++out[bin].cnt; + } + } + + void ConstructHistogram(data_size_t num_data, + const score_t* ordered_gradients, + HistogramBinEntry* out) const override { + const data_size_t rest = num_data & 0x3; + data_size_t i = 0; + for (; i < num_data - rest; i += 4) { + const auto bin0 = (data_[i >> 1]) & 0xf; + const auto bin1 = (data_[i >> 1] >> 4) & 0xf; + const auto bin2 = (data_[(i >> 1) + 1]) & 0xf; + const auto bin3 = (data_[(i >> 1) + 1] >> 4) & 0xf; + + out[bin0].sum_gradients += ordered_gradients[i]; + out[bin1].sum_gradients += ordered_gradients[i + 1]; + out[bin2].sum_gradients += ordered_gradients[i + 2]; + out[bin3].sum_gradients += ordered_gradients[i + 3]; + + ++out[bin0].cnt; + ++out[bin1].cnt; + ++out[bin2].cnt; + ++out[bin3].cnt; + } + for (; i < num_data; ++i) { + const auto bin = (data_[i >> 1] >> ((i & 1) << 2)) & 0xf; + out[bin].sum_gradients += ordered_gradients[i]; + ++out[bin].cnt; + } + } + + virtual data_size_t Split( + uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, MissingType missing_type, bool default_left, + uint32_t threshold, data_size_t* data_indices, data_size_t num_data, + data_size_t* lte_indices, data_size_t* gt_indices) const override { + if (num_data <= 0) { return 0; } + uint8_t th = static_cast(threshold + min_bin); + const uint8_t minb = static_cast(min_bin); + const uint8_t maxb = static_cast(max_bin); + uint8_t t_default_bin = static_cast(min_bin + default_bin); + if (default_bin == 0) { + th -= 1; + t_default_bin -= 1; + } + 
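+    // Packing used below: two 4-bit bins share one byte, with even indices in the low
+    // nibble and odd indices in the high nibble, so bin(idx) = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf.
+    // When default_bin == 0 the stored bins carry a bias of 1 (see Dense4bitsBinIterator),
+    // which is why th and t_default_bin were shifted down by one above.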
data_size_t lte_count = 0;
+    data_size_t gt_count = 0;
+    data_size_t* default_indices = gt_indices;
+    data_size_t* default_count = &gt_count;
+    if (missing_type == MissingType::NaN) {
+      if (default_bin <= threshold) {
+        default_indices = lte_indices;
+        default_count = &lte_count;
+      }
+      data_size_t* missing_default_indices = gt_indices;
+      data_size_t* missing_default_count = &gt_count;
+      if (default_left) {
+        missing_default_indices = lte_indices;
+        missing_default_count = &lte_count;
+      }
+      for (data_size_t i = 0; i < num_data; ++i) {
+        const data_size_t idx = data_indices[i];
+        const uint8_t bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
+        if (bin < minb || bin > maxb || t_default_bin == bin) {
+          default_indices[(*default_count)++] = idx;
+        } else if (bin == maxb) {
+          missing_default_indices[(*missing_default_count)++] = idx;
+        } else if (bin > th) {
+          gt_indices[gt_count++] = idx;
+        } else {
+          lte_indices[lte_count++] = idx;
+        }
+      }
+    } else {
+      if ((default_left && missing_type == MissingType::Zero) || (default_bin <= threshold && missing_type != MissingType::Zero)) {
+        default_indices = lte_indices;
+        default_count = &lte_count;
+      }
+      for (data_size_t i = 0; i < num_data; ++i) {
+        const data_size_t idx = data_indices[i];
+        const uint8_t bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
+        if (bin < minb || bin > maxb || t_default_bin == bin) {
+          default_indices[(*default_count)++] = idx;
+        } else if (bin > th) {
+          gt_indices[gt_count++] = idx;
+        } else {
+          lte_indices[lte_count++] = idx;
+        }
+      }
+    }
+    return lte_count;
+  }
+
+  virtual data_size_t SplitCategorical(
+    uint32_t min_bin, uint32_t max_bin, uint32_t default_bin,
+    const uint32_t* threshold, int num_threshold, data_size_t* data_indices, data_size_t num_data,
+    data_size_t* lte_indices, data_size_t* gt_indices) const override {
+    if (num_data <= 0) { return 0; }
+    data_size_t lte_count = 0;
+    data_size_t gt_count = 0;
+    data_size_t* default_indices = gt_indices;
+    data_size_t* default_count = &gt_count;
+    if (Common::FindInBitset(threshold, num_threshold, default_bin)) {
+      default_indices = lte_indices;
+      default_count = &lte_count;
+    }
+    for (data_size_t i = 0; i < num_data; ++i) {
+      const data_size_t idx = data_indices[i];
+      const uint32_t bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
+      if (bin < min_bin || bin > max_bin) {
+        default_indices[(*default_count)++] = idx;
+      } else if (Common::FindInBitset(threshold, num_threshold, bin - min_bin)) {
+        lte_indices[lte_count++] = idx;
+      } else {
+        gt_indices[gt_count++] = idx;
+      }
+    }
+    return lte_count;
+  }
+
+  data_size_t num_data() const override { return num_data_; }
+
+  // LGBM_CUDA
+  void* get_data() override { return data_.data(); }
+
+  /*! 
\brief not ordered bin for dense feature */ + OrderedBin* CreateOrderedBin() const override { return nullptr; } + + void FinishLoad() override { + if (buf_.empty()) { return; } + int len = (num_data_ + 1) / 2; + for (int i = 0; i < len; ++i) { + data_[i] |= buf_[i]; + } + buf_.clear(); + } + + void LoadFromMemory(const void* memory, const std::vector& local_used_indices) override { + const uint8_t* mem_data = reinterpret_cast(memory); + if (!local_used_indices.empty()) { + const data_size_t rest = num_data_ & 1; + for (int i = 0; i < num_data_ - rest; i += 2) { + // get old bins + data_size_t idx = local_used_indices[i]; + const auto bin1 = static_cast((mem_data[idx >> 1] >> ((idx & 1) << 2)) & 0xf); + idx = local_used_indices[i + 1]; + const auto bin2 = static_cast((mem_data[idx >> 1] >> ((idx & 1) << 2)) & 0xf); + // add + const int i1 = i >> 1; + data_[i1] = (bin1 | (bin2 << 4)); + } + if (rest) { + data_size_t idx = local_used_indices[num_data_ - 1]; + data_[num_data_ >> 1] = (mem_data[idx >> 1] >> ((idx & 1) << 2)) & 0xf; + } + } else { + for (size_t i = 0; i < data_.size(); ++i) { + data_[i] = mem_data[i]; + } + } + } + + void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override { + auto other_bin = dynamic_cast(full_bin); + const data_size_t rest = num_used_indices & 1; + for (int i = 0; i < num_used_indices - rest; i += 2) { + data_size_t idx = used_indices[i]; + const auto bin1 = static_cast((other_bin->data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf); + idx = used_indices[i + 1]; + const auto bin2 = static_cast((other_bin->data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf); + const int i1 = i >> 1; + data_[i1] = (bin1 | (bin2 << 4)); + } + if (rest) { + data_size_t idx = used_indices[num_used_indices - 1]; + data_[num_used_indices >> 1] = (other_bin->data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; + } + } + + void SaveBinaryToFile(const VirtualFileWriter* writer) const override { + writer->Write(data_.data(), sizeof(uint8_t) * data_.size()); + } + + size_t SizesInByte() const override { + return sizeof(uint8_t) * data_.size(); + } + + Dense4bitsBin* Clone() override { + return new Dense4bitsBin(*this); + } + + protected: + Dense4bitsBin(const Dense4bitsBin& other) + : num_data_(other.num_data_), data_(other.data_), buf_(other.buf_) {} + + data_size_t num_data_; + std::vector data_; + std::vector buf_; +}; + +uint32_t Dense4bitsBinIterator::Get(data_size_t idx) { + const auto bin = (bin_data_->data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; + if (bin >= min_bin_ && bin <= max_bin_) { + return bin - min_bin_ + bias_; + } else { + return default_bin_; + } +} + +uint32_t Dense4bitsBinIterator::RawGet(data_size_t idx) { + return (bin_data_->data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; +} + +inline BinIterator* Dense4bitsBin::GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const { + return new Dense4bitsBinIterator(this, min_bin, max_bin, default_bin); +} + +} // namespace LightGBM +#endif // LIGHTGBM_IO_DENSE_NBITS_BIN_HPP_ diff --git a/src/treelearner/cuda_kernel_launcher.cu b/src/treelearner/cuda_kernel_launcher.cu new file mode 100644 index 00000000000..d084abe4f23 --- /dev/null +++ b/src/treelearner/cuda_kernel_launcher.cu @@ -0,0 +1,166 @@ +#ifdef USE_CUDA + +#include "cuda_kernel_launcher.h" +#include +#include +#include + +using namespace LightGBM; + +void cuda_histogram( + data_size_t leaf_num_data, + data_size_t num_data, + bool use_all_features, + bool is_constant_hessian, + int num_workgroups, + cudaStream_t stream, + 
uint8_t* arg0, + uint8_t* arg1, + data_size_t arg2, + data_size_t* arg3, + data_size_t arg4, + score_t* arg5, + score_t* arg6, + score_t arg6_const, + char* arg7, + volatile int* arg8, + void* arg9, + size_t exp_workgroups_per_feature) { + + + if (leaf_num_data == num_data) { + + if (use_all_features){ + if (!is_constant_hessian) { + histogram256<<>>( + arg0, + arg1, + arg2, + reinterpret_cast(arg3), + arg4, + arg5, + static_cast(arg6), + arg7, + arg8, + static_cast(arg9), + exp_workgroups_per_feature + ); + } + else { + histogram256<<>>( + arg0, + arg1, + arg2, + reinterpret_cast(arg3), + arg4, + arg5, + arg6_const, + arg7, + arg8, + static_cast(arg9), + exp_workgroups_per_feature + ); + } + } + else{ + if (!is_constant_hessian) { + histogram256_fulldata<<>>( + arg0, + arg1, + arg2, + reinterpret_cast(arg3), + arg4, + arg5, + static_cast(arg6), + arg7, + arg8, + static_cast(arg9), + exp_workgroups_per_feature); + } + else { + histogram256_fulldata<<>>( + arg0, + arg1, + arg2, + reinterpret_cast(arg3), + arg4, + arg5, + arg6_const, + arg7, + arg8, + static_cast(arg9), + exp_workgroups_per_feature); + } + } + } + else { + if (use_all_features) { + // seems all features is always enabled, so this should be the same as fulldata + if (!is_constant_hessian) { + + histogram256<<>>( + arg0, + arg1, + arg2, + reinterpret_cast(arg3), + arg4, + arg5, + static_cast(arg6), + arg7, + arg8, + static_cast(arg9), + exp_workgroups_per_feature + ); + } + else { + histogram256<<>>( + arg0, + arg1, + arg2, + reinterpret_cast(arg3), + arg4, + arg5, + arg6_const, + arg7, + arg8, + static_cast(arg9), + exp_workgroups_per_feature + ); + } + } + else { + if (!is_constant_hessian) { + histogram256<<>>( + arg0, + arg1, + arg2, + reinterpret_cast(arg3), + arg4, + arg5, + static_cast(arg6), + arg7, + arg8, + static_cast(arg9), + exp_workgroups_per_feature + ); + } + else { + histogram256<<>>( + arg0, + arg1, + arg2, + reinterpret_cast(arg3), + arg4, + arg5, + arg6_const, + arg7, + arg8, + static_cast(arg9), + exp_workgroups_per_feature + ); + } + } + } +} + +#endif // USE_CUDA diff --git a/src/treelearner/cuda_kernel_launcher.h b/src/treelearner/cuda_kernel_launcher.h new file mode 100644 index 00000000000..ae7d3498e83 --- /dev/null +++ b/src/treelearner/cuda_kernel_launcher.h @@ -0,0 +1,64 @@ +#ifndef LGBM_KERNEL_LAUNCHER +#define LGBM_KERNEL_LAUNCHER + +#ifdef USE_CUDA +// what should I include?? 
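+// ThreadData (below) bundles everything one host thread needs to drive a single GPU:
+// the device id, that device's stream and events, and the argument list of cuda_histogram().
+// cuda_tree_learner.cpp fills one ThreadData per device and runs launch_cuda_histogram()
+// for it on its own pthread.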
+#include "kernels/histogram256.hu" // kernel, acc_type, data_size_t, uchar, score_t +#include + +struct ThreadData { + // device id + int device_id; + // parameters for cuda_histogram + data_size_t leaf_num_data; + data_size_t num_data; + bool use_all_features; + bool is_constant_hessian; + int num_workgroups; + cudaStream_t stream; + uint8_t* device_features; + uint8_t* device_feature_masks; + //data_size_t num_data; + data_size_t* device_data_indices; + //data_size_t leaf_num_data; + score_t* device_gradients; + score_t* device_hessians; + score_t hessians_const; + char* device_subhistograms; + volatile int* sync_counters; + void* device_histogram_outputs; + size_t exp_workgroups_per_feature; + // cuda events + cudaEvent_t* kernel_start; + cudaEvent_t* kernel_wait_obj; + std::chrono::duration* kernel_input_wait_time; + // copy histogram + size_t output_size; + char* host_histogram_output; + cudaEvent_t* histograms_wait_obj; +}; + + +void cuda_histogram( + data_size_t leaf_num_data, + data_size_t num_data, + bool use_all_features, + bool is_constant_hessian, + int num_workgroups, + cudaStream_t stream, + uint8_t* arg0, + uint8_t* arg1, + data_size_t arg2, + data_size_t* arg3, + data_size_t arg4, + score_t* arg5, + score_t* arg6, + score_t arg6_const, + char* arg7, + volatile int* arg8, + void* arg9, + size_t exp_workgroups_per_feature); + + +#endif //USE_CUDA +#endif // LGBM_KERNEL_LAUNCHER diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp new file mode 100644 index 00000000000..c45df55cacf --- /dev/null +++ b/src/treelearner/cuda_tree_learner.cpp @@ -0,0 +1,1085 @@ +#ifdef USE_CUDA +#include "cuda_tree_learner.h" +#include "../io/dense_bin.hpp" +#include "../io/dense_nbits_bin.hpp" + +#include +#include +#include + +#include +#include + +#include + +#define cudaMemcpy_DEBUG 0 // 1: DEBUG cudaMemcpy +#define ResetTrainingData_DEBUG 0 // 1: Debug ResetTrainingData + +#include + +#define GPU_DEBUG 0 + +static void *launch_cuda_histogram(void *thread_data) { + ThreadData td = *(ThreadData*)thread_data; + int device_id = td.device_id; + CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); + + // launch cuda kernel + cuda_histogram(td.leaf_num_data, td.num_data, td.use_all_features, + td.is_constant_hessian, td.num_workgroups, td.stream, + td.device_features, + td.device_feature_masks, + td.num_data, + reinterpret_cast(td.device_data_indices), + td.leaf_num_data, + td.device_gradients, + td.device_hessians, td.hessians_const, + td.device_subhistograms, td.sync_counters, + td.device_histogram_outputs, + td.exp_workgroups_per_feature); + + CUDASUCCESS_OR_FATAL(cudaGetLastError()); + + return NULL; +} + +/* +static void *wait_event(void *wait_obj) { + CUDASUCCESS_OR_FATAL(cudaEventSynchronize(*(cudaEvent_t *)wait_obj)); +}*/ + +namespace LightGBM { + +CUDATreeLearner::CUDATreeLearner(const Config* config) + :SerialTreeLearner(config) { + use_bagging_ = false; + nthreads_ = 0; + if(config->gpu_use_dp && USE_DP_FLOAT) Log::Info("LightGBM-CUDA using CUDA trainer with DP float!!"); + else Log::Info("LightGBM-CUDA using CUDA trainer with SP float!!"); +} + +CUDATreeLearner::~CUDATreeLearner() { +} + + +void CUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) { + + + // initialize SerialTreeLearner + SerialTreeLearner::Init(train_data, is_constant_hessian, is_use_subset); + + // some additional variables needed for GPU trainer + num_feature_groups_ = train_data_->num_feature_groups(); + + + // LGBM_CUDA: use subset of 
training data for bagging
+  is_use_subset_ = is_use_subset;
+
+  // Initialize GPU buffers and kernels (LGBM_CUDA: also queries device info)
+  InitGPU(config_->num_gpu);  // LGBM_CUDA
+
+}
+
+// some functions used for debugging the GPU histogram construction
+
+void PrintHistograms(HistogramBinEntry* h, size_t size) {
+  size_t total = 0;
+  for (size_t i = 0; i < size; ++i) {
+    printf("%03lu=%9.3g,%9.3g,%7d\t", i, h[i].sum_gradients, h[i].sum_hessians, h[i].cnt);
+    total += h[i].cnt;
+    if ((i & 3) == 3)
+      printf("\n");
+  }
+  printf("\nTotal examples: %lu\n", total);
+}
+
+union Float_t {
+  int64_t i;
+  double f;
+  static int64_t ulp_diff(Float_t a, Float_t b) {
+    return (a.i > b.i) ? (a.i - b.i) : (b.i - a.i);
+  }
+};
+
+void CompareHistograms(HistogramBinEntry* h1, HistogramBinEntry* h2, size_t size, int feature_id) {
+  size_t i;
+  Float_t a, b;
+  for (i = 0; i < size; ++i) {
+    a.f = h1[i].sum_gradients;
+    b.f = h2[i].sum_gradients;
+    int64_t ulps = Float_t::ulp_diff(a, b);
+    if (h1[i].cnt != h2[i].cnt) {
+      printf("idx: %lu, %d != %d, (diff: %d, err_rate: %f)\n", i, h1[i].cnt, h2[i].cnt, h1[i].cnt - h2[i].cnt, (float)(h1[i].cnt - h2[i].cnt)/h2[i].cnt);
+      goto err;
+    } else {
+      printf("idx: %lu, %d == %d\n", i, h1[i].cnt, h2[i].cnt);
+      printf("idx: %lu, pass\n", i);
+    }
+    if (ulps > 0) {
+      printf("idx: %lu, grad %g != %g\n", i, h1[i].sum_gradients, h2[i].sum_gradients);
+      //printf("idx: %ld, grad %g != %g (%d ULPs)\n", i, h1[i].sum_gradients, h2[i].sum_gradients, ulps);
+      goto err;
+    }
+    a.f = h1[i].sum_hessians;
+    b.f = h2[i].sum_hessians;
+    ulps = Float_t::ulp_diff(a, b);
+    if (ulps > 0) {
+      printf("idx: %lu, hessian %g != %g\n", i, h1[i].sum_hessians, h2[i].sum_hessians);
+      //printf("idx: %ld, hessian %g != %g (%d ULPs)\n", i, h1[i].sum_hessians, h2[i].sum_hessians, ulps);
+      // goto err;
+    }
+  }
+  return;
+err:
+  Log::Warning("Mismatched histograms found for feature %d at location %lu.", feature_id, i);
+}
+
+int CUDATreeLearner::GetNumWorkgroupsPerFeature(data_size_t leaf_num_data) {
+
+  // we roughly want 256 workgroups per device, and we have num_dense_feature_groups_ feature groups. 
+ // also guarantee that there are at least 2K examples per workgroup + + double x = 256.0 / num_dense_feature_groups_; + + int exp_workgroups_per_feature = (int)ceil(log2(x)); + double t = leaf_num_data / 1024.0; + + Log::Debug("We can have at most %d workgroups per feature4 for efficiency reasons" + "Best workgroup size per feature for full utilization is %d\n", (int)ceil(t), (1 << exp_workgroups_per_feature)); + + exp_workgroups_per_feature = std::min(exp_workgroups_per_feature, (int)ceil(log((double)t)/log(2.0))); + if (exp_workgroups_per_feature < 0) + exp_workgroups_per_feature = 0; + if (exp_workgroups_per_feature > kMaxLogWorkgroupsPerFeature) + exp_workgroups_per_feature = kMaxLogWorkgroupsPerFeature; + + return exp_workgroups_per_feature; +} + +void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_features) { + + // we have already copied ordered gradients, ordered hessians and indices to GPU + // decide the best number of workgroups working on one feature4 tuple + // set work group size based on feature size + // each 2^exp_workgroups_per_feature workgroups work on a feature4 tuple + + + int exp_workgroups_per_feature = GetNumWorkgroupsPerFeature(leaf_num_data); + std::vector num_gpu_workgroups; + ThreadData *thread_data = (ThreadData*)malloc(sizeof(ThreadData) * num_gpu_); + + for(int device_id = 0; device_id < num_gpu_; ++device_id) { + int num_gpu_feature_groups = num_gpu_feature_groups_[device_id]; + int num_workgroups = (1 << exp_workgroups_per_feature) * num_gpu_feature_groups; + num_gpu_workgroups.push_back(num_workgroups); + if (num_workgroups > preallocd_max_num_wg_[device_id]) { + preallocd_max_num_wg_.at(device_id) = num_workgroups; + CUDASUCCESS_OR_FATAL(cudaFree(device_subhistograms_[device_id])); + cudaMalloc(&(device_subhistograms_[device_id]), (size_t) num_workgroups * dword_features_ * device_bin_size_ * hist_bin_entry_sz_); + } + //set thread_data + SetThreadData(thread_data, device_id, leaf_num_data, use_all_features, + num_workgroups, exp_workgroups_per_feature); + } + + + for(int device_id = 0; device_id < num_gpu_; ++device_id) { + if (pthread_create(cpu_threads_[device_id], NULL, launch_cuda_histogram, (void *)(&thread_data[device_id]))){ + fprintf(stderr, "Error in creating threads. Exiting\n"); + exit(0); + } + } + + /* Wait for the threads to finish */ + + for(int device_id = 0; device_id < num_gpu_; ++device_id) { + if (pthread_join(*(cpu_threads_[device_id]), NULL)){ + fprintf(stderr, "Error in joining threads. Exiting\n"); + exit(0); + } + } + + for(int device_id = 0; device_id < num_gpu_; ++device_id) { + + // copy the results asynchronously. 
Size depends on if double precision is used + + size_t output_size = num_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_; + size_t host_output_offset = offset_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_; + + + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync((char*)host_histogram_outputs_ + host_output_offset, device_histogram_outputs_[device_id], output_size, cudaMemcpyDeviceToHost, stream_[device_id])); + + + CUDASUCCESS_OR_FATAL(cudaEventRecord(histograms_wait_obj_[device_id], stream_[device_id])); + } + +} + + +template +void CUDATreeLearner::WaitAndGetHistograms(HistogramBinEntry* histograms) { + HistType* hist_outputs = (HistType*) host_histogram_outputs_; + + //#pragma omp parallel for schedule(static, num_gpu_) + for(int device_id = 0; device_id < num_gpu_; ++device_id) { + + auto start_time = std::chrono::steady_clock::now(); + + // when the output is ready, the computation is done + CUDASUCCESS_OR_FATAL(cudaEventSynchronize(histograms_wait_obj_[device_id])); + } + + #pragma omp parallel for schedule(static) + for(int i = 0; i < num_dense_feature_groups_; ++i) { + if (!feature_masks_[i]) { + continue; + } + int dense_group_index = dense_feature_group_map_[i]; + auto old_histogram_array = histograms + train_data_->GroupBinBoundary(dense_group_index); + int bin_size = train_data_->FeatureGroupNumBin(dense_group_index); + + for (int j = 0; j < bin_size; ++j) { + old_histogram_array[j].sum_gradients = hist_outputs[i * device_bin_size_+ j].sum_gradients; + old_histogram_array[j].sum_hessians = hist_outputs[i * device_bin_size_ + j].sum_hessians; + old_histogram_array[j].cnt = (data_size_t)hist_outputs[i * device_bin_size_ + j].cnt; + } + } + +} + +// LGBM_CUDA +void CUDATreeLearner::CountDenseFeatureGroups() { + + num_dense_feature_groups_ = 0; + + for (int i = 0; i < num_feature_groups_; ++i) { + if (ordered_bins_[i] == nullptr) { + num_dense_feature_groups_++; + } + } + if (!num_dense_feature_groups_) { + Log::Warning("GPU acceleration is disabled because no non-trival dense features can be found"); + } + +} + +// LGBM_CUDA +void CUDATreeLearner::prevAllocateGPUMemory() { + + + // how many feature-group tuples we have + // leave some safe margin for prefetching + // 256 work-items per workgroup. Each work-item prefetches one tuple for that feature + + allocated_num_data_ = num_data_ + 256 * (1 << kMaxLogWorkgroupsPerFeature); + + // clear sparse/dense maps + + dense_feature_group_map_.clear(); + sparse_feature_group_map_.clear(); + + // do nothing it there is no dense feature + if (!num_dense_feature_groups_) { + return; + } + + // LGBM_CUDA: calculate number of feature groups per gpu + num_gpu_feature_groups_.resize(num_gpu_); + offset_gpu_feature_groups_.resize(num_gpu_); + int num_features_per_gpu = num_dense_feature_groups_ / num_gpu_; + int remain_features = num_dense_feature_groups_ - num_features_per_gpu * num_gpu_; + + int offset = 0; + + for(int i = 0; i < num_gpu_; ++i) { + offset_gpu_feature_groups_.at(i) = offset; + num_gpu_feature_groups_.at(i) = (i < remain_features)? 
num_features_per_gpu + 1 : num_features_per_gpu; + offset += num_gpu_feature_groups_.at(i); + } + +#if 0 + // allocate feature mask, for disabling some feature-groups' histogram calculation + if (feature_masks_.data() != NULL) { + cudaPointerAttributes attributes; + cudaPointerGetAttributes (&attributes, feature_masks_.data()); + + if ((attributes.type == cudaMemoryTypeHost) && (attributes.devicePointer != NULL)){ + CUDASUCCESS_OR_FATAL(cudaHostUnregister(feature_masks_.data())); + } + } +#endif + + feature_masks_.resize(num_dense_feature_groups_); + Log::Debug("Resized feature masks"); + + ptr_pinned_feature_masks_ = feature_masks_.data(); + Log::Debug("Memset pinned_feature_masks_"); + memset(ptr_pinned_feature_masks_, 0, num_dense_feature_groups_); + + // histogram bin entry size depends on the precision (single/double) + hist_bin_entry_sz_ = config_->gpu_use_dp ? sizeof(HistogramBinEntry) : sizeof(GPUHistogramBinEntry); + + // host_size histogram outputs + // host_histogram_outputs_ = malloc(num_dense_feature_groups_ * device_bin_size_ * hist_bin_entry_sz_); + + CUDASUCCESS_OR_FATAL(cudaHostAlloc( (void **)&host_histogram_outputs_, (size_t)(num_dense_feature_groups_ * device_bin_size_ * hist_bin_entry_sz_),cudaHostAllocPortable)); + + // LGBM_CUDA + nthreads_ = std::min(omp_get_max_threads(), num_dense_feature_groups_ / dword_features_); + nthreads_ = std::max(nthreads_, 1); +} + +// LGBM_CUDA: allocate GPU memory for each GPU +void CUDATreeLearner::AllocateGPUMemory() { + + + #pragma omp parallel for schedule(static, num_gpu_) + + for(int device_id = 0; device_id < num_gpu_; ++device_id) { + // do nothing it there is no gpu feature + int num_gpu_feature_groups = num_gpu_feature_groups_[device_id]; + if (num_gpu_feature_groups) { + CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); + + // allocate memory for all features (FIXME: 4 GB barrier on some devices, need to split to multiple buffers) + if ( device_features_[device_id] != NULL ) { + CUDASUCCESS_OR_FATAL(cudaFree(device_features_[device_id])); + } + + CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_features_[device_id]), (size_t)num_gpu_feature_groups * num_data_ * sizeof(uint8_t))); + Log::Debug("Allocated device_features_ addr=%p sz=%lu", device_features_[device_id], num_gpu_feature_groups * num_data_); + + // allocate space for gradients and hessians on device + // we will copy gradients and hessians in after ordered_gradients_ and ordered_hessians_ are constructed + + if (device_gradients_[device_id] != NULL){ + CUDASUCCESS_OR_FATAL(cudaFree(device_gradients_[device_id])); + } + + if (device_hessians_[device_id] != NULL){ + CUDASUCCESS_OR_FATAL(cudaFree(device_hessians_[device_id])); + } + + if (device_feature_masks_[device_id] != NULL){ + CUDASUCCESS_OR_FATAL(cudaFree(device_feature_masks_[device_id])); + } + + CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_gradients_[device_id]), (size_t) allocated_num_data_ * sizeof(score_t))); + CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_hessians_[device_id]), (size_t) allocated_num_data_ * sizeof(score_t))); + + CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_feature_masks_[device_id]), (size_t) num_gpu_feature_groups)); + + // copy indices to the device + + if (device_feature_masks_[device_id] != NULL){ + CUDASUCCESS_OR_FATAL(cudaFree(device_data_indices_[device_id])); + } + + CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_data_indices_[device_id]), (size_t) allocated_num_data_ * sizeof(data_size_t))); + CUDASUCCESS_OR_FATAL(cudaMemsetAsync(device_data_indices_[device_id], 0, allocated_num_data_ * 
sizeof(data_size_t), stream_[device_id])); + + Log::Debug("Memset device_data_indices_"); + + // create output buffer, each feature has a histogram with device_bin_size_ bins, + // each work group generates a sub-histogram of dword_features_ features. + + if (!device_subhistograms_[device_id]) { + + // only initialize once here, as this will not need to change when ResetTrainingData() is called + CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_subhistograms_[device_id]), (size_t) preallocd_max_num_wg_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_)); + + Log::Debug("created device_subhistograms_: %p", device_subhistograms_[device_id]); + + } + + // create atomic counters for inter-group coordination + CUDASUCCESS_OR_FATAL(cudaFree(sync_counters_[device_id])); + CUDASUCCESS_OR_FATAL(cudaMalloc(&(sync_counters_[device_id]), (size_t) num_gpu_feature_groups * sizeof(int))); + CUDASUCCESS_OR_FATAL(cudaMemsetAsync(sync_counters_[device_id], 0, num_gpu_feature_groups * sizeof(int))); + + // The output buffer is allocated to host directly, to overlap compute and data transfer + CUDASUCCESS_OR_FATAL(cudaFree(device_histogram_outputs_[device_id])); + CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_histogram_outputs_[device_id]), (size_t) num_gpu_feature_groups * device_bin_size_ * hist_bin_entry_sz_)); + } + } + +} + +void CUDATreeLearner::ResetGPUMemory() { + + // clear sparse/dense maps + dense_feature_group_map_.clear(); + sparse_feature_group_map_.clear(); + +} + +// LGBM_CUDA +void CUDATreeLearner::copyDenseFeature() { + + if (num_feature_groups_ == 0){ + LGBM_config_::current_learner=use_cpu_learner; + return; + } + +// auto start_time = std::chrono::steady_clock::now(); + Log::Debug("Started copying dense features from CPU to GPU"); + // find the dense feature-groups and group then into Feature4 data structure (several feature-groups packed into 4 bytes) + size_t copied_feature = 0; + // set device info + int device_id = 0; + uint8_t* device_features = device_features_[device_id]; + Log::Debug("Started copying dense features from CPU to GPU - 1"); + + for (int i = 0; i < num_feature_groups_; ++i) { + // looking for dword_features_ non-sparse feature-groups + if (ordered_bins_[i] == nullptr) { + dense_feature_group_map_.push_back(i); + auto sizes_in_byte = train_data_->FeatureGroupSizesInByte(i); + void* tmp_data = train_data_->FeatureGroupData(i); + Log::Debug("Started copying dense features from CPU to GPU - 2"); + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(&device_features[copied_feature * num_data_], tmp_data, sizes_in_byte, cudaMemcpyHostToDevice, stream_[device_id])); + Log::Debug("Started copying dense features from CPU to GPU - 3"); + copied_feature++; + // reset device info + if(copied_feature == (size_t) num_gpu_feature_groups_[device_id]) { + CUDASUCCESS_OR_FATAL(cudaEventRecord(features_future_[device_id], stream_[device_id])); + device_id += 1; + copied_feature = 0; + if(device_id < num_gpu_) { + device_features = device_features_[device_id]; + //CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); + } + } + } + else { + sparse_feature_group_map_.push_back(i); + } + } + + // data transfer time // LGBM_CUDA: async copy, so it is not the real data transfer time + // std::chrono::duration end_time = std::chrono::steady_clock::now() - start_time; + +} + + + +// LGBM_CUDA: InitGPU w/ num_gpu +void CUDATreeLearner::InitGPU(int num_gpu) { + + // Get the max bin size, used for selecting best GPU kernel + + max_num_bin_ = 0; + + #if GPU_DEBUG >= 1 + printf("bin_size: "); + #endif + for (int i 
= 0; i < num_feature_groups_; ++i) { + #if GPU_DEBUG >= 1 + printf("%d, ", train_data_->FeatureGroupNumBin(i)); + #endif + max_num_bin_ = std::max(max_num_bin_, train_data_->FeatureGroupNumBin(i)); + } + + if (max_num_bin_ <= 16) { + device_bin_size_ = 256; //LGBM_CUDA + dword_features_ = 1; // LGBM_CUDA + } + else if (max_num_bin_ <= 64) { + device_bin_size_ = 256; //LGBM_CUDA + dword_features_ = 1; // LGBM_CUDA + } + else if ( max_num_bin_ <= 256) { + Log::Debug("device_bin_size_ = 256"); + device_bin_size_ = 256; + dword_features_ = 1; // LGBM_CUDA + } + else { + Log::Fatal("bin size %d cannot run on GPU", max_num_bin_); + } + if(max_num_bin_ == 65) { + Log::Warning("Setting max_bin to 63 is sugguested for best performance"); + } + if(max_num_bin_ == 17) { + Log::Warning("Setting max_bin to 15 is sugguested for best performance"); + } + + // LGBM_CUDA: get num_dense_feature_groups_ + CountDenseFeatureGroups(); + + + if (num_gpu > num_dense_feature_groups_) num_gpu = num_dense_feature_groups_; + + // LGBM_CUDA: initialize GPU + int gpu_count; + + CUDASUCCESS_OR_FATAL(cudaGetDeviceCount(&gpu_count)); + num_gpu_ = (gpu_count < num_gpu)? gpu_count : num_gpu; + + // LGBM_CUDA: set cpu threads + cpu_threads_ = (pthread_t **)malloc(sizeof(pthread_t *)*num_gpu_); + for(int device_id = 0; device_id < num_gpu_; ++device_id) { + cpu_threads_[device_id] = (pthread_t *)malloc(sizeof(pthread_t)); + } + + // LGBM_CUDA: resize device memory pointers + device_features_.resize(num_gpu_); + device_gradients_.resize(num_gpu_); + device_hessians_.resize(num_gpu_); + device_feature_masks_.resize(num_gpu_); + device_data_indices_.resize(num_gpu_); + sync_counters_.resize(num_gpu_); + device_subhistograms_.resize(num_gpu_); + device_histogram_outputs_.resize(num_gpu_); + + // LGBM_CUDA: create stream & events to handle multiple GPUs + preallocd_max_num_wg_.resize(num_gpu_, 1024); + stream_.resize(num_gpu_); + hessians_future_.resize(num_gpu_); + gradients_future_.resize(num_gpu_); + indices_future_.resize(num_gpu_); + features_future_.resize(num_gpu_); + kernel_start_.resize(num_gpu_); + kernel_wait_obj_.resize(num_gpu_); + histograms_wait_obj_.resize(num_gpu_); + + // for debuging + kernel_time_.resize(num_gpu_, 0); + kernel_input_wait_time_.resize(num_gpu_, std::chrono::milliseconds(0)); + + for(int i = 0; i < num_gpu_; ++i) { + CUDASUCCESS_OR_FATAL(cudaSetDevice(i)); + CUDASUCCESS_OR_FATAL(cudaStreamCreate(&(stream_[i]))); + CUDASUCCESS_OR_FATAL(cudaEventCreate(&(hessians_future_[i]))); + CUDASUCCESS_OR_FATAL(cudaEventCreate(&(gradients_future_[i]))); + CUDASUCCESS_OR_FATAL(cudaEventCreate(&(indices_future_[i]))); + CUDASUCCESS_OR_FATAL(cudaEventCreate(&(features_future_[i]))); + CUDASUCCESS_OR_FATAL(cudaEventCreate(&(kernel_start_[i]))); + CUDASUCCESS_OR_FATAL(cudaEventCreate(&(kernel_wait_obj_[i]))); + CUDASUCCESS_OR_FATAL(cudaEventCreate(&(histograms_wait_obj_[i]))); + } + + prevAllocateGPUMemory(); + + AllocateGPUMemory(); + + // LGBM_CUDA: copy dense feature data from cpu to gpu only when we use entire training data for training + + if (!is_use_subset_) { + Log::Debug("copyDenseFeature at the initialization\n"); + copyDenseFeature(); // LGBM_CUDA + } + +} + +Tree* CUDATreeLearner::Train(const score_t* gradients, const score_t *hessians, + bool is_constant_hessian, Json& forced_split_json) { + + // check if we need to recompile the GPU kernel (is_constant_hessian changed) + // this should rarely occur + + if (is_constant_hessian != is_constant_hessian_) { + Log::Debug("Recompiling GPU kernel because 
hessian is %sa constant now", is_constant_hessian ? "" : "not "); + is_constant_hessian_ = is_constant_hessian; + } + + Tree *ret = SerialTreeLearner::Train(gradients, hessians, is_constant_hessian, forced_split_json); + + return ret; +} + +void CUDATreeLearner::ResetTrainingData(const Dataset* train_data) { + + // LGBM_CUDA: check data size + data_size_t old_num_data = num_data_; + + SerialTreeLearner::ResetTrainingData(train_data); + + #if ResetTrainingData_DEBUG == 1 // LGBM_CUDA + serial_time = std::chrono::steady_clock::now() - start_serial_time; + #endif + + num_feature_groups_ = train_data_->num_feature_groups(); + + // GPU memory has to been reallocated because data may have been changed + + #if ResetTrainingData_DEBUG == 1 // LGBM_CUDA + auto start_alloc_gpu_time = std::chrono::steady_clock::now(); + #endif + + // LGBM_CUDA: AllocateGPUMemory only when the number of data increased + + int old_num_feature_groups = num_dense_feature_groups_; + CountDenseFeatureGroups(); + if ((old_num_data < num_data_) && (old_num_feature_groups < num_dense_feature_groups_)) { + prevAllocateGPUMemory(); + AllocateGPUMemory(); + } else { + ResetGPUMemory(); + } + + copyDenseFeature(); + + #if ResetTrainingData_DEBUG == 1 // LGBM_CUDA + alloc_gpu_time = std::chrono::steady_clock::now() - start_alloc_gpu_time; + #endif + + // setup GPU kernel arguments after we allocating all the buffers + + #if ResetTrainingData_DEBUG == 1 // LGBM_CUDA + auto start_set_arg_time = std::chrono::steady_clock::now(); + #endif + + #if ResetTrainingData_DEBUG == 1 // LGBM_CUDA + set_arg_time = std::chrono::steady_clock::now() - start_set_arg_time; + reset_training_data_time = std::chrono::steady_clock::now() - start_reset_training_data_time; + Log::Info("reset_training_data_time: %f secs.", reset_training_data_time.count() * 1e-3); + Log::Info("serial_time: %f secs.", serial_time.count() * 1e-3); + Log::Info("alloc_gpu_time: %f secs.", alloc_gpu_time.count() * 1e-3); + Log::Info("set_arg_time: %f secs.", set_arg_time.count() * 1e-3); + #endif +} + +void CUDATreeLearner::BeforeTrain() { + + #if cudaMemcpy_DEBUG == 1 // LGBM_CUDA + std::chrono::duration device_hessians_time = std::chrono::milliseconds(0); + std::chrono::duration device_gradients_time = std::chrono::milliseconds(0); + #endif + + SerialTreeLearner::BeforeTrain(); + + #if GPU_DEBUG >= 2 + printf("CUDATreeLearner::BeforeTrain() Copying initial full gradients and hessians to device\n"); + #endif + + // Copy initial full hessians and gradients to GPU. + // We start copying as early as possible, instead of at ConstructHistogram(). 
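+  // The copies below are issued with cudaMemcpyAsync on each device's own stream, so the
+  // histogram kernels later launched on that same stream are ordered after them; the
+  // recorded gradients_future_/hessians_future_ events mark when each transfer completes.
+  // When the hessian is constant, only gradients are copied and hessians_[0] is passed to
+  // the kernel by value (see SetThreadData).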
+ + if ((hessians_ != NULL) && (gradients_ != NULL)){ + if (!use_bagging_ && num_dense_feature_groups_) { + + Log::Debug("CudaTreeLearner::BeforeTrain() No baggings, dense_feature_groups_=%d", num_dense_feature_groups_); + + for(int device_id = 0; device_id < num_gpu_; ++device_id) { + if (!is_constant_hessian_) { + Log::Debug("CUDATreeLearner::BeforeTrain(): Starting hessians_ -> device_hessians_"); + + #if cudaMemcpy_DEBUG == 1 // LGBM_CUDA + auto start_device_hessians_time = std::chrono::steady_clock::now(); + #endif + + //const data_size_t* indices = data_partition_->indices(); + //data_size_t cnt = data_partition_->leaf_count(0); + + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_hessians_[device_id], hessians_, num_data_*sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); + + CUDASUCCESS_OR_FATAL(cudaEventRecord(hessians_future_[device_id], stream_[device_id])); + + #if cudaMemcpy_DEBUG == 1 // LGBM_CUDA + device_hessians_time = std::chrono::steady_clock::now() - start_device_hessians_time; + #endif + + Log::Debug("queued copy of device_hessians_"); + } + + #if cudaMemcpy_DEBUG == 1 // LGBM_CUDA + auto start_device_gradients_time = std::chrono::steady_clock::now(); + #endif + + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_gradients_[device_id], gradients_, num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); + CUDASUCCESS_OR_FATAL(cudaEventRecord(gradients_future_[device_id], stream_[device_id])); + + #if cudaMemcpy_DEBUG == 1 // LGBM_CUDA + device_gradients_time = std::chrono::steady_clock::now() - start_device_gradients_time; + #endif + + Log::Debug("CUDATreeLearner::BeforeTrain: issued gradients_ -> device_gradients_"); + } + } + } + +#if 0 + SerialTreeLearner::BeforeTrain(); +#endif + + // use bagging + if ((hessians_ != NULL) && (gradients_ != NULL)){ + if (data_partition_->leaf_count(0) != num_data_ && num_dense_feature_groups_) { + + // On GPU, we start copying indices, gradients and hessians now, instead at ConstructHistogram() + // copy used gradients and hessians to ordered buffer + + const data_size_t* indices = data_partition_->indices(); + data_size_t cnt = data_partition_->leaf_count(0); + + // transfer the indices to GPU + for(int device_id = 0; device_id < num_gpu_; ++device_id) { + + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_data_indices_[device_id], indices, cnt * sizeof(*indices), cudaMemcpyHostToDevice, stream_[device_id])); + CUDASUCCESS_OR_FATAL(cudaEventRecord(indices_future_[device_id], stream_[device_id])); + + if (!is_constant_hessian_) { + + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_hessians_[device_id], hessians_, num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); + CUDASUCCESS_OR_FATAL(cudaEventRecord(hessians_future_[device_id], stream_[device_id])); + + } + + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_gradients_[device_id], gradients_, num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); + CUDASUCCESS_OR_FATAL(cudaEventRecord(gradients_future_[device_id], stream_[device_id])); + } + + } + } + +} + +bool CUDATreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) { + + int smaller_leaf; + + data_size_t num_data_in_left_child = GetGlobalDataCountInLeaf(left_leaf); + data_size_t num_data_in_right_child = GetGlobalDataCountInLeaf(right_leaf); + + + // only have root + if (right_leaf < 0) { + smaller_leaf = -1; + } else if (num_data_in_left_child < num_data_in_right_child) { + smaller_leaf = left_leaf; + } else { + smaller_leaf = right_leaf; + } + + // 
Copy indices, gradients and hessians as early as possible + if (smaller_leaf >= 0 && num_dense_feature_groups_) { + // only need to initialize for smaller leaf + // Get leaf boundary + const data_size_t* indices = data_partition_->indices(); + data_size_t begin = data_partition_->leaf_begin(smaller_leaf); + data_size_t end = begin + data_partition_->leaf_count(smaller_leaf); + + // copy indices to the GPU: + #if GPU_DEBUG >= 2 + #endif + + for(int device_id = 0; device_id < num_gpu_; ++device_id) { + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_data_indices_[device_id], &indices[begin], (end-begin) * sizeof(data_size_t), cudaMemcpyHostToDevice, stream_[device_id])); + CUDASUCCESS_OR_FATAL(cudaEventRecord(indices_future_[device_id], stream_[device_id])); + } + } + + const bool ret = SerialTreeLearner::BeforeFindBestSplit(tree, left_leaf, right_leaf); + + return ret; +} + +bool CUDATreeLearner::ConstructGPUHistogramsAsync( + const std::vector& is_feature_used, + const data_size_t* data_indices, data_size_t num_data) { + + + if (num_data <= 0) { + return false; + } + + + // do nothing if no features can be processed on GPU + if (!num_dense_feature_groups_) { + Log::Debug("no dense feature groups, returning"); + return false; + } + + // copy data indices if it is not null + if (data_indices != nullptr && num_data != num_data_) { + + for(int device_id = 0; device_id < num_gpu_; ++device_id) { + + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_data_indices_[device_id], data_indices, num_data * sizeof(data_size_t), cudaMemcpyHostToDevice, stream_[device_id])); + CUDASUCCESS_OR_FATAL(cudaEventRecord(indices_future_[device_id], stream_[device_id])); + + } + } + + // converted indices in is_feature_used to feature-group indices + std::vector is_feature_group_used(num_feature_groups_, 0); + + #pragma omp parallel for schedule(static,1024) if (num_features_ >= 2048) + for (int i = 0; i < num_features_; ++i) { + if(is_feature_used[i]) { + int feature_group = train_data_->Feature2Group(i); // LGBM_CUDA + is_feature_group_used[feature_group] = (train_data_->FeatureGroupNumBin(feature_group)<=16)? 2 : 1; // LGBM_CUDA + } + } + + // construct the feature masks for dense feature-groups + int used_dense_feature_groups = 0; + #pragma omp parallel for schedule(static,1024) reduction(+:used_dense_feature_groups) if (num_dense_feature_groups_ >= 2048) + for (int i = 0; i < num_dense_feature_groups_; ++i) { + if (is_feature_group_used[dense_feature_group_map_[i]]) { + //feature_masks_[i] = 1; + feature_masks_[i] = is_feature_group_used[dense_feature_group_map_[i]]; + ++used_dense_feature_groups; + } + else { + feature_masks_[i] = 0; + } + } + bool use_all_features = used_dense_feature_groups == num_dense_feature_groups_; + // if no feature group is used, just return and do not use GPU + if (used_dense_feature_groups == 0) { + return false; + } + +#if GPU_DEBUG >= 1 + printf("CudaTreeLearner::ConstructGPUHistogramsAsync() Feature masks: "); + for (unsigned int i = 0; i < feature_masks_.size(); ++i) { + printf("%d ", feature_masks_[i]); + } + printf("\n"); + printf("CudaTreeLearner::ConstructGPUHistogramsAsync() %d feature groups, %d used, %d\n", num_dense_feature_groups_, used_dense_feature_groups, use_all_features); +#endif + + // if not all feature groups are used, we need to transfer the feature mask to GPU + // otherwise, we will use a specialized GPU kernel with all feature groups enabled + // LGBM_CUDA FIXME: No waiting mark for feature mask + + // LGBM_CUDA We now copy even if all features are used. 
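+  // feature_masks_ holds one byte per dense feature group: 0 = skip, 1 = use, and 2 marks a
+  // small group (<= 16 bins), mirroring is_feature_group_used above. Each device receives its
+  // slice [offset, offset + num_gpu_feature_groups_[device_id]) via a blocking cudaMemcpy;
+  // use_all_features then only decides whether the specialized no-mask kernel can be used.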
+ + //#pragma omp parallel for schedule(static, num_gpu_) + for(int device_id = 0; device_id < num_gpu_; ++device_id) { + int offset = offset_gpu_feature_groups_[device_id]; + CUDASUCCESS_OR_FATAL(cudaMemcpy(device_feature_masks_[device_id], ptr_pinned_feature_masks_ + offset, num_gpu_feature_groups_[device_id] , cudaMemcpyHostToDevice)); + //CUDASUCCESS_OR_FATAL(cudaMemcpy(device_feature_masks_[device_id], ptr_pinned_feature_masks_ + offset, num_gpu_feature_groups_[device_id] , cudaMemcpyHostToDevice, stream_[device_id])); + } + + // All data have been prepared, now run the GPU kernel + + GPUHistogram(num_data, use_all_features); + + return true; +} + +void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_used, bool use_subtract) { + + //LGBM_CUDA + auto start_time = std::chrono::steady_clock::now(); + + std::vector is_sparse_feature_used(num_features_, 0); + std::vector is_dense_feature_used(num_features_, 0); + int num_dense_features=0, num_sparse_features=0; + + #pragma omp parallel for schedule(static) + for (int feature_index = 0; feature_index < num_features_; ++feature_index) { + + if (!is_feature_used_[feature_index]) continue; + if (!is_feature_used[feature_index]) continue; + if (ordered_bins_[train_data_->Feature2Group(feature_index)]) { + is_sparse_feature_used[feature_index] = 1; + num_sparse_features++; + } + else { + is_dense_feature_used[feature_index] = 1; + num_dense_features++; + } + } + + // construct smaller leaf + HistogramBinEntry* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - 1; + + // Check workgroups per feature4 tuple.. + int exp_workgroups_per_feature = GetNumWorkgroupsPerFeature(smaller_leaf_splits_->num_data_in_leaf()); + + // if the workgroup per feature is 1 (2^0), return as the work is too small for a GPU + if (exp_workgroups_per_feature == 0){ + return SerialTreeLearner::ConstructHistograms(is_feature_used, use_subtract); + } + + // ConstructGPUHistogramsAsync will return true if there are availabe feature gourps dispatched to GPU + bool is_gpu_used = ConstructGPUHistogramsAsync(is_feature_used, + nullptr, smaller_leaf_splits_->num_data_in_leaf()); + + // then construct sparse features on CPU + // We set data_indices to null to avoid rebuilding ordered gradients/hessians + if (num_sparse_features > 0){ + train_data_->ConstructHistograms(is_sparse_feature_used, + nullptr, smaller_leaf_splits_->num_data_in_leaf(), + smaller_leaf_splits_->LeafIndex(), + ordered_bins_, gradients_, hessians_, + ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_, + ptr_smaller_leaf_hist_data); + } + + // wait for GPU to finish, only if GPU is actually used + if (is_gpu_used) { + if (config_->gpu_use_dp) { + // use double precision + WaitAndGetHistograms(ptr_smaller_leaf_hist_data); + } + else { + // use single precision + WaitAndGetHistograms(ptr_smaller_leaf_hist_data); + } + } + + // Compare GPU histogram with CPU histogram, useful for debuggin GPU code problem + // #define GPU_DEBUG_COMPARE +#ifdef GPU_DEBUG_COMPARE + printf("Start Comparing_Histogram between GPU and CPU num_dense_feature_groups_=%d\n",num_dense_feature_groups_); + bool compare = true; + for (int i = 0; i < num_dense_feature_groups_; ++i) { + if (!feature_masks_[i]) + continue; + int dense_feature_group_index = dense_feature_group_map_[i]; + size_t size = train_data_->FeatureGroupNumBin(dense_feature_group_index); + HistogramBinEntry* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - 1; + HistogramBinEntry* 
current_histogram = ptr_smaller_leaf_hist_data + train_data_->GroupBinBoundary(dense_feature_group_index); + HistogramBinEntry* gpu_histogram = new HistogramBinEntry[size]; + data_size_t num_data = smaller_leaf_splits_->num_data_in_leaf(); + + std::copy(current_histogram, current_histogram + size, gpu_histogram); + std::memset(current_histogram, 0, train_data_->FeatureGroupNumBin(dense_feature_group_index) * sizeof(HistogramBinEntry)); + if ( num_data == num_data_ ) { + if ( is_constant_hessian_ ) { + printf("ConstructHistogram(): num_data == num_data_ is_constant_hessian_"); + train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( + num_data, + gradients_, + current_histogram); + } else { + printf("ConstructHistogram(): num_data == num_data_ "); + train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( + num_data, + gradients_, hessians_, + current_histogram); + } + } else { + if ( is_constant_hessian_ ) { + printf("ConstructHistogram(): is_constant_hessian_"); + train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( + smaller_leaf_splits_->data_indices(), + num_data, + ordered_gradients_.data(), + current_histogram); + } else { + printf("ConstructHistogram(): 4"); + train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( + smaller_leaf_splits_->data_indices(), + num_data, + ordered_gradients_.data(), ordered_hessians_.data(), + current_histogram); + } + } + if ( (num_data != num_data_) && compare ) { + CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index); + compare = false; + } + CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index); + std::copy(gpu_histogram, gpu_histogram + size, current_histogram); + delete [] gpu_histogram; + //break; // LGBM_CUDA: see only first feature info + } + printf("End Comparing Histogram between GPU and CPU\n"); +// #endif +#endif + + if (larger_leaf_histogram_array_ != nullptr && !use_subtract) { + + // construct larger leaf + + HistogramBinEntry* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - 1; + + is_gpu_used = ConstructGPUHistogramsAsync(is_feature_used, + larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf()); + + // then construct sparse features on CPU + // We set data_indices to null to avoid rebuilding ordered gradients/hessians + + if (num_sparse_features > 0){ + train_data_->ConstructHistograms(is_sparse_feature_used, + nullptr, larger_leaf_splits_->num_data_in_leaf(), + larger_leaf_splits_->LeafIndex(), + ordered_bins_, gradients_, hessians_, + ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_, + ptr_larger_leaf_hist_data); + } + + // wait for GPU to finish, only if GPU is actually used + + if (is_gpu_used) { + if (config_->gpu_use_dp) { + // use double precision + WaitAndGetHistograms(ptr_larger_leaf_hist_data); + } + else { + // use single precision + WaitAndGetHistograms(ptr_larger_leaf_hist_data); + } + } + } +} + +void CUDATreeLearner::FindBestSplits() { + + SerialTreeLearner::FindBestSplits(); + +#if GPU_DEBUG >= 3 + for (int feature_index = 0; feature_index < num_features_; ++feature_index) { + if (!is_feature_used_[feature_index]) continue; + if (parent_leaf_histogram_array_ != nullptr + && !parent_leaf_histogram_array_[feature_index].is_splittable()) { + smaller_leaf_histogram_array_[feature_index].set_is_splittable(false); + continue; + } + size_t bin_size = train_data_->FeatureNumBin(feature_index) + 1; + 
printf("CUDATreeLearner::FindBestSplits() Feature %d bin_size=%d smaller leaf:\n", feature_index, bin_size); + PrintHistograms(smaller_leaf_histogram_array_[feature_index].RawData() - 1, bin_size); + if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->LeafIndex() < 0) { continue; } + printf("CUDATreeLearner::FindBestSplits() Feature %d bin_size=%d larger leaf:\n", feature_index, bin_size); + + PrintHistograms(larger_leaf_histogram_array_[feature_index].RawData() - 1, bin_size); + } +#endif +} + +void CUDATreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) { + const SplitInfo& best_split_info = best_split_per_leaf_[best_Leaf]; + +#if GPU_DEBUG >= 2 + printf("Splitting leaf %d with feature %d thresh %d gain %f stat %f %f %f %f\n", best_Leaf, best_split_info.feature, best_split_info.threshold, best_split_info.gain, best_split_info.left_sum_gradient, best_split_info.right_sum_gradient, best_split_info.left_sum_hessian, best_split_info.right_sum_hessian); +#endif + + SerialTreeLearner::Split(tree, best_Leaf, left_leaf, right_leaf); + + if (Network::num_machines() == 1) { + // do some sanity check for the GPU algorithm + if (best_split_info.left_count < best_split_info.right_count) { + if ((best_split_info.left_count != smaller_leaf_splits_->num_data_in_leaf()) || + (best_split_info.right_count!= larger_leaf_splits_->num_data_in_leaf())) { + Log::Fatal("1 Bug in GPU histogram! split %d: %d, smaller_leaf: %d, larger_leaf: %d\n", best_split_info.left_count, best_split_info.right_count, smaller_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf()); + } + } else { + double smaller_min = smaller_leaf_splits_->min_constraint(); + double smaller_max = smaller_leaf_splits_->max_constraint(); + double larger_min = larger_leaf_splits_->min_constraint(); + double larger_max = larger_leaf_splits_->max_constraint(); + smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), best_split_info.right_sum_gradient, best_split_info.right_sum_hessian); + larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, best_split_info.left_sum_hessian); + smaller_leaf_splits_->SetValueConstraint(smaller_min, smaller_max); + larger_leaf_splits_->SetValueConstraint(larger_min, larger_max); + if ((best_split_info.left_count != larger_leaf_splits_->num_data_in_leaf()) || + (best_split_info.right_count!= smaller_leaf_splits_->num_data_in_leaf())) { + Log::Fatal("2 Bug in GPU histogram! split %d: %d, smaller_leaf: %d, larger_leaf: %d\n", best_split_info.left_count, best_split_info.right_count, smaller_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf()); + } + } + } + +} + +} // namespace LightGBM +#undef cudaMemcpy_DEBUG +#endif // USE_CUDA diff --git a/src/treelearner/cuda_tree_learner.h b/src/treelearner/cuda_tree_learner.h new file mode 100644 index 00000000000..e5a24aeb8f5 --- /dev/null +++ b/src/treelearner/cuda_tree_learner.h @@ -0,0 +1,315 @@ +#pragma once +#ifndef LGBM_TREELEARNER_CUDA_TREE_LEARNER_H_ +#define LGBM_TREELEARNET_CUDA_TREE_LEARNER_H_ + +#include +#include +#include +#include +#include +#include "feature_histogram.hpp" +#include "serial_tree_learner.h" +#include "data_partition.hpp" +#include "split_info.hpp" +#include "leaf_splits.hpp" + +#include +#include +#include +#include +#include + +#ifdef USE_CUDA + +#include +#include "cuda_kernel_launcher.h" // LGBM_CUDA +#include + + +using namespace json11; + +namespace LightGBM { + +/*! +* \brief CUDA-based parallel learning algorithm. 
+*/ +class CUDATreeLearner: public SerialTreeLearner { +public: + explicit CUDATreeLearner(const Config* tree_config); + ~CUDATreeLearner(); + // LGBM_CUDA: is_use_subset is used by CUDA only + void Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) override; + void ResetTrainingData(const Dataset* train_data) override; + Tree* Train(const score_t* gradients, const score_t *hessians, + bool is_constant_hessian, Json& forced_split_json) override; + + void SetBaggingData(const data_size_t* used_indices, data_size_t num_data) override { + SerialTreeLearner::SetBaggingData(used_indices, num_data); + // determine if we are using bagging before we construct the data partition + // thus we can start data movement to GPU earlier + if (used_indices != nullptr) { + if (num_data != num_data_) { + use_bagging_ = true; + return; + } + } + use_bagging_ = false; + } + +protected: + void BeforeTrain() override; + bool BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) override; + void FindBestSplits() override; + void Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) override; + void ConstructHistograms(const std::vector& is_feature_used, bool use_subtract) override; +private: + /*! \brief 4-byte feature tuple used by GPU kernels */ + //struct Feature4 { + // uint8_t s[4]; + //}; + + /*! \brief Single precision histogram entry for GPU */ + struct GPUHistogramBinEntry { + score_t sum_gradients; + score_t sum_hessians; + uint32_t cnt; + }; + + + /*! + * \brief Find the best number of workgroups processing one feature for maximizing efficiency + * \param leaf_num_data The number of data examples on the current leaf being processed + * \return Log2 of the best number for workgroups per feature, in range 0...kMaxLogWorkgroupsPerFeature + */ + int GetNumWorkgroupsPerFeature(data_size_t leaf_num_data); + + /*! + * \brief Initialize GPU device + * \LGBM_CUDA: param num_gpu: number of maximum gpus + */ + void InitGPU(int num_gpu); + + /*! + * \brief Allocate memory for GPU computation // LGBM_CUDA: alloc only + */ + void CountDenseFeatureGroups(); // compute num_dense_feature_group + void prevAllocateGPUMemory(); // compute CPU-side param calculation & Pin HostMemory + void AllocateGPUMemory(); + + /*! + * \ LGBM_CUDA: ResetGPUMemory + */ + void ResetGPUMemory(); + + /*! + * \ LGBM_CUDA: copy dense feature from CPU to GPU + */ + void copyDenseFeature(); + + + /*! + * \brief Compute GPU feature histogram for the current leaf. + * Indices, gradients and hessians have been copied to the device. 
+ * \param leaf_num_data Number of data on current leaf + * \param use_all_features Set to true to not use feature masks, with a faster kernel + */ + void GPUHistogram(data_size_t leaf_num_data, bool use_all_features); + + void SetThreadData(ThreadData* thread_data, int device_id, + int leaf_num_data, bool use_all_features, + int num_workgroups, int exp_workgroups_per_feature) { + ThreadData* td = &thread_data[device_id]; + td->device_id = device_id; + td->leaf_num_data = leaf_num_data; + td->num_data = num_data_; + td->use_all_features = use_all_features; + td->is_constant_hessian = is_constant_hessian_; + td->num_workgroups = num_workgroups; + td->stream = stream_[device_id]; + td->device_features = device_features_[device_id]; + td->device_feature_masks = reinterpret_cast(device_feature_masks_[device_id]); + td->device_data_indices = reinterpret_cast(device_data_indices_[device_id]); + td->device_gradients = device_gradients_[device_id]; + td->device_hessians = device_hessians_[device_id]; + td->hessians_const = hessians_[0]; + td->device_subhistograms = device_subhistograms_[device_id]; + td->sync_counters = sync_counters_[device_id]; + td->device_histogram_outputs= device_histogram_outputs_[device_id]; + td->exp_workgroups_per_feature = exp_workgroups_per_feature; + + td->kernel_start = &(kernel_start_[device_id]); + td->kernel_wait_obj = &(kernel_wait_obj_[device_id]); + td->kernel_input_wait_time = &(kernel_input_wait_time_[device_id]); + + size_t output_size = num_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_; + size_t host_output_offset = offset_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_; + td->output_size = output_size; + td->host_histogram_output = (char*)host_histogram_outputs_ + host_output_offset; + td->histograms_wait_obj = &(histograms_wait_obj_[device_id]); + } + + + // LGBM_CUDA: thread work + //typedef void * (*THREADFUNCPTR)(void *); + //void* launch_gpu_kernel(void *td); + + /*! + * \brief Wait for GPU kernel execution and read histogram + * \param histograms Destination of histogram results from GPU. + */ + template + void WaitAndGetHistograms(HistogramBinEntry* histograms); + + /*! + * \brief Construct GPU histogram asynchronously. + * Interface is similar to Dataset::ConstructHistograms(). + * \param is_feature_used A predicate vector for enabling each feature + * \param data_indices Array of data example IDs to be included in histogram, will be copied to GPU. + * Set to nullptr to skip copy to GPU. + * \param num_data Number of data examples to be included in histogram + * \param gradients Array of gradients for all examples. + * \param hessians Array of hessians for all examples. + * \param ordered_gradients Ordered gradients will be generated and copied to GPU when gradients is not nullptr, + * Set gradients to nullptr to skip copy to GPU. + * \param ordered_hessians Ordered hessians will be generated and copied to GPU when hessians is not nullptr, + * Set hessians to nullptr to skip copy to GPU. + * \return true if GPU kernel is launched, false if GPU is not used + */ + // LGBM_CUDA v5.2 + bool ConstructGPUHistogramsAsync( + const std::vector& is_feature_used, + const data_size_t* data_indices, data_size_t num_data); + + + /*! brief Log2 of max number of workgroups per feature*/ + const int kMaxLogWorkgroupsPerFeature = 10; // 2^10 + /*! brief Max total number of workgroups with preallocated workspace. 
+ * If we use more than this number of workgroups, we have to reallocate subhistograms */ + //int preallocd_max_num_wg_ = 1024; + std::vector preallocd_max_num_wg_; + + /*! \brief True if bagging is used */ + bool use_bagging_; + + /*! \brief GPU device object */ + //int* dev_; + /*! \brief GPU command queue object */ + std::vector stream_; + + /*! \brief total number of feature-groups */ + int num_feature_groups_; + /*! \brief total number of dense feature-groups, which will be processed on GPU */ + int num_dense_feature_groups_; + std::vector num_gpu_feature_groups_; // LGBM_CUDA + std::vector offset_gpu_feature_groups_; // LGBM_CUDA + /*! \brief On GPU we read one DWORD (4-byte) of features of one example once. + * With bin size > 16, there are 4 features per DWORD. + * With bin size <=16, there are 8 features per DWORD. + * */ + int dword_features_; + /*! \brief total number of dense feature-group tuples on GPU. + * Each feature tuple is 4-byte (4 features if each feature takes a byte) */ + //int num_dense_feature4_; + /*! \brief Max number of bins of training data, used to determine + * which GPU kernel to use */ + int max_num_bin_; + /*! \brief Used GPU kernel bin size (64, 256) */ + int device_bin_size_; + /*! \brief Size of histogram bin entry, depending if single or double precision is used */ + size_t hist_bin_entry_sz_; + /*! \brief Indices of all dense feature-groups */ + std::vector dense_feature_group_map_; + /*! \brief Indices of all sparse feature-groups */ + std::vector sparse_feature_group_map_; + /*! \brief Multipliers of all dense feature-groups, used for redistributing bins */ + //std::vector device_bin_mults_; + /*! \brief GPU memory object holding the training data */ + //uint8_t *device_features_; + std::vector device_features_; + /*! \brief GPU memory object holding the ordered gradient */ + //score_t *device_gradients_; + std::vector device_gradients_; + /*! \brief Pointer to pinned memory of ordered gradient */ + void * ptr_pinned_gradients_ = nullptr; + /*! \brief GPU memory object holding the ordered hessian */ + //score_t *device_hessians_; + std::vector device_hessians_; + /*! \brief Pointer to pinned memory of ordered hessian */ + void * ptr_pinned_hessians_ = nullptr; + /*! \brief A vector of feature mask. 1 = feature used, 0 = feature not used */ +// std::vector> feature_masks_; + std::vector feature_masks_; + /*! \brief GPU memory object holding the feature masks */ + //void *device_feature_masks_; + std::vector device_feature_masks_; + /*! \brief Pointer to pinned memory of feature masks */ + char* ptr_pinned_feature_masks_ = nullptr; + /*! \brief GPU memory object holding indices of the leaf being processed */ + //data_size_t *device_data_indices_; + std::vector device_data_indices_; + /*! \brief GPU memory object holding counters for workgroup coordination */ + //int *sync_counters_; + std::vector sync_counters_; + /*! \brief GPU memory object holding temporary sub-histograms per workgroup */ + //char *device_subhistograms_; + std::vector device_subhistograms_; + /*! \brief Host memory object for histogram output (GPU will write to Host memory directly) */ + // FIXME: is this cuda mapped + //void *device_histogram_outputs_; + std::vector device_histogram_outputs_; + /*! \brief Host memory pointer for histogram outputs */ + void *host_histogram_outputs_; + /*! \LGBM_CUDA: CUDA waitlist object for waiting for data transfer before kernel execution */ + //cudaEvent_t kernel_wait_obj_; + std::vector kernel_wait_obj_; + /*! 
\LGBM_CUDA: CUDA waitlist object for reading output histograms after kernel execution */ + //cudaEvent_t histograms_wait_obj_; + std::vector histograms_wait_obj_; + /*! \LGBM_CUDA: CUDA Asynchronous waiting object for copying indices */ + //cudaEvent_t indices_future_; + std::vector indices_future_; + /*! \LGBM_CUDA: Asynchronous waiting object for copying gradients */ + //cudaEvent_t gradients_future_; + std::vector gradients_future_; + /*! \LGBM_CUDA: Asynchronous waiting object for copying hessians */ + //cudaEvent_t hessians_future_; + std::vector hessians_future_; + // LGBM_CUDA:\brief Asynchronous waiting object for copying dense features + //cudaEvent_t features_future_; + std::vector features_future_; + + // LGBM_CUDA: use subset of training data for bagging + bool is_use_subset_; + + // LGBM_CUDA: host-side buffer for converting feature data into featre4 data + //std::vector host_vecs_; + int nthreads_; // number of Feature4* vector on host4_vecs_ + //cudaEvent_t kernel_start_; // event for kernel start + std::vector kernel_start_; + std::vector kernel_time_; // measure histogram kernel time + std::vector> kernel_input_wait_time_; + int num_gpu_; + int allocated_num_data_; // allocated data instances + pthread_t **cpu_threads_; // pthread, 1 cpu thread / gpu +}; + +} // namespace LightGBM +#else // USE_CUDA + +// When GPU support is not compiled in, quit with an error message + +namespace LightGBM { + +class CUDATreeLearner: public SerialTreeLearner { +public: + #pragma warning(disable : 4702) + explicit CUDATreeLearner(const Config* tree_config) : SerialTreeLearner(tree_config) { + Log::Fatal("CUDA Tree Learner was not enabled in this build.\n" + "Please recompile with CMake option -DUSE_CUDA=1"); + } +}; + +} + +#endif //USE_CUDA +#endif // LGBM_TREELEARNER_CUDA_TREE_LEARNER_H_ diff --git a/src/treelearner/kernels/histogram256.cu b/src/treelearner/kernels/histogram256.cu new file mode 100644 index 00000000000..5d659f8e2cf --- /dev/null +++ b/src/treelearner/kernels/histogram256.cu @@ -0,0 +1,372 @@ +/* + * ibmGBT: IBM CUDA Accelerated LightGBM + * + * IBM Confidential + * (C) Copyright IBM Corp. 2019. All Rights Reserved. + * + * The source code for this program is not published or otherwise + * divested of its trade secrets, irrespective of what has been + * deposited with the U.S. Copyright Office. + * + * US Government Users Restricted Rights - Use, duplication or + * disclosure restricted by GSA ADP Schedule Contract with IBM Corp. + */ + +#include "histogram256.hu" +#include "stdio.h" + +#define PRINT(b,t,fmt,...) 
\ +if (b == gtid && t == ltid) { \ + printf(fmt, __VA_ARGS__); \ +} + + +#ifdef ENABLE_ALL_FEATURES +#ifdef IGNORE_INDICES +#define KERNEL_NAME histogram256_fulldata +#else // IGNORE_INDICES +#define KERNEL_NAME histogram256 // seems like ENABLE_ALL_FEATURES is set to 1 in the header if its disabled +//#define KERNEL_NAME histogram256_allfeats +#endif // IGNORE_INDICES +#else // ENABLE_ALL_FEATURES +#error "ENABLE_ALL_FEATURES should always be 1" +#define KERNEL_NAME histogram256 +#endif // ENABLE_ALL_FEATURES + + +// atomic add for float number in local memory +inline __device__ void atomic_local_add_f(acc_type *addr, const float val) +{ + atomicAdd(addr, static_cast(val)); +} + +// this function will be called by histogram256 +// we have one sub-histogram of one feature in local memory, and need to read others +inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ feature_sub_hist, + const uint skip_id, + const uint old_val_cont_bin0, + const ushort num_sub_hist, + acc_type* __restrict__ output_buf, + acc_type* __restrict__ local_hist, + const size_t power_feature_workgroups) { + const ushort ltid = threadIdx.x; + // TODO: try to avoid bank conflict here + acc_type grad_bin = local_hist[ltid * 2]; + acc_type hess_bin = local_hist[ltid * 2 + 1]; + uint* __restrict__ local_cnt = (uint *)(local_hist + 2 * NUM_BINS); + + uint cont_bin; + if (power_feature_workgroups != 0) { + cont_bin = ltid ? local_cnt[ltid] : old_val_cont_bin0; + } else { + cont_bin = local_cnt[ltid]; + } + ushort i; + + if (power_feature_workgroups != 0) { + // add all sub-histograms for feature + const acc_type* __restrict__ p = feature_sub_hist + ltid; + for (i = 0; i < skip_id; ++i) { + grad_bin += *p; p += NUM_BINS; + hess_bin += *p; p += NUM_BINS; + cont_bin += as_acc_int_type(*p); p += NUM_BINS; + } + + // skip the counters we already have + p += 3 * NUM_BINS; + + for (i = i + 1; i < num_sub_hist; ++i) { + grad_bin += *p; p += NUM_BINS; + hess_bin += *p; p += NUM_BINS; + cont_bin += as_acc_int_type(*p); p += NUM_BINS; + } + } + __syncthreads(); + + + output_buf[ltid * 3 + 0] = grad_bin; + output_buf[ltid * 3 + 1] = hess_bin; + output_buf[ltid * 3 + 2] = as_acc_type((acc_int_type)cont_bin); +} + +#if USE_CONSTANT_BUF == 1 +__kernel void KERNEL_NAME(__global const uchar* restrict feature_data_base, + __constant const uchar* restrict feature_masks __attribute__((max_constant_size(65536))), + const data_size_t feature_size, + __constant const data_size_t* restrict data_indices __attribute__((max_constant_size(65536))), + const data_size_t num_data, + __constant const score_t* restrict ordered_gradients __attribute__((max_constant_size(65536))), +#if CONST_HESSIAN == 0 + __constant const score_t* restrict ordered_hessians __attribute__((max_constant_size(65536))), +#else + const score_t const_hessian, +#endif + char* __restrict__ output_buf, + volatile int * sync_counters, + acc_type* __restrict__ hist_buf_base, + const size_t power_feature_workgroups) { +#else +__global__ void KERNEL_NAME(const uchar* feature_data_base, + // FIXME: how to handle this __constant + const uchar* __restrict__ feature_masks, + const data_size_t feature_size, + const data_size_t* data_indices, + const data_size_t num_data, + const score_t* ordered_gradients, +#if CONST_HESSIAN == 0 + const score_t* ordered_hessians, +#else + const score_t const_hessian, +#endif + char* __restrict__ output_buf, + volatile int * sync_counters, + acc_type* __restrict__ hist_buf_base, + const size_t power_feature_workgroups) { +#endif + 
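+    // Kernel outline (summary of the code below):
+    //  * each block of LOCAL_SIZE_0 (= 256) threads accumulates a private
+    //    gradient/hessian/count histogram for one feature in shared memory;
+    //  * 2^power_feature_workgroups blocks cooperate on each feature, and the
+    //    last block to finish (tracked through sync_counters) merges the
+    //    per-block sub-histograms in within_kernel_reduction256x4();
+    //  * the shared-memory budget is LOCAL_MEM_SIZE =
+    //    (sizeof(uint) + 2 * sizeof(acc_type)) * NUM_BINS, i.e. 3 KB with
+    //    single-precision accumulators and 5 KB when USE_DP_FLOAT selects double.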
// allocate the local memory array aligned with float2, to guarantee correct alignment on NVIDIA platforms + // otherwise a "Misaligned Address" exception may occur + __shared__ float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)]; + const uint gtid = blockIdx.x * blockDim.x + threadIdx.x; + const ushort ltid = threadIdx.x; + const ushort lsize = LOCAL_SIZE_0; // get_local_size(0); + const ushort group_id = blockIdx.x; + + // local memory per workgroup is 3 KB + // clear local memory + uint *ptr = (uint *) shared_array; + for (int i = ltid; i < LOCAL_MEM_SIZE/sizeof(uint); i += lsize) { + ptr[i] = 0; + } + __syncthreads(); + // gradient/hessian histograms + // assume this starts at 32 * 4 = 128-byte boundary // LGBM_CUDA: What does it mean? boundary?? + // total size: 2 * 256 * size_of(float) = 2 KB + // organization: each feature/grad/hessian is at a different bank, + // as indepedent of the feature value as possible + acc_type *gh_hist = (acc_type *)shared_array; + + // counter histogram + // total size: 256 * size_of(uint) = 1 KB + uint *cnt_hist = (uint *)(gh_hist + 2 * NUM_BINS); + + // odd threads (1, 3, ...) compute histograms for hessians first + // even thread (0, 2, ...) compute histograms for gradients first + // etc. + uchar is_hessian_first = ltid & 1; + + ushort feature_id = group_id >> power_feature_workgroups; + + // each 2^POWER_FEATURE_WORKGROUPS workgroups process on one feature (compile-time constant) + // feature_size is the number of examples per feature + const uchar *feature_data = feature_data_base + feature_id * feature_size; + + // size of threads that process this feature4 + const uint subglobal_size = lsize * (1 << power_feature_workgroups); + + // equavalent thread ID in this subgroup for this feature4 + const uint subglobal_tid = gtid - feature_id * subglobal_size; + + + data_size_t ind; + data_size_t ind_next; + #ifdef IGNORE_INDICES + ind = subglobal_tid; + #else + ind = data_indices[subglobal_tid]; + #endif + + // extract feature mask, when a byte is set to 0, that feature is disabled + uchar feature_mask = feature_masks[feature_id]; + // exit if the feature is masked + if (!feature_mask) { + return; + } else { + feature_mask = feature_mask - 1; // LGBM_CUDA: feature_mask is used for get feature (1: 4bit feature, 0: 8bit feature) + } + + // STAGE 1: read feature data, and gradient and hessian + // first half of the threads read feature data from global memory + // We will prefetch data into the "next" variable at the beginning of each iteration + uchar feature; + uchar feature_next; + //uint8_t bin; + ushort bin; + + feature = feature_data[ind >> feature_mask]; + if (feature_mask) { + feature = (feature >> ((ind & 1) << 2)) & 0xf; + } + bin = feature; + acc_type grad_bin = 0.0f, hess_bin = 0.0f; + acc_type *addr_bin; + + // store gradient and hessian + score_t grad, hess; + score_t grad_next, hess_next; + // LGBM_CUDA v5.1 + grad = ordered_gradients[ind]; + #if CONST_HESSIAN == 0 + hess = ordered_hessians[ind]; + #endif + + + // there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4 + for (uint i = subglobal_tid; i < num_data; i += subglobal_size) { + // prefetch the next iteration variables + // we don't need bondary check because we have made the buffer large + #ifdef IGNORE_INDICES + // we need to check to bounds here + ind_next = i + subglobal_size < num_data ? 
i + subglobal_size : i; + #else + ind_next = data_indices[i + subglobal_size]; + #endif + + // imbGBT v5.1 + grad_next = ordered_gradients[ind_next]; + #if CONST_HESSIAN == 0 + hess_next = ordered_hessians[ind_next]; + #endif + + // STAGE 2: accumulate gradient and hessian + if (bin != feature) { + addr_bin = gh_hist + bin * 2 + is_hessian_first; + #if CONST_HESSIAN == 0 + acc_type acc_bin = is_hessian_first? hess_bin : grad_bin; + atomic_local_add_f(addr_bin, acc_bin); + + addr_bin = addr_bin + 1 - 2 * is_hessian_first; + acc_bin = is_hessian_first? grad_bin : hess_bin; + atomic_local_add_f(addr_bin, acc_bin); + + #elif CONST_HESSIAN == 1 + atomic_local_add_f(addr_bin, grad_bin); + #endif + + bin = feature; + grad_bin = grad; + hess_bin = hess; + } + else { + grad_bin += grad; + hess_bin += hess; + } + + // prefetch the next iteration variables + feature_next = feature_data[ind_next >> feature_mask]; + + // STAGE 3: accumulate counter + atomicAdd(cnt_hist + feature, 1); + + // STAGE 4: update next stat + grad = grad_next; + hess = hess_next; + // LGBM_CUDA: v4.2 + if (!feature_mask) { + feature = feature_next; + } else { + feature = (feature_next >> ((ind_next & 1) << 2)) & 0xf; + } + } + + + addr_bin = gh_hist + bin * 2 + is_hessian_first; + #if CONST_HESSIAN == 0 + acc_type acc_bin = is_hessian_first? hess_bin : grad_bin; + atomic_local_add_f(addr_bin, acc_bin); + + addr_bin = addr_bin + 1 - 2 * is_hessian_first; + acc_bin = is_hessian_first? grad_bin : hess_bin; + atomic_local_add_f(addr_bin, acc_bin); + + #elif CONST_HESSIAN == 1 + atomic_local_add_f(addr_bin, grad_bin); + #endif + __syncthreads(); + + #if CONST_HESSIAN == 1 + // make a final reduction + gh_hist[ltid * 2] += gh_hist[ltid * 2 + 1]; + gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // LGBM_CUDA: counter move to this position + __syncthreads(); + #endif + +#if POWER_FEATURE_WORKGROUPS != 0 + acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 3 * NUM_BINS; + // write gradients and hessians + acc_type *__restrict__ ptr_f = output; + for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { + // even threads read gradients, odd threads read hessians + // FIXME: 2-way bank conflict + acc_type value = gh_hist[i]; + ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; + } + // write counts + acc_int_type *__restrict__ ptr_i = (acc_int_type *)(output + 2 * NUM_BINS); + for (ushort i = ltid; i < NUM_BINS; i += lsize) { + // FIXME: 2-way bank conflict + uint value = cnt_hist[i]; + ptr_i[i] = value; + } + // FIXME: is this right + __syncthreads(); + __threadfence(); + // To avoid the cost of an extra reducting kernel, we have to deal with some + // gray area in OpenCL. We want the last work group that process this feature to + // make the final reduction, and other threads will just quit. + // This requires that the results written by other workgroups available to the + // last workgroup (memory consistency) + #if NVIDIA == 1 + // this is equavalent to CUDA __threadfence(); + // ensure the writes above goes to main memory and other workgroups can see it + asm volatile("{\n\tmembar.gl;\n\t}\n\t" :::"memory"); + #else + // FIXME: how to do the above on AMD GPUs?? + // GCN ISA says that the all writes will bypass L1 cache (write through), + // however when the last thread is reading sub-histogram data we have to + // make sure that no part of data is modified in local L1 cache of other workgroups. + // Otherwise reading can be a problem (atomic operations to get consistency). 
+ // But in our case, the sub-histogram of this workgroup cannot be in the cache + // of another workgroup, so the following trick will work just fine. + #endif + // Now, we want one workgroup to do the final reduction. + // Other workgroups processing the same feature quit. + // The is done by using an global atomic counter. + // On AMD GPUs ideally this should be done in GDS, + // but currently there is no easy way to access it via OpenCL. + uint * counter_val = cnt_hist; + // backup the old value + uint old_val = *counter_val; + if (ltid == 0) { + // all workgroups processing the same feature add this counter + *counter_val = atomicAdd(const_cast(sync_counters + feature_id), 1); + } + // make sure everyone in this workgroup is here + __syncthreads(); + // everyone in this wrokgroup: if we are the last workgroup, then do reduction! + if (*counter_val == (1 << power_feature_workgroups) - 1) { + if (ltid == 0) { + sync_counters[feature_id] = 0; + } + //} + #else + } + // only 1 work group, no need to increase counter + // the reduction will become a simple copy + if (1) { + uint old_val; // dummy + #endif + // locate our feature's block in output memory + uint output_offset = (feature_id << power_feature_workgroups); + acc_type const * __restrict__ feature_subhists = + (acc_type *)output_buf + output_offset * 3 * NUM_BINS; + // skip reading the data already in local memory + //uint skip_id = feature_id ^ output_offset; + uint skip_id = group_id - output_offset; + // locate output histogram location for this feature4 + acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 3 * NUM_BINS; + + + within_kernel_reduction256x4(feature_subhists, skip_id, old_val, 1 << power_feature_workgroups, hist_buf, (acc_type *)shared_array, power_feature_workgroups); + } +} + diff --git a/src/treelearner/kernels/histogram256.hu b/src/treelearner/kernels/histogram256.hu new file mode 100644 index 00000000000..145a85367f0 --- /dev/null +++ b/src/treelearner/kernels/histogram256.hu @@ -0,0 +1,179 @@ +/* + * ibmGBT: IBM CUDA Accelerated LightGBM + * + * IBM Confidential + * (C) Copyright IBM Corp. 2019. All Rights Reserved. + * + * The source code for this program is not published or otherwise + * divested of its trade secrets, irrespective of what has been + * deposited with the U.S. Copyright Office. + * + * US Government Users Restricted Rights - Use, duplication or + * disclosure restricted by GSA ADP Schedule Contract with IBM Corp. 
+ */ + +#ifndef _HISTOGRAM_256_KERNEL_ +#define _HISTOGRAM_256_KERNEL_ + +//#pragma once + +// use double precision or not +#ifndef USE_DP_FLOAT +#define USE_DP_FLOAT 1 +#endif + +// ignore hessian, and use the local memory for hessian as an additional bank for gradient +#ifndef CONST_HESSIAN +#define CONST_HESSIAN 0 +#endif + +typedef unsigned char uchar; + +template +__device__ double as_double(const T t) { + static_assert(sizeof(T) == sizeof(double), "size mismatch"); + double d; + memcpy(&d, &t, sizeof(T)); + return d; +} +template +__device__ ulong as_ulong(const T t) { + static_assert(sizeof(T) == sizeof(ulong), "size mismatch"); + ulong u; + memcpy(&u, &t, sizeof(T)); + return u; +} +template +__device__ float as_float(const T t) { + static_assert(sizeof(T) == sizeof(float), "size mismatch"); + float f; + memcpy(&f, &t, sizeof(T)); + return f; +} +template +__device__ uint as_uint(const T t) { + static_assert(sizeof(T) == sizeof(uint), "size_mismatch"); + uint u; + memcpy(&u, &t, sizeof(T)); + return u; +} +template +__device__ uchar4 as_uchar4(const T t) { + static_assert(sizeof(T) == sizeof(uchar4), "size mismatch"); + uchar4 u; + memcpy(&u, &t, sizeof(T)); + return u; +} + + +#define LOCAL_SIZE_0 256 +#define NUM_BINS 256 +#if USE_DP_FLOAT == 1 +typedef double acc_type; +typedef ulong acc_int_type; +#define as_acc_type as_double +#define as_acc_int_type as_ulong +#else +typedef float acc_type; +typedef uint acc_int_type; +#define as_acc_type as_float +#define as_acc_int_type as_uint +#endif +//#define LOCAL_MEM_SIZE (4 * (sizeof(uint) + 2 * sizeof(acc_type)) * NUM_BINS) +#define LOCAL_MEM_SIZE ((sizeof(uint) + 2 * sizeof(acc_type)) * NUM_BINS) + +// unroll the atomic operation for a few times. Takes more code space, +// but compiler can generate better code for faster atomics. 
+#define UNROLL_ATOMIC 1 + +// Options passed by compiler at run time: +// IGNORE_INDICES will be set when the kernel does not +//#define IGNORE_INDICES +//#define POWER_FEATURE_WORKGROUPS 10 + +// detect Nvidia platforms +#ifdef cl_nv_pragma_unroll +#define NVIDIA 1 +#endif + +// use all features and do not use feature mask +#ifndef ENABLE_ALL_FEATURES +#define ENABLE_ALL_FEATURES 1 +#endif + +// use binary patching for AMD GCN 1.2 or newer +#ifndef AMD_USE_DS_ADD_F32 +#define AMD_USE_DS_ADD_F32 0 +#endif + +typedef uint data_size_t; +typedef float score_t; + + +// define all of the different kernels + +#define DECLARE_CONST_BUF(name) \ +__global__ void name(__global const uchar* restrict feature_data_base, \ + const uchar* restrict feature_masks,\ + const data_size_t feature_size,\ + const data_size_t* restrict data_indices, \ + const data_size_t num_data, \ + const score_t* restrict ordered_gradients, \ + const score_t* restrict ordered_hessians,\ + char* __restrict__ output_buf,\ + volatile int * sync_counters,\ + acc_type* __restrict__ hist_buf_base, \ + const size_t power_feature_workgroups); + + +#define DECLARE_CONST_HES_CONST_BUF(name) \ +__global__ void name(const uchar* __restrict__ feature_data_base, \ + const uchar* __restrict__ feature_masks,\ + const data_size_t feature_size,\ + const data_size_t* __restrict__ data_indices, \ + const data_size_t num_data, \ + const score_t* __restrict__ ordered_gradients, \ + const score_t const_hessian,\ + char* __restrict__ output_buf,\ + volatile int * sync_counters,\ + acc_type* __restrict__ hist_buf_base, \ + const size_t power_feature_workgroups); + + + +#define DECLARE_CONST_HES(name) \ +__global__ void name(const uchar* feature_data_base, \ + const uchar* __restrict__ feature_masks,\ + const data_size_t feature_size,\ + const data_size_t* data_indices, \ + const data_size_t num_data, \ + const score_t* ordered_gradients, \ + const score_t const_hessian,\ + char* __restrict__ output_buf, \ + volatile int * sync_counters,\ + acc_type* __restrict__ hist_buf_base, \ + const size_t power_feature_workgroups); + + +#define DECLARE(name) \ +__global__ void name(const uchar* feature_data_base, \ + const uchar* __restrict__ feature_masks,\ + const data_size_t feature_size,\ + const data_size_t* data_indices, \ + const data_size_t num_data, \ + const score_t* ordered_gradients, \ + const score_t* ordered_hessians,\ + char* __restrict__ output_buf, \ + volatile int * sync_counters,\ + acc_type* __restrict__ hist_buf_base, \ + const size_t power_feature_workgroups); + + +DECLARE_CONST_HES(histogram256_allfeats); +DECLARE_CONST_HES(histogram256_fulldata); +DECLARE_CONST_HES(histogram256); +DECLARE(histogram256_allfeats); +DECLARE(histogram256_fulldata); +DECLARE(histogram256); + +#endif // _HITOGRAM_256_KERNEL_ From fc981bf68e13dea264a95cfa4c081dd90b1faf5d Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Mon, 30 Mar 2020 17:08:38 +0000 Subject: [PATCH 003/119] Initial CUDA work --- src/boosting/gbdt.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 5f7aac08640..a10f77f574d 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -276,7 +276,7 @@ void GBDT::Bagging(int iter) { tmp_subset_->CopySubset(train_data_, bag_data_indices_.data(), bag_data_cnt_, false); - tree_learner_->ResetTrainingData(tmp_subset_.get()); + tree_learner_->ResetTrainingData(tmp_subset_.get(), is_constant_hessian_ ); } } } From 581ce4a5fa38b3aa4b8bd5c5076716efe974de38 Mon Sep 17 00:00:00 2001 
From: Gordon Fossum Date: Mon, 30 Mar 2020 17:35:27 +0000 Subject: [PATCH 004/119] Initial CUDA work --- src/treelearner/parallel_tree_learner.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/treelearner/parallel_tree_learner.h b/src/treelearner/parallel_tree_learner.h index 35ac432eba3..2fdf542d421 100644 --- a/src/treelearner/parallel_tree_learner.h +++ b/src/treelearner/parallel_tree_learner.h @@ -14,6 +14,7 @@ #include "gpu_tree_learner.h" #include "serial_tree_learner.h" +#include "cuda_tree_learner.h" namespace LightGBM { From 3f98c73e80b55721e08868d4718a1b9c7eb1a5e8 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Mon, 30 Mar 2020 21:18:25 +0000 Subject: [PATCH 005/119] Initial CUDA work --- src/boosting/gbdt.cpp | 12 ++++++++++-- src/io/dense_nbits_bin.hpp | 2 +- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index a10f77f574d..58a1976d96b 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -96,7 +96,15 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective tree_learner_ = std::unique_ptr(TreeLearner::CreateTreeLearner(config_->tree_learner, config_->device_type, config_.get())); // init tree learner - tree_learner_->Init(train_data_, is_constant_hessian_); + // LGBM_CUDA do not copy feature is is_use_subset for initialization + // LGBM_CUDA initialize training data info with bagging data size (tmp_subset_) + + if (config_->device_type == std::string("cuda")) { + tree_learner_->Init(tmp_subset_.get(), is_constant_hessian_, is_use_subset_); + } else { + tree_learner_->Init(train_data_, is_constant_hessian_, is_use_subset_); + } + tree_learner_->SetForcedSplit(&forced_splits_json_); // push training metrics @@ -274,7 +282,7 @@ void GBDT::Bagging(int iter) { tmp_hessians_.resize(bag_gh_size); } - tmp_subset_->CopySubset(train_data_, bag_data_indices_.data(), bag_data_cnt_, false); + tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(), bag_data_cnt_, false); tree_learner_->ResetTrainingData(tmp_subset_.get(), is_constant_hessian_ ); } diff --git a/src/io/dense_nbits_bin.hpp b/src/io/dense_nbits_bin.hpp index adf99115626..89b9159b390 100644 --- a/src/io/dense_nbits_bin.hpp +++ b/src/io/dense_nbits_bin.hpp @@ -346,7 +346,7 @@ class Dense4bitsBin : public Bin { } } - void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override { + void CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override { auto other_bin = dynamic_cast(full_bin); const data_size_t rest = num_used_indices & 1; for (int i = 0; i < num_used_indices - rest; i += 2) { From 2ec9a412dcfb13d72e87f93611b2c554b665eaad Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Mon, 30 Mar 2020 22:07:12 +0000 Subject: [PATCH 006/119] Initial CUDA work --- include/LightGBM/tree_learner.h | 3 ++- src/boosting/gbdt.cpp | 3 ++- src/boosting/rf.hpp | 2 +- src/treelearner/gpu_tree_learner.h | 3 ++- src/treelearner/serial_tree_learner.cpp | 20 ++++++++++++++------ src/treelearner/serial_tree_learner.h | 9 ++++++--- 6 files changed, 27 insertions(+), 13 deletions(-) diff --git a/include/LightGBM/tree_learner.h b/include/LightGBM/tree_learner.h index 3bc246e8426..6c549a5ed71 100644 --- a/include/LightGBM/tree_learner.h +++ b/include/LightGBM/tree_learner.h @@ -54,9 +54,10 @@ class TreeLearner { * \brief training tree model on dataset * \param gradients The first order gradients * \param hessians The second order gradients + * 
\param is_constant_hessian True if all hessians share the same value * \return A trained tree */ - virtual Tree* Train(const score_t* gradients, const score_t* hessians) = 0; + virtual Tree* Train(const score_t* gradients, const score_t* hessians, bool is_constant_hessian, Json& forced_split_json) = 0; /*! * \brief use an existing tree to fit the new gradients and hessians. diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 58a1976d96b..a4f653bfa54 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -546,7 +546,8 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { grad = gradients_.data() + offset; hess = hessians_.data() + offset; } - new_tree.reset(tree_learner_->Train(grad, hess)); + // LGBM_CUDA + new_tree.reset(tree_learner_->Train(grad, hess, is_constant_hessian_, forced_splits_json_)); } if (new_tree->num_leaves() > 1) { diff --git a/src/boosting/rf.hpp b/src/boosting/rf.hpp index 5c90202a515..e64bf6cb4d8 100644 --- a/src/boosting/rf.hpp +++ b/src/boosting/rf.hpp @@ -125,7 +125,7 @@ class RF : public GBDT { hess = tmp_hess_.data(); } - new_tree.reset(tree_learner_->Train(grad, hess)); + new_tree.reset(tree_learner_->Train(grad, hess, is_constant_hessian_, forced_splits_json_)); } if (new_tree->num_leaves() > 1) { diff --git a/src/treelearner/gpu_tree_learner.h b/src/treelearner/gpu_tree_learner.h index a909c57cbad..ba48f030441 100644 --- a/src/treelearner/gpu_tree_learner.h +++ b/src/treelearner/gpu_tree_learner.h @@ -48,7 +48,8 @@ class GPUTreeLearner: public SerialTreeLearner { void Init(const Dataset* train_data, bool is_constant_hessian) override; void ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) override; void ResetIsConstantHessian(bool is_constant_hessian) override; - Tree* Train(const score_t* gradients, const score_t *hessians) override; + Tree* Train(const score_t* gradients, const score_t *hessians, + bool is_constant_hessian, Json& forced_split_json) override; void SetBaggingData(const Dataset* subset, const data_size_t* used_indices, data_size_t num_data) override { SerialTreeLearner::SetBaggingData(subset, used_indices, num_data); diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 6b02411127a..5d2b9afff50 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -30,6 +30,7 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian train_data_ = train_data; num_data_ = train_data_->num_data(); num_features_ = train_data_->num_features(); + is_constant_hessian_ = is_constant_hessian; int max_cache_size = 0; // Get the max size of pool if (config_->histogram_pool_size <= 0) { @@ -148,10 +149,11 @@ void SerialTreeLearner::ResetConfig(const Config* config) { constraints_.reset(LeafConstraintsBase::Create(config_, config_->num_leaves)); } -Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians) { +Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians, bool is_constant_hessian, Json& forced_split_json) { Common::FunctionTimer fun_timer("SerialTreeLearner::Train", global_timer); gradients_ = gradients; hessians_ = hessians; + is_constant_hessian_ = is_constant_hessian; int num_threads = OMP_NUM_THREADS(); if (share_state_->num_threads != num_threads && share_state_->num_threads > 0) { Log::Warning( @@ -175,7 +177,12 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t 
*hessians // only root leaf can be splitted on first time int right_leaf = -1; - int init_splits = ForceSplits(tree_prt, &left_leaf, &right_leaf, &cur_depth); + int init_splits = 0; + bool aborted_last_force_split = false; + if (!forced_split_json.is_null()) { + init_splits = ForceSplits(tree.get(), forced_split_json, &left_leaf, + &right_leaf, &cur_depth, &aborted_last_force_split); + } for (int split = init_splits; split < config_->num_leaves - 1; ++split) { // some initial works before finding best split @@ -434,8 +441,9 @@ void SerialTreeLearner::FindBestSplitsFromHistograms( } } -int32_t SerialTreeLearner::ForceSplits(Tree* tree, int* left_leaf, - int* right_leaf, int *cur_depth) { +int32_t SerialTreeLearner::ForceSplits(Tree* tree, Json& forced_split_json, int* left_leaf, + int* right_leaf, int *cur_depth, + bool *aborted_last_force_split) { bool abort_last_forced_split = false; if (forced_split_json_ == nullptr) { return 0; @@ -444,11 +452,11 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, int* left_leaf, // start at root leaf *left_leaf = 0; std::queue> q; - Json left = *forced_split_json_; + Json left = forced_split_json; Json right; bool left_smaller = true; std::unordered_map forceSplitMap; - q.push(std::make_pair(left, *left_leaf)); + q.push(std::make_pair(forced_split_json, *left_leaf)); while (!q.empty()) { // before processing next node from queue, store info for current left/right leaf // store "best split" for left and right, even if they might be overwritten by forced split diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index fab28542e03..367c262192c 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -79,7 +79,8 @@ class SerialTreeLearner: public TreeLearner { } } - Tree* Train(const score_t* gradients, const score_t *hessians) override; + Tree* Train(const score_t* gradients, const score_t *hessians, bool is_constant_hessian, + Json& forced_split_json) override; Tree* FitByExistingTree(const Tree* old_tree, const score_t* gradients, const score_t* hessians) const override; @@ -162,8 +163,9 @@ class SerialTreeLearner: public TreeLearner { bool update_cnt); /* Force splits with forced_split_json dict and then return num splits forced.*/ - int32_t ForceSplits(Tree* tree, int* left_leaf, int* right_leaf, - int* cur_depth); + virtual int32_t ForceSplits(Tree* tree, Json& forced_split_json, int* left_leaf, + int* right_leaf, int* cur_depth, + bool *aborted_last_force_split); /*! * \brief Get the number of data in a leaf @@ -226,6 +228,7 @@ class SerialTreeLearner: public TreeLearner { const Json* forced_split_json_; std::unique_ptr share_state_; std::unique_ptr cegb_; + bool is_constant_hessian_; }; inline data_size_t SerialTreeLearner::GetGlobalDataCountInLeaf(int leaf_idx) const { From 1023182a5ca5a0699f90477dad7c96c4f234575a Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Tue, 31 Mar 2020 17:10:42 +0000 Subject: [PATCH 007/119] Initial CUDA work --- build_LGBM.232.sh | 3 +-- src/boosting/gbdt.cpp | 2 +- src/io/dense_nbits_bin.hpp | 8 +++---- src/treelearner/cuda_tree_learner.cpp | 32 +++++++++++++-------------- src/treelearner/cuda_tree_learner.h | 12 +++------- 5 files changed, 25 insertions(+), 32 deletions(-) diff --git a/build_LGBM.232.sh b/build_LGBM.232.sh index 24b50c7dfda..5e500327108 100755 --- a/build_LGBM.232.sh +++ b/build_LGBM.232.sh @@ -2,6 +2,5 @@ rm -rf build mkdir build cd build -#cmake -DUSE_CUDA=1 .. -cmake .. +cmake -DUSE_CUDA=1 .. 
make -j40 diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index a4f653bfa54..580c52fd889 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -284,7 +284,7 @@ void GBDT::Bagging(int iter) { tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(), bag_data_cnt_, false); - tree_learner_->ResetTrainingData(tmp_subset_.get(), is_constant_hessian_ ); + tree_learner_->ResetTrainingData(tmp_subset_.get(), is_constant_hessian_); } } } diff --git a/src/io/dense_nbits_bin.hpp b/src/io/dense_nbits_bin.hpp index 89b9159b390..5eb215fad8a 100644 --- a/src/io/dense_nbits_bin.hpp +++ b/src/io/dense_nbits_bin.hpp @@ -75,7 +75,7 @@ class Dense4bitsBin : public Bin { void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data, const score_t* ordered_gradients, const score_t* ordered_hessians, - HistogramBinEntry* out) const override { + hist_t* out) const override { const data_size_t rest = num_data & 0x3; data_size_t i = 0; for (; i < num_data - rest; i += 4) { @@ -118,7 +118,7 @@ class Dense4bitsBin : public Bin { void ConstructHistogram(data_size_t num_data, const score_t* ordered_gradients, const score_t* ordered_hessians, - HistogramBinEntry* out) const override { + hist_t* out) const override { const data_size_t rest = num_data & 0x3; data_size_t i = 0; @@ -153,7 +153,7 @@ class Dense4bitsBin : public Bin { void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data, const score_t* ordered_gradients, - HistogramBinEntry* out) const override { + hist_t* out) const override { const data_size_t rest = num_data & 0x3; data_size_t i = 0; for (; i < num_data - rest; i += 4) { @@ -190,7 +190,7 @@ class Dense4bitsBin : public Bin { void ConstructHistogram(data_size_t num_data, const score_t* ordered_gradients, - HistogramBinEntry* out) const override { + hist_t* out) const override { const data_size_t rest = num_data & 0x3; data_size_t i = 0; for (; i < num_data - rest; i += 4) { diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index c45df55cacf..d2064843ead 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -83,7 +83,7 @@ void CUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, // some functions used for debugging the GPU histogram construction -void PrintHistograms(HistogramBinEntry* h, size_t size) { +void PrintHistograms(hist_t* h, size_t size) { size_t total = 0; for (size_t i = 0; i < size; ++i) { printf("%03lu=%9.3g,%9.3g,%7d\t", i, h[i].sum_gradients, h[i].sum_hessians, h[i].cnt); @@ -104,7 +104,7 @@ union Float_t }; -void CompareHistograms(HistogramBinEntry* h1, HistogramBinEntry* h2, size_t size, int feature_id) { +void CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id) { size_t i; Float_t a, b; @@ -221,7 +221,7 @@ void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featu template -void CUDATreeLearner::WaitAndGetHistograms(HistogramBinEntry* histograms) { +void CUDATreeLearner::WaitAndGetHistograms(hist_t* histograms) { HistType* hist_outputs = (HistType*) host_histogram_outputs_; //#pragma omp parallel for schedule(static, num_gpu_) @@ -321,7 +321,7 @@ void CUDATreeLearner::prevAllocateGPUMemory() { memset(ptr_pinned_feature_masks_, 0, num_dense_feature_groups_); // histogram bin entry size depends on the precision (single/double) - hist_bin_entry_sz_ = config_->gpu_use_dp ? sizeof(HistogramBinEntry) : sizeof(GPUHistogramBinEntry); + hist_bin_entry_sz_ = config_->gpu_use_dp ? 
sizeof(hist_t) : sizeof(gpu_hist_t); // host_size histogram outputs // host_histogram_outputs_ = malloc(num_dense_feature_groups_ * device_bin_size_ * hist_bin_entry_sz_); @@ -591,12 +591,12 @@ Tree* CUDATreeLearner::Train(const score_t* gradients, const score_t *hessians, return ret; } -void CUDATreeLearner::ResetTrainingData(const Dataset* train_data) { +void CUDATreeLearner::ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) { // LGBM_CUDA: check data size data_size_t old_num_data = num_data_; - SerialTreeLearner::ResetTrainingData(train_data); + SerialTreeLearner::ResetTrainingDataInner(train_data, is_constant_hessian, reset_multi_val_bin); #if ResetTrainingData_DEBUG == 1 // LGBM_CUDA serial_time = std::chrono::steady_clock::now() - start_serial_time; @@ -889,7 +889,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ } // construct smaller leaf - HistogramBinEntry* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - 1; + hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - 1; // Check workgroups per feature4 tuple.. int exp_workgroups_per_feature = GetNumWorkgroupsPerFeature(smaller_leaf_splits_->num_data_in_leaf()); @@ -918,11 +918,11 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ if (is_gpu_used) { if (config_->gpu_use_dp) { // use double precision - WaitAndGetHistograms(ptr_smaller_leaf_hist_data); + WaitAndGetHistograms(ptr_smaller_leaf_hist_data); } else { // use single precision - WaitAndGetHistograms(ptr_smaller_leaf_hist_data); + WaitAndGetHistograms(ptr_smaller_leaf_hist_data); } } @@ -936,13 +936,13 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ continue; int dense_feature_group_index = dense_feature_group_map_[i]; size_t size = train_data_->FeatureGroupNumBin(dense_feature_group_index); - HistogramBinEntry* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - 1; - HistogramBinEntry* current_histogram = ptr_smaller_leaf_hist_data + train_data_->GroupBinBoundary(dense_feature_group_index); - HistogramBinEntry* gpu_histogram = new HistogramBinEntry[size]; + hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - 1; + hist_t* current_histogram = ptr_smaller_leaf_hist_data + train_data_->GroupBinBoundary(dense_feature_group_index); + hist_t* gpu_histogram = new hist_t[size]; data_size_t num_data = smaller_leaf_splits_->num_data_in_leaf(); std::copy(current_histogram, current_histogram + size, gpu_histogram); - std::memset(current_histogram, 0, train_data_->FeatureGroupNumBin(dense_feature_group_index) * sizeof(HistogramBinEntry)); + std::memset(current_histogram, 0, train_data_->FeatureGroupNumBin(dense_feature_group_index) * sizeof(hist_t)); if ( num_data == num_data_ ) { if ( is_constant_hessian_ ) { printf("ConstructHistogram(): num_data == num_data_ is_constant_hessian_"); @@ -991,7 +991,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ // construct larger leaf - HistogramBinEntry* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - 1; + hist_t* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - 1; is_gpu_used = ConstructGPUHistogramsAsync(is_feature_used, larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf()); @@ -1013,11 +1013,11 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ if (is_gpu_used) { if (config_->gpu_use_dp) { // 
use double precision - WaitAndGetHistograms(ptr_larger_leaf_hist_data); + WaitAndGetHistograms(ptr_larger_leaf_hist_data); } else { // use single precision - WaitAndGetHistograms(ptr_larger_leaf_hist_data); + WaitAndGetHistograms(ptr_larger_leaf_hist_data); } } } diff --git a/src/treelearner/cuda_tree_learner.h b/src/treelearner/cuda_tree_learner.h index e5a24aeb8f5..8a0e5d7cb20 100644 --- a/src/treelearner/cuda_tree_learner.h +++ b/src/treelearner/cuda_tree_learner.h @@ -39,7 +39,7 @@ class CUDATreeLearner: public SerialTreeLearner { ~CUDATreeLearner(); // LGBM_CUDA: is_use_subset is used by CUDA only void Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) override; - void ResetTrainingData(const Dataset* train_data) override; + void ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) override; Tree* Train(const score_t* gradients, const score_t *hessians, bool is_constant_hessian, Json& forced_split_json) override; @@ -68,13 +68,7 @@ class CUDATreeLearner: public SerialTreeLearner { // uint8_t s[4]; //}; - /*! \brief Single precision histogram entry for GPU */ - struct GPUHistogramBinEntry { - score_t sum_gradients; - score_t sum_hessians; - uint32_t cnt; - }; - + typedef float gpu_hist_t; /*! * \brief Find the best number of workgroups processing one feature for maximizing efficiency @@ -158,7 +152,7 @@ class CUDATreeLearner: public SerialTreeLearner { * \param histograms Destination of histogram results from GPU. */ template - void WaitAndGetHistograms(HistogramBinEntry* histograms); + void WaitAndGetHistograms(hist_t* histograms); /*! * \brief Construct GPU histogram asynchronously. From e50e4be629b064306b9771073e6f435e5e225da1 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Tue, 31 Mar 2020 18:18:43 +0000 Subject: [PATCH 008/119] Initial CUDA work --- src/treelearner/cuda_tree_learner.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/treelearner/cuda_tree_learner.h b/src/treelearner/cuda_tree_learner.h index 8a0e5d7cb20..93bbdc483b7 100644 --- a/src/treelearner/cuda_tree_learner.h +++ b/src/treelearner/cuda_tree_learner.h @@ -43,11 +43,11 @@ class CUDATreeLearner: public SerialTreeLearner { Tree* Train(const score_t* gradients, const score_t *hessians, bool is_constant_hessian, Json& forced_split_json) override; - void SetBaggingData(const data_size_t* used_indices, data_size_t num_data) override { - SerialTreeLearner::SetBaggingData(used_indices, num_data); + void SetBaggingData(const Dataset* subset, const data_size_t* used_indices, data_size_t num_data) override { + SerialTreeLearner::SetBaggingData(subset, used_indices, num_data); // determine if we are using bagging before we construct the data partition // thus we can start data movement to GPU earlier - if (used_indices != nullptr) { + if (subset == nullptr && used_indices != nullptr) { if (num_data != num_data_) { use_bagging_ = true; return; From 3d6201801911ba041d5ea9bdec55efe3d37cd8e3 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Tue, 31 Mar 2020 18:33:44 +0000 Subject: [PATCH 009/119] Initial CUDA work --- src/io/dense_nbits_bin.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/io/dense_nbits_bin.hpp b/src/io/dense_nbits_bin.hpp index 5eb215fad8a..85ea5b311d2 100644 --- a/src/io/dense_nbits_bin.hpp +++ b/src/io/dense_nbits_bin.hpp @@ -310,7 +310,7 @@ class Dense4bitsBin : public Bin { void* get_data() override { return data_.data(); } /*! 
\brief not ordered bin for dense feature */ - OrderedBin* CreateOrderedBin() const override { return nullptr; } + Bin* CreateDenseBin() const { return nullptr; } void FinishLoad() override { if (buf_.empty()) { return; } From 7a6bf3321365ce8b8b4c354f9cae065956ab0045 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Tue, 31 Mar 2020 22:26:11 +0000 Subject: [PATCH 010/119] Initial CUDA work --- CMakeLists.txt | 81 ++++++++++++++++---------------------------------- 1 file changed, 26 insertions(+), 55 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4d81828b640..104339756ce 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -184,53 +184,30 @@ if(USE_CUDA) message(STATUS ALLFEATS_DEFINES: ${ALLFEATS_DEFINES}) message(STATUS FULLDATA_DEFINES: ${FULLDATA_DEFINES}) - add_library(histo256_sp_const OBJECT src/treelearner/kernels/histogram256.cu) - set_target_properties(histo256_sp_const PROPERTIES CUDA_SEPARABLE_COMPILATION ON) - target_compile_definitions( - histo256_sp_const PRIVATE - -DCONST_HESSIAN=1 - ${BASE_DEFINES} - ) - - add_library(histo256_sp OBJECT src/treelearner/kernels/histogram256.cu) - set_target_properties(histo256_sp PROPERTIES CUDA_SEPARABLE_COMPILATION ON) - target_compile_definitions( - histo256_sp PRIVATE - -DCONST_HESSIAN=0 - ${BASE_DEFINES} - ) - - add_library(histo256-allfeats_sp_const OBJECT src/treelearner/kernels/histogram256.cu) - set_target_properties(histo256-allfeats_sp_const PROPERTIES CUDA_SEPARABLE_COMPILATION ON) - target_compile_definitions( - histo256-allfeats_sp_const PRIVATE - -DCONST_HESSIAN=1 - ${ALLFEATS_DEFINES} - ) + function(add_histogram hsize hname hadd hconst hdir) + add_library(histo${hsize}${hname} OBJECT src/treelearner/kernels/histogram${hsize}.cu) + set_target_properties(histo${hsize}${hname} PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + if(hadd) + list(APPEND histograms histo${hsize}${hname}) + set(histograms ${histograms} PARENT_SCOPE) + endif() + target_compile_definitions( + histo${hsize}${hname} PRIVATE + -DCONST_HESSIAN=${hconst} + ${hdir} + ) + endfunction() + + #foreach (hsize 16 64 256) + foreach (hsize 256) + add_histogram("${hsize}" "_sp_const" "True" "1" "${BASE_DEFINES}") + add_histogram("${hsize}" "_sp" "True" "0" "${BASE_DEFINES}") + add_histogram("${hsize}" "-allfeats_sp_const" "False" "1" "${ALLFEATS_DEFINES}") + add_histogram("${hsize}" "-allfeats_sp" "False" "0" "${ALLFEATS_DEFINES}") + add_histogram("${hsize}" "-fulldata_sp_const" "True" "1" "${FULLDATA_DEFINES}") + add_histogram("${hsize}" "-fulldata_sp" "True" "0" "${FULLDATA_DEFINES}") + endforeach() - add_library(histo256-allfeats_sp OBJECT src/treelearner/kernels/histogram256.cu) - set_target_properties(histo256-allfeats_sp PROPERTIES CUDA_SEPARABLE_COMPILATION ON) - target_compile_definitions( - histo256-allfeats_sp PRIVATE - -DCONST_HESSIAN=0 - ${ALLFEATS_DEFINES} - ) - - add_library(histo256-fulldata_sp_const OBJECT src/treelearner/kernels/histogram256.cu) - set_target_properties(histo256-fulldata_sp_const PROPERTIES CUDA_SEPARABLE_COMPILATION ON) - target_compile_definitions( - histo256-fulldata_sp_const PRIVATE - -DCONST_HESSIAN=1 - ${FULLDATA_DEFINES} - ) - - add_library(histo256-fulldata_sp OBJECT src/treelearner/kernels/histogram256.cu) - set_target_properties(histo256-fulldata_sp PROPERTIES CUDA_SEPARABLE_COMPILATION ON) - target_compile_definitions( - histo256-fulldata_sp PRIVATE - -DCONST_HESSIAN=0 - ${FULLDATA_DEFINES} - ) endif(USE_CUDA) if(USE_HDFS) @@ -340,7 +317,7 @@ file(GLOB SOURCES src/network/*.cpp src/treelearner/*.cpp #ifdef USE_CUDA 
- src/treelearner/*cu + src/treelearner/*.cu #endif ) @@ -417,17 +394,11 @@ endif(USE_GPU) if(USE_CUDA) TARGET_LINK_LIBRARIES( lightgbm - histo256_sp_const - histo256_sp - histo256-fulldata_sp_const - histo256-fulldata_sp + ${histograms} ) TARGET_LINK_LIBRARIES( _lightgbm - histo256_sp_const - histo256_sp - histo256-fulldata_sp_const - histo256-fulldata_sp + ${histograms} ) endif(USE_CUDA) From 01b32269366888980303eb43b0edb2c2245e284c Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Wed, 1 Apr 2020 16:50:09 +0000 Subject: [PATCH 011/119] Initial CUDA work --- CMakeLists.txt | 11 ++--------- python-package/setup.py | 21 ++++++++++++++++++--- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 104339756ce..dc87f65bd67 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,3 @@ -#LGBM_CUDA Added USE_CUDA flag if(USE_GPU OR APPLE) cmake_minimum_required(VERSION 3.2) elseif(USE_CUDA) @@ -8,7 +7,6 @@ else() cmake_minimum_required(VERSION 2.8) endif() -#LGBM_CUDA if(USE_CUDA) PROJECT(lightgbm LANGUAGES C CXX CUDA) else() @@ -23,7 +21,8 @@ OPTION(USE_GPU "Enable GPU-accelerated training" OFF) OPTION(USE_SWIG "Enable SWIG to generate Java API" OFF) OPTION(USE_HDFS "Enable HDFS support (EXPERIMENTAL)" OFF) OPTION(USE_R35 "Set to ON if your R version is not earlier than 3.5" OFF) -OPTION(USE_CUDA "Enable CUDA-accelerated training" OFF) # LGBM_CUDA +OPTION(USE_TIMETAG "Set to ON to output time costs" OFF) +OPTION(USE_CUDA "Enable CUDA-accelerated training" OFF) OPTION(USE_DEBUG "Set to ON for Debug mode" OFF) OPTION(BUILD_FOR_R "Set to ON if building lib_lightgbm for use with the R package" OFF) @@ -138,7 +137,6 @@ if(USE_GPU) ADD_DEFINITIONS(-DUSE_GPU) endif(USE_GPU) -#LGBM_CUDA CUDA-specific code if(USE_CUDA) find_package(CUDA REQUIRED) include_directories(${CUDA_INCLUDE_DIRS}) @@ -166,12 +164,10 @@ if(USE_CUDA) -DPOWER_FEATURE_WORKGROUPS=12 -DUSE_CONSTANT_BUF=0 ) - set(ALLFEATS_DEFINES ${BASE_DEFINES} -DENABLE_ALL_FEATURES ) - set(FULLDATA_DEFINES ${ALLFEATS_DEFINES} -DIGNORE_INDICES @@ -180,7 +176,6 @@ if(USE_CUDA) #string(REPLACE ";" " " BASE_DEFINES "${BASE_DEFINES}") #string(REPLACE ";" " " ALLFEATS_DEFINES "${ALLFEATS_DEFINES}") #string(REPLACE ";" " " FULLDATA_DEFINES "${FULLDATA_DEFINES}") - message(STATUS ALLFEATS_DEFINES: ${ALLFEATS_DEFINES}) message(STATUS FULLDATA_DEFINES: ${FULLDATA_DEFINES}) @@ -307,7 +302,6 @@ if(USE_MPI) include_directories(${MPI_CXX_INCLUDE_PATH}) endif(USE_MPI) -#LGBM_CUDA file(GLOB SOURCES src/application/*.cpp src/boosting/*.cpp @@ -390,7 +384,6 @@ if(USE_GPU) TARGET_LINK_LIBRARIES(_lightgbm ${OpenCL_LIBRARY} ${Boost_LIBRARIES}) endif(USE_GPU) -#LGBM_CUDA if(USE_CUDA) TARGET_LINK_LIBRARIES( lightgbm diff --git a/python-package/setup.py b/python-package/setup.py index 9d8853ddf94..f66475cb27d 100644 --- a/python-package/setup.py +++ b/python-package/setup.py @@ -86,10 +86,11 @@ def silent_call(cmd, raise_error=False, error_msg=''): return 1 -def compile_cpp(use_mingw=False, use_gpu=False, use_mpi=False, +def compile_cpp(use_mingw=False, use_gpu=False, use_cuda=False, use_mpi=False, use_hdfs=False, boost_root=None, boost_dir=None, boost_include_dir=None, boost_librarydir=None, opencl_include_dir=None, opencl_library=None, + openmp_include_dir=None, openmp_library=None, nomp=False, bit32=False): if os.path.exists(os.path.join(CURRENT_DIR, "build_cpp")): @@ -114,6 +115,12 @@ def compile_cpp(use_mingw=False, use_gpu=False, use_mpi=False, cmake_cmd.append("-DOpenCL_INCLUDE_DIR={0}".format(opencl_include_dir)) if 
opencl_library: cmake_cmd.append("-DOpenCL_LIBRARY={0}".format(opencl_library)) + elif use_cuda: + cmake_cmd.append("-DUSE_CUDA=ON") + if openmp_include_dir: + cmake_cmd.append("-DOpenMP_INCLUDE_DIR={0}".format(openmp_include_dir)) + if openmp_library: + cmake_cmd.append("-DOpenMP_LIBRARY={0}".format(openmp_library)) if use_mpi: cmake_cmd.append("-DUSE_MPI=ON") if nomp: @@ -187,6 +194,7 @@ class CustomInstall(install): user_options = install.user_options + [ ('mingw', 'm', 'Compile with MinGW'), ('gpu', 'g', 'Compile GPU version'), + ('cuda', 'c', 'Compile CUDA version'), ('mpi', None, 'Compile MPI version'), ('nomp', None, 'Compile version without OpenMP support'), ('hdfs', 'h', 'Compile HDFS version'), @@ -197,21 +205,27 @@ class CustomInstall(install): ('boost-include-dir=', None, 'Directory containing Boost headers'), ('boost-librarydir=', None, 'Preferred Boost library directory'), ('opencl-include-dir=', None, 'OpenCL include directory'), - ('opencl-library=', None, 'Path to OpenCL library') + ('opencl-library=', None, 'Path to OpenCL library'), + ('openmp-include-dir=', None, 'OpenMP include directory'), + ('openmp-library=', None, 'Path to OpenMP library') ] def initialize_options(self): install.initialize_options(self) self.mingw = 0 self.gpu = 0 + self.cuda = 0 self.boost_root = None self.boost_dir = None self.boost_include_dir = None self.boost_librarydir = None self.opencl_include_dir = None self.opencl_library = None + self.openmp_include_dir = None + self.openmp_library = None self.mpi = 0 self.hdfs = 0 + #self.precompile = 0 #TODO: revert this self.precompile = 1 self.nomp = 0 self.bit32 = 0 @@ -227,10 +241,11 @@ def run(self): open(LOG_PATH, 'wb').close() if not self.precompile: copy_files(use_gpu=self.gpu) - compile_cpp(use_mingw=self.mingw, use_gpu=self.gpu, use_mpi=self.mpi, + compile_cpp(use_mingw=self.mingw, use_gpu=self.gpu, use_cuda=self.cuda, use_mpi=self.mpi, use_hdfs=self.hdfs, boost_root=self.boost_root, boost_dir=self.boost_dir, boost_include_dir=self.boost_include_dir, boost_librarydir=self.boost_librarydir, opencl_include_dir=self.opencl_include_dir, opencl_library=self.opencl_library, + openmp_include_dir=self.openmp_include_dir, openmp_library=self.openmp_library, nomp=self.nomp, bit32=self.bit32) install.run(self) if os.path.isfile(LOG_PATH): From 64dbb6b844d6598d5db83d729e1067a17e39942f Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Thu, 2 Apr 2020 13:09:52 +0000 Subject: [PATCH 012/119] Initial CUDA work --- include/LightGBM/c_api.h | 6 ++ python-package/lightgbm/__init__.py | 3 +- python-package/lightgbm/basic.py | 5 ++ python-package/setup.py | 2 +- src/c_api.cpp | 10 +++ tests/python_package_test/test_basic.py | 2 + tests/python_package_test/test_engine.py | 89 +++++++++++++++++++++++- 7 files changed, 114 insertions(+), 3 deletions(-) diff --git a/include/LightGBM/c_api.h b/include/LightGBM/c_api.h index 9d7c6e61dd2..3fbccdac075 100644 --- a/include/LightGBM/c_api.h +++ b/include/LightGBM/c_api.h @@ -1076,6 +1076,12 @@ LIGHTGBM_C_EXPORT int LGBM_NetworkInitWithFunctions(int num_machines, #define THREAD_LOCAL thread_local /*!< \brief Thread local specifier. */ #endif +/*! + * * \brief Returns device type. + * * \return 0 = CPU, 1 = GPU / OCL, 2 = CUDA + * */ +LIGHTGBM_C_EXPORT int LGBM_GetDeviceType(); + /*! * \brief Handle of error message. 
* \return Error message diff --git a/python-package/lightgbm/__init__.py b/python-package/lightgbm/__init__.py index 390a6994a7a..44a56ae03f5 100644 --- a/python-package/lightgbm/__init__.py +++ b/python-package/lightgbm/__init__.py @@ -5,7 +5,7 @@ """ from __future__ import absolute_import -from .basic import Booster, Dataset +from .basic import Booster, Dataset, get_device_type from .callback import (early_stopping, print_evaluation, record_evaluation, reset_parameter) from .engine import cv, train @@ -30,6 +30,7 @@ __version__ = version_file.read().strip() __all__ = ['Dataset', 'Booster', + 'get_device_type', 'train', 'cv', 'LGBMModel', 'LGBMRegressor', 'LGBMClassifier', 'LGBMRanker', 'print_evaluation', 'record_evaluation', 'reset_parameter', 'early_stopping', diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 01a5f31e51b..9dace6b768c 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -432,6 +432,11 @@ def _load_pandas_categorical(file_name=None, model_str=None): return None +def get_device_type(): + """Get device type.""" + return _LIB.LGBM_GetDeviceType() + + class _InnerPredictor(object): """_InnerPredictor of LightGBM. diff --git a/python-package/setup.py b/python-package/setup.py index f66475cb27d..eca56783713 100644 --- a/python-package/setup.py +++ b/python-package/setup.py @@ -194,7 +194,7 @@ class CustomInstall(install): user_options = install.user_options + [ ('mingw', 'm', 'Compile with MinGW'), ('gpu', 'g', 'Compile GPU version'), - ('cuda', 'c', 'Compile CUDA version'), + ('cuda', None, 'Compile CUDA version'), ('mpi', None, 'Compile MPI version'), ('nomp', None, 'Compile version without OpenMP support'), ('hdfs', 'h', 'Compile HDFS version'), diff --git a/src/c_api.cpp b/src/c_api.cpp index f785bc74f19..03a3db597bb 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -647,6 +647,16 @@ int LGBM_RegisterLogCallback(void (*callback)(const char*)) { API_END(); } +int LGBM_GetDeviceType() { +#ifdef USE_GPU + return 1; +#elif USE_CUDA + return 2; +#else + return 0; // CPU +#endif +} + int LGBM_DatasetCreateFromFile(const char* filename, const char* parameters, const DatasetHandle reference, diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index 85e9e728d70..d984c25f65f 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -29,6 +29,8 @@ def test(self): "max_bin": 255, "gpu_use_dp": True } + if lgb.get_device_type() == 2: + params["device"] = "cuda" bst = lgb.Booster(params, train_data) bst.add_valid(valid_data, "valid_1") diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index dc48fc9d3a3..37894815f4a 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -61,6 +61,8 @@ def test_binary(self): 'verbose': -1, 'num_iteration': 50 # test num_iteration in dict here } + if lgb.get_device_type() == 2: + params["device"] = "cuda" lgb_train = lgb.Dataset(X_train, y_train) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) evals_result = {} @@ -87,6 +89,8 @@ def test_rf(self): 'metric': 'binary_logloss', 'verbose': -1 } + if lgb.get_device_type() == 2: + params["device"] = "cuda" lgb_train = lgb.Dataset(X_train, y_train) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) evals_result = {} @@ -106,6 +110,8 @@ def test_regression(self): 'metric': 'l2', 'verbose': -1 } + if lgb.get_device_type() == 2: + 
params["device"] = "cuda" lgb_train = lgb.Dataset(X_train, y_train) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) evals_result = {} @@ -133,6 +139,8 @@ def test_missing_value_handle(self): 'verbose': -1, 'boost_from_average': False } + if lgb.get_device_type() == 2: + params["device"] = "cuda" evals_result = {} gbm = lgb.train(params, lgb_train, num_boost_round=20, @@ -188,6 +196,8 @@ def test_missing_value_handle_na(self): 'min_data_in_bin': 1, 'zero_as_missing': False } + if lgb.get_device_type() == 2: + params["device"] = "cuda" evals_result = {} gbm = lgb.train(params, lgb_train, num_boost_round=1, @@ -220,6 +230,8 @@ def test_missing_value_handle_zero(self): 'min_data_in_bin': 1, 'zero_as_missing': True } + if lgb.get_device_type() == 2: + params["device"] = "cuda" evals_result = {} gbm = lgb.train(params, lgb_train, num_boost_round=1, @@ -252,6 +264,8 @@ def test_missing_value_handle_none(self): 'min_data_in_bin': 1, 'use_missing': False } + if lgb.get_device_type() == 2: + params["device"] = "cuda" evals_result = {} gbm = lgb.train(params, lgb_train, num_boost_round=1, @@ -290,6 +304,8 @@ def test_categorical_handle(self): 'zero_as_missing': True, 'categorical_column': 0 } + if lgb.get_device_type() == 2: + params["device"] = "cuda" evals_result = {} gbm = lgb.train(params, lgb_train, num_boost_round=1, @@ -327,6 +343,8 @@ def test_categorical_handle_na(self): 'zero_as_missing': False, 'categorical_column': 0 } + if lgb.get_device_type() == 2: + params["device"] = "cuda" evals_result = {} gbm = lgb.train(params, lgb_train, num_boost_round=1, @@ -385,6 +403,8 @@ def test_multiclass(self): 'num_class': 10, 'verbose': -1 } + if lgb.get_device_type() == 2: + params["device"] = "cuda" lgb_train = lgb.Dataset(X_train, y_train, params=params) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, params=params) evals_result = {} @@ -401,6 +421,7 @@ def test_multiclass_rf(self): X, y = load_digits(10, True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) params = { + 'device': 'cpu', 'boosting_type': 'rf', 'objective': 'multiclass', 'metric': 'multi_logloss', @@ -434,6 +455,8 @@ def test_multiclass_prediction_early_stopping(self): 'num_class': 10, 'verbose': -1 } + if lgb.get_device_type() == 2: + params["device"] = "cuda" lgb_train = lgb.Dataset(X_train, y_train, params=params) gbm = lgb.train(params, lgb_train, num_boost_round=50) @@ -455,6 +478,8 @@ def test_multi_class_error(self): X, y = load_digits(10, True) params = {'objective': 'multiclass', 'num_classes': 10, 'metric': 'multi_error', 'num_leaves': 4, 'verbose': -1} + if lgb.get_device_type() == 2: + params["device"] = "cuda" lgb_data = lgb.Dataset(X, label=y) est = lgb.train(params, lgb_data, num_boost_round=10) predict_default = est.predict(X) @@ -564,6 +589,8 @@ def test_early_stopping(self): 'metric': 'binary_logloss', 'verbose': -1 } + if lgb.get_device_type() == 2: + params["device"] = "cuda" X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) lgb_train = lgb.Dataset(X_train, y_train) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) @@ -597,6 +624,8 @@ def test_continue_train(self): 'metric': 'l1', 'verbose': -1 } + if lgb.get_device_type() == 2: + params["device"] = "cuda" lgb_train = lgb.Dataset(X_train, y_train, free_raw_data=False) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, free_raw_data=False) init_gbm = lgb.train(params, lgb_train, num_boost_round=20) @@ -662,6 +691,8 @@ def 
test_continue_train_multiclass(self): 'num_class': 3, 'verbose': -1 } + if lgb.get_device_type() == 2: + params["device"] = "cuda" lgb_train = lgb.Dataset(X_train, y_train, params=params, free_raw_data=False) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, params=params, free_raw_data=False) init_gbm = lgb.train(params, lgb_train, num_boost_round=20) @@ -718,6 +749,8 @@ def test_cv(self): q_train = np.loadtxt(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/lambdarank/rank.train.query')) params_lambdarank = {'objective': 'lambdarank', 'verbose': -1, 'eval_at': 3} + if lgb.get_device_type() == 2: + params_lambdarank["device"] = "cuda" lgb_train = lgb.Dataset(X_train, y_train, group=q_train) # ... with l2 metric cv_res_lambda = lgb.cv(params_lambdarank, lgb_train, num_boost_round=10, nfold=3, @@ -771,6 +804,8 @@ def train_and_predict(init_model=None, return_model=False): 'metric': 'l2', 'verbose': -1 } + if lgb.get_device_type() == 2: + params["device"] = "cuda" lgb_train = lgb.Dataset(X_train, y_train) gbm_template = lgb.train(params, lgb_train, num_boost_round=10, init_model=init_model) return gbm_template if return_model else mean_squared_error(y_test, gbm_template.predict(X_test)) @@ -824,6 +859,8 @@ def test_pandas_categorical(self): 'metric': 'binary_logloss', 'verbose': -1 } + if lgb.get_device_type() == 2: + params["device"] = "cuda" lgb_train = lgb.Dataset(X, y) gbm0 = lgb.train(params, lgb_train, num_boost_round=10) pred0 = gbm0.predict(X_test) @@ -920,6 +957,8 @@ def test_reference_chain(self): tmp_dat_train = tmp_dat.subset(np.arange(80)) tmp_dat_val = tmp_dat.subset(np.arange(80, 100)).subset(np.arange(18)) params = {'objective': 'regression_l2', 'metric': 'rmse'} + if lgb.get_device_type() == 2: + params["device"] = "cuda" evals_result = {} gbm = lgb.train(params, tmp_dat_train, num_boost_round=20, valid_sets=[tmp_dat_train, tmp_dat_val], @@ -935,6 +974,8 @@ def test_contribs(self): 'metric': 'binary_logloss', 'verbose': -1, } + if lgb.get_device_type() == 2: + params["device"] = "cuda" lgb_train = lgb.Dataset(X_train, y_train) gbm = lgb.train(params, lgb_train, num_boost_round=20) @@ -949,6 +990,8 @@ def train_and_get_predictions(features, labels): 'verbose': -1, 'min_data': 5, } + if lgb.get_device_type() == 2: + lgb_params["device"] = "cuda" gbm = lgb.train( params=lgb_params, train_set=dataset, @@ -1095,6 +1138,8 @@ def is_correctly_constrained(learner, x3_to_category=True): "monotone_constraints_method": monotone_constraints_method, "use_missing": False, } + if lgb.get_device_type() == 2: + params["device"] = "cuda" constrained_model = lgb.train(params, trainset) self.assertTrue(is_correctly_constrained(constrained_model, test_with_categorical_variable)) @@ -1215,6 +1260,7 @@ def test_refit(self): X, y = load_breast_cancer(True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) params = { + 'device': 'cpu', 'objective': 'binary', 'metric': 'binary_logloss', 'verbose': -1, @@ -1238,6 +1284,8 @@ def test_mape_rf(self): 'feature_fraction': 0.8, 'boost_from_average': True } + if lgb.get_device_type() == 2: + params["device"] = "cuda" lgb_train = lgb.Dataset(X, y) gbm = lgb.train(params, lgb_train, num_boost_round=20) pred = gbm.predict(X) @@ -1255,6 +1303,8 @@ def test_mape_dart(self): 'feature_fraction': 0.8, 'boost_from_average': False } + if lgb.get_device_type() == 2: + params["device"] = "cuda" lgb_train = lgb.Dataset(X, y) gbm = lgb.train(params, lgb_train, num_boost_round=40) pred = 
gbm.predict(X) @@ -1274,6 +1324,8 @@ def check_constant_features(self, y_true, expected_pred, more_params): 'min_data_in_bin': 1, 'boost_from_average': True } + if lgb.get_device_type() == 2: + params["device"] = "cuda" params.update(more_params) lgb_train = lgb.Dataset(X_train, y_train, params=params) gbm = lgb.train(params, lgb_train, num_boost_round=2) @@ -1284,6 +1336,8 @@ def test_constant_features_regression(self): params = { 'objective': 'regression' } + if lgb.get_device_type() == 2: + params["device"] = "cuda" self.check_constant_features([0.0, 10.0, 0.0, 10.0], 5.0, params) self.check_constant_features([0.0, 1.0, 2.0, 3.0], 1.5, params) self.check_constant_features([-1.0, 1.0, -2.0, 2.0], 0.0, params) @@ -1292,6 +1346,8 @@ def test_constant_features_binary(self): params = { 'objective': 'binary' } + if lgb.get_device_type() == 2: + params["device"] = "cuda" self.check_constant_features([0.0, 10.0, 0.0, 10.0], 0.5, params) self.check_constant_features([0.0, 1.0, 2.0, 3.0], 0.75, params) @@ -1300,6 +1356,8 @@ def test_constant_features_multiclass(self): 'objective': 'multiclass', 'num_class': 3 } + if lgb.get_device_type() == 2: + params["device"] = "cuda" self.check_constant_features([0.0, 1.0, 2.0, 0.0], [0.5, 0.25, 0.25], params) self.check_constant_features([0.0, 1.0, 2.0, 1.0], [0.25, 0.5, 0.25], params) @@ -1308,6 +1366,8 @@ def test_constant_features_multiclassova(self): 'objective': 'multiclassova', 'num_class': 3 } + if lgb.get_device_type() == 2: + params["device"] = "cuda" self.check_constant_features([0.0, 1.0, 2.0, 0.0], [0.5, 0.25, 0.25], params) self.check_constant_features([0.0, 1.0, 2.0, 1.0], [0.25, 0.5, 0.25], params) @@ -1327,6 +1387,8 @@ def preprocess_data(dtrain, dtest, params): X, y = load_iris(True) dataset = lgb.Dataset(X, y, free_raw_data=False) params = {'objective': 'multiclass', 'num_class': 3, 'verbose': -1} + if lgb.get_device_type() == 2: + params["device"] = "cuda" results = lgb.cv(params, dataset, num_boost_round=10, fpreproc=preprocess_data) self.assertIn('multi_logloss-mean', results) self.assertEqual(len(results['multi_logloss-mean']), 10) @@ -1339,14 +1401,28 @@ def test_metrics(self): evals_result = {} params_verbose = {'verbose': -1} + if lgb.get_device_type() == 2: + params_verbose["device"] = "cuda" params_obj_verbose = {'objective': 'binary', 'verbose': -1} + if lgb.get_device_type() == 2: + params_obj_verbose["device"] = "cuda" params_obj_metric_log_verbose = {'objective': 'binary', 'metric': 'binary_logloss', 'verbose': -1} + if lgb.get_device_type() == 2: + params_obj_metric_log_verbose["device"] = "cuda" params_obj_metric_err_verbose = {'objective': 'binary', 'metric': 'binary_error', 'verbose': -1} + if lgb.get_device_type() == 2: + params_obj_metric_err_verbose["device"] = "cuda" params_obj_metric_inv_verbose = {'objective': 'binary', 'metric': 'invalid_metric', 'verbose': -1} + if lgb.get_device_type() == 2: + params_obj_metric_inv_verbose["device"] = "cuda" params_obj_metric_multi_verbose = {'objective': 'binary', 'metric': ['binary_logloss', 'binary_error'], 'verbose': -1} + if lgb.get_device_type() == 2: + params_obj_metric_multi_verbose["device"] = "cuda" params_obj_metric_none_verbose = {'objective': 'binary', 'metric': 'None', 'verbose': -1} + if lgb.get_device_type() == 2: + params_obj_metric_none_verbose["device"] = "cuda" params_metric_log_verbose = {'metric': 'binary_logloss', 'verbose': -1} params_metric_err_verbose = {'metric': 'binary_error', 'verbose': -1} params_metric_inv_verbose = {'metric_types': 'invalid_metric', 
'verbose': -1} @@ -1355,7 +1431,6 @@ def test_metrics(self): def get_cv_result(params=params_obj_verbose, **kwargs): return lgb.cv(params, lgb_train, num_boost_round=2, verbose_eval=False, **kwargs) - def train_booster(params=params_obj_verbose, **kwargs): lgb.train(params, lgb_train, num_boost_round=2, @@ -1564,6 +1639,8 @@ def train_booster(params=params_obj_verbose, **kwargs): # remove default metric by 'None' aliases for na_alias in ('None', 'na', 'null', 'custom'): params = {'objective': 'binary', 'metric': na_alias, 'verbose': -1} + if lgb.get_device_type() == 2: + params["device"] = "cuda" train_booster(params=params) self.assertEqual(len(evals_result), 0) @@ -1644,8 +1721,14 @@ def train_booster(params=params_obj_verbose, **kwargs): obj_multi_aliases = ['multiclass', 'softmax', 'multiclassova', 'multiclass_ova', 'ova', 'ovr'] for obj_multi_alias in obj_multi_aliases: params_obj_class_3_verbose = {'objective': obj_multi_alias, 'num_class': 3, 'verbose': -1} + if lgb.get_device_type() == 2: + params_obj_class_3_verbose["device"] = "cuda" params_obj_class_1_verbose = {'objective': obj_multi_alias, 'num_class': 1, 'verbose': -1} + if lgb.get_device_type() == 2: + params_obj_class_1_verbose["device"] = "cuda" params_obj_verbose = {'objective': obj_multi_alias, 'verbose': -1} + if lgb.get_device_type() == 2: + params_obj_verbose["device"] = "cuda" # multiclass default metric res = get_cv_result(params_obj_class_3_verbose) self.assertEqual(len(res), 2) @@ -1686,6 +1769,8 @@ def train_booster(params=params_obj_verbose, **kwargs): self.assertRaises(lgb.basic.LightGBMError, get_cv_result, params_obj_class_3_verbose, metrics='binary_logloss') params_class_3_verbose = {'num_class': 3, 'verbose': -1} + if lgb.get_device_type() == 2: + params_class_3_verbose["device"] = "cuda" # non-default num_class for default objective self.assertRaises(lgb.basic.LightGBMError, get_cv_result, params_class_3_verbose) @@ -1820,6 +1905,8 @@ def metrics_combination_train_regression(valid_sets, metric_list, assumed_iterat 'verbose': -1, 'seed': 123 } + if lgb.get_device_type() == 2: + params["device"] = "cuda" gbm = lgb.train(dict(params, first_metric_only=first_metric_only), lgb_train, num_boost_round=25, valid_sets=valid_sets, feval=feval, early_stopping_rounds=5, verbose_eval=False) From e17b345c561461f09331c1e7bb31618dc83e676c Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Thu, 2 Apr 2020 20:01:39 +0000 Subject: [PATCH 013/119] Initial CUDA work --- src/io/dense_nbits_bin.hpp | 405 -------------------------- src/treelearner/cuda_tree_learner.cpp | 1 - 2 files changed, 406 deletions(-) delete mode 100644 src/io/dense_nbits_bin.hpp diff --git a/src/io/dense_nbits_bin.hpp b/src/io/dense_nbits_bin.hpp deleted file mode 100644 index 85ea5b311d2..00000000000 --- a/src/io/dense_nbits_bin.hpp +++ /dev/null @@ -1,405 +0,0 @@ -/*! - * Copyright (c) 2017 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. 
- */ -#ifndef LIGHTGBM_IO_DENSE_NBITS_BIN_HPP_ -#define LIGHTGBM_IO_DENSE_NBITS_BIN_HPP_ - -#include - -#include -#include -#include - -namespace LightGBM { - -class Dense4bitsBin; - -class Dense4bitsBinIterator : public BinIterator { - public: - explicit Dense4bitsBinIterator(const Dense4bitsBin* bin_data, uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) - : bin_data_(bin_data), min_bin_(static_cast(min_bin)), - max_bin_(static_cast(max_bin)), - default_bin_(static_cast(default_bin)) { - if (default_bin_ == 0) { - bias_ = 1; - } else { - bias_ = 0; - } - } - inline uint32_t RawGet(data_size_t idx) override; - inline uint32_t Get(data_size_t idx) override; - inline void Reset(data_size_t) override {} - - private: - const Dense4bitsBin* bin_data_; - uint8_t min_bin_; - uint8_t max_bin_; - uint8_t default_bin_; - uint8_t bias_; -}; - -class Dense4bitsBin : public Bin { - public: - friend Dense4bitsBinIterator; - Dense4bitsBin(data_size_t num_data) - : num_data_(num_data) { - int len = (num_data_ + 1) / 2; - data_ = std::vector(len, static_cast(0)); - buf_ = std::vector(len, static_cast(0)); - } - - ~Dense4bitsBin() { - } - - void Push(int, data_size_t idx, uint32_t value) override { - const int i1 = idx >> 1; - const int i2 = (idx & 1) << 2; - const uint8_t val = static_cast(value) << i2; - if (i2 == 0) { - data_[i1] = val; - } else { - buf_[i1] = val; - } - } - - void ReSize(data_size_t num_data) override { - if (num_data_ != num_data) { - num_data_ = num_data; - const int len = (num_data_ + 1) / 2; - data_.resize(len); - } - } - - inline BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const override; - - void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data, - const score_t* ordered_gradients, const score_t* ordered_hessians, - hist_t* out) const override { - const data_size_t rest = num_data & 0x3; - data_size_t i = 0; - for (; i < num_data - rest; i += 4) { - const data_size_t idx0 = data_indices[i]; - const auto bin0 = (data_[idx0 >> 1] >> ((idx0 & 1) << 2)) & 0xf; - - const data_size_t idx1 = data_indices[i + 1]; - const auto bin1 = (data_[idx1 >> 1] >> ((idx1 & 1) << 2)) & 0xf; - - const data_size_t idx2 = data_indices[i + 2]; - const auto bin2 = (data_[idx2 >> 1] >> ((idx2 & 1) << 2)) & 0xf; - - const data_size_t idx3 = data_indices[i + 3]; - const auto bin3 = (data_[idx3 >> 1] >> ((idx3 & 1) << 2)) & 0xf; - - out[bin0].sum_gradients += ordered_gradients[i]; - out[bin1].sum_gradients += ordered_gradients[i + 1]; - out[bin2].sum_gradients += ordered_gradients[i + 2]; - out[bin3].sum_gradients += ordered_gradients[i + 3]; - - out[bin0].sum_hessians += ordered_hessians[i]; - out[bin1].sum_hessians += ordered_hessians[i + 1]; - out[bin2].sum_hessians += ordered_hessians[i + 2]; - out[bin3].sum_hessians += ordered_hessians[i + 3]; - - ++out[bin0].cnt; - ++out[bin1].cnt; - ++out[bin2].cnt; - ++out[bin3].cnt; - } - - for (; i < num_data; ++i) { - const data_size_t idx = data_indices[i]; - const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; - out[bin].sum_gradients += ordered_gradients[i]; - out[bin].sum_hessians += ordered_hessians[i]; - ++out[bin].cnt; - } - } - - void ConstructHistogram(data_size_t num_data, - const score_t* ordered_gradients, const score_t* ordered_hessians, - hist_t* out) const override { - const data_size_t rest = num_data & 0x3; - data_size_t i = 0; - - for (; i < num_data - rest; i += 4) { - const auto bin0 = (data_[i >> 1]) & 0xf; - const auto bin1 = (data_[i >> 1] >> 4) & 0xf; - const 
auto bin2 = (data_[(i >> 1) + 1]) & 0xf; - const auto bin3 = (data_[(i >> 1) + 1] >> 4) & 0xf; - - out[bin0].sum_gradients += ordered_gradients[i]; - out[bin1].sum_gradients += ordered_gradients[i + 1]; - out[bin2].sum_gradients += ordered_gradients[i + 2]; - out[bin3].sum_gradients += ordered_gradients[i + 3]; - - out[bin0].sum_hessians += ordered_hessians[i]; - out[bin1].sum_hessians += ordered_hessians[i + 1]; - out[bin2].sum_hessians += ordered_hessians[i + 2]; - out[bin3].sum_hessians += ordered_hessians[i + 3]; - - ++out[bin0].cnt; - ++out[bin1].cnt; - ++out[bin2].cnt; - ++out[bin3].cnt; - } - for (; i < num_data; ++i) { - const auto bin = (data_[i >> 1] >> ((i & 1) << 2)) & 0xf; - out[bin].sum_gradients += ordered_gradients[i]; - out[bin].sum_hessians += ordered_hessians[i]; - ++out[bin].cnt; - } - } - - void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data, - const score_t* ordered_gradients, - hist_t* out) const override { - const data_size_t rest = num_data & 0x3; - data_size_t i = 0; - for (; i < num_data - rest; i += 4) { - const data_size_t idx0 = data_indices[i]; - const auto bin0 = (data_[idx0 >> 1] >> ((idx0 & 1) << 2)) & 0xf; - - const data_size_t idx1 = data_indices[i + 1]; - const auto bin1 = (data_[idx1 >> 1] >> ((idx1 & 1) << 2)) & 0xf; - - const data_size_t idx2 = data_indices[i + 2]; - const auto bin2 = (data_[idx2 >> 1] >> ((idx2 & 1) << 2)) & 0xf; - - const data_size_t idx3 = data_indices[i + 3]; - const auto bin3 = (data_[idx3 >> 1] >> ((idx3 & 1) << 2)) & 0xf; - - out[bin0].sum_gradients += ordered_gradients[i]; - out[bin1].sum_gradients += ordered_gradients[i + 1]; - out[bin2].sum_gradients += ordered_gradients[i + 2]; - out[bin3].sum_gradients += ordered_gradients[i + 3]; - - ++out[bin0].cnt; - ++out[bin1].cnt; - ++out[bin2].cnt; - ++out[bin3].cnt; - } - - for (; i < num_data; ++i) { - const data_size_t idx = data_indices[i]; - const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; - out[bin].sum_gradients += ordered_gradients[i]; - ++out[bin].cnt; - } - } - - void ConstructHistogram(data_size_t num_data, - const score_t* ordered_gradients, - hist_t* out) const override { - const data_size_t rest = num_data & 0x3; - data_size_t i = 0; - for (; i < num_data - rest; i += 4) { - const auto bin0 = (data_[i >> 1]) & 0xf; - const auto bin1 = (data_[i >> 1] >> 4) & 0xf; - const auto bin2 = (data_[(i >> 1) + 1]) & 0xf; - const auto bin3 = (data_[(i >> 1) + 1] >> 4) & 0xf; - - out[bin0].sum_gradients += ordered_gradients[i]; - out[bin1].sum_gradients += ordered_gradients[i + 1]; - out[bin2].sum_gradients += ordered_gradients[i + 2]; - out[bin3].sum_gradients += ordered_gradients[i + 3]; - - ++out[bin0].cnt; - ++out[bin1].cnt; - ++out[bin2].cnt; - ++out[bin3].cnt; - } - for (; i < num_data; ++i) { - const auto bin = (data_[i >> 1] >> ((i & 1) << 2)) & 0xf; - out[bin].sum_gradients += ordered_gradients[i]; - ++out[bin].cnt; - } - } - - virtual data_size_t Split( - uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, MissingType missing_type, bool default_left, - uint32_t threshold, data_size_t* data_indices, data_size_t num_data, - data_size_t* lte_indices, data_size_t* gt_indices) const override { - if (num_data <= 0) { return 0; } - uint8_t th = static_cast(threshold + min_bin); - const uint8_t minb = static_cast(min_bin); - const uint8_t maxb = static_cast(max_bin); - uint8_t t_default_bin = static_cast(min_bin + default_bin); - if (default_bin == 0) { - th -= 1; - t_default_bin -= 1; - } - data_size_t lte_count = 0; - data_size_t 
gt_count = 0; - data_size_t* default_indices = gt_indices; - data_size_t* default_count = >_count; - if (missing_type == MissingType::NaN) { - if (default_bin <= threshold) { - default_indices = lte_indices; - default_count = <e_count; - } - data_size_t* missing_default_indices = gt_indices; - data_size_t* missing_default_count = >_count; - if (default_left) { - missing_default_indices = lte_indices; - missing_default_count = <e_count; - } - for (data_size_t i = 0; i < num_data; ++i) { - const data_size_t idx = data_indices[i]; - const uint8_t bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; - if (bin < minb || bin > maxb || t_default_bin == bin) { - default_indices[(*default_count)++] = idx; - } else if (bin == maxb) { - missing_default_indices[(*missing_default_count)++] = idx; - } else if (bin > th) { - gt_indices[gt_count++] = idx; - } else { - lte_indices[lte_count++] = idx; - } - } - } else { - if ((default_left && missing_type == MissingType::Zero) || (default_bin <= threshold && missing_type != MissingType::Zero)) { - default_indices = lte_indices; - default_count = <e_count; - } - for (data_size_t i = 0; i < num_data; ++i) { - const data_size_t idx = data_indices[i]; - const uint8_t bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; - if (bin < minb || bin > maxb || t_default_bin == bin) { - default_indices[(*default_count)++] = idx; - } else if (bin > th) { - gt_indices[gt_count++] = idx; - } else { - lte_indices[lte_count++] = idx; - } - } - } - return lte_count; - } - - virtual data_size_t SplitCategorical( - uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, - const uint32_t* threshold, int num_threahold, data_size_t* data_indices, data_size_t num_data, - data_size_t* lte_indices, data_size_t* gt_indices) const override { - if (num_data <= 0) { return 0; } - data_size_t lte_count = 0; - data_size_t gt_count = 0; - data_size_t* default_indices = gt_indices; - data_size_t* default_count = >_count; - if (Common::FindInBitset(threshold, num_threahold, default_bin)) { - default_indices = lte_indices; - default_count = <e_count; - } - for (data_size_t i = 0; i < num_data; ++i) { - const data_size_t idx = data_indices[i]; - const uint32_t bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; - if (bin < min_bin || bin > max_bin) { - default_indices[(*default_count)++] = idx; - } else if (Common::FindInBitset(threshold, num_threahold, bin - min_bin)) { - lte_indices[lte_count++] = idx; - } else { - gt_indices[gt_count++] = idx; - } - } - return lte_count; - } - - data_size_t num_data() const override { return num_data_; } - - // LGBM_CUDA - void* get_data() override { return data_.data(); } - - /*! 
\brief not ordered bin for dense feature */ - Bin* CreateDenseBin() const { return nullptr; } - - void FinishLoad() override { - if (buf_.empty()) { return; } - int len = (num_data_ + 1) / 2; - for (int i = 0; i < len; ++i) { - data_[i] |= buf_[i]; - } - buf_.clear(); - } - - void LoadFromMemory(const void* memory, const std::vector& local_used_indices) override { - const uint8_t* mem_data = reinterpret_cast(memory); - if (!local_used_indices.empty()) { - const data_size_t rest = num_data_ & 1; - for (int i = 0; i < num_data_ - rest; i += 2) { - // get old bins - data_size_t idx = local_used_indices[i]; - const auto bin1 = static_cast((mem_data[idx >> 1] >> ((idx & 1) << 2)) & 0xf); - idx = local_used_indices[i + 1]; - const auto bin2 = static_cast((mem_data[idx >> 1] >> ((idx & 1) << 2)) & 0xf); - // add - const int i1 = i >> 1; - data_[i1] = (bin1 | (bin2 << 4)); - } - if (rest) { - data_size_t idx = local_used_indices[num_data_ - 1]; - data_[num_data_ >> 1] = (mem_data[idx >> 1] >> ((idx & 1) << 2)) & 0xf; - } - } else { - for (size_t i = 0; i < data_.size(); ++i) { - data_[i] = mem_data[i]; - } - } - } - - void CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override { - auto other_bin = dynamic_cast(full_bin); - const data_size_t rest = num_used_indices & 1; - for (int i = 0; i < num_used_indices - rest; i += 2) { - data_size_t idx = used_indices[i]; - const auto bin1 = static_cast((other_bin->data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf); - idx = used_indices[i + 1]; - const auto bin2 = static_cast((other_bin->data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf); - const int i1 = i >> 1; - data_[i1] = (bin1 | (bin2 << 4)); - } - if (rest) { - data_size_t idx = used_indices[num_used_indices - 1]; - data_[num_used_indices >> 1] = (other_bin->data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; - } - } - - void SaveBinaryToFile(const VirtualFileWriter* writer) const override { - writer->Write(data_.data(), sizeof(uint8_t) * data_.size()); - } - - size_t SizesInByte() const override { - return sizeof(uint8_t) * data_.size(); - } - - Dense4bitsBin* Clone() override { - return new Dense4bitsBin(*this); - } - - protected: - Dense4bitsBin(const Dense4bitsBin& other) - : num_data_(other.num_data_), data_(other.data_), buf_(other.buf_) {} - - data_size_t num_data_; - std::vector data_; - std::vector buf_; -}; - -uint32_t Dense4bitsBinIterator::Get(data_size_t idx) { - const auto bin = (bin_data_->data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; - if (bin >= min_bin_ && bin <= max_bin_) { - return bin - min_bin_ + bias_; - } else { - return default_bin_; - } -} - -uint32_t Dense4bitsBinIterator::RawGet(data_size_t idx) { - return (bin_data_->data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; -} - -inline BinIterator* Dense4bitsBin::GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const { - return new Dense4bitsBinIterator(this, min_bin, max_bin, default_bin); -} - -} // namespace LightGBM -#endif // LIGHTGBM_IO_DENSE_NBITS_BIN_HPP_ diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index d2064843ead..05c440a8e25 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -1,7 +1,6 @@ #ifdef USE_CUDA #include "cuda_tree_learner.h" #include "../io/dense_bin.hpp" -#include "../io/dense_nbits_bin.hpp" #include #include From 32825e525ca5758230275c8245b4abd9320dd89e Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Thu, 2 Apr 2020 13:44:01 +0000 Subject: [PATCH 014/119] Initial CUDA work 
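
Adds a small driver script, test_LGBM.232.sh, that runs each of the
Python test modules through "python -m unittest". It is presumably meant
to be run from the repository root after the package has been built and
installed (see python-package/setup.py), e.g.:

    ./test_LGBM.232.sh
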
--- test_LGBM.232.sh | 5 +++++ 1 file changed, 5 insertions(+) create mode 100755 test_LGBM.232.sh diff --git a/test_LGBM.232.sh b/test_LGBM.232.sh new file mode 100755 index 00000000000..cd5146f959f --- /dev/null +++ b/test_LGBM.232.sh @@ -0,0 +1,5 @@ +python -m unittest tests/python_package_test/test_basic.py +python -m unittest tests/python_package_test/test_consistency.py +python -m unittest tests/python_package_test/test_engine.py +python -m unittest tests/python_package_test/test_plotting.py +python -m unittest tests/python_package_test/test_sklearn.py From 169a7341147de02d1e854e4b00c6a6134a88808a Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Thu, 2 Apr 2020 18:32:22 +0000 Subject: [PATCH 015/119] Initial CUDA work --- include/LightGBM/cuda/cuda_utils.h | 16 +++------------- include/LightGBM/cuda/vector_cudahost.h | 16 +++------------- 2 files changed, 6 insertions(+), 26 deletions(-) diff --git a/include/LightGBM/cuda/cuda_utils.h b/include/LightGBM/cuda/cuda_utils.h index 6d9407613f6..e57d3746a21 100644 --- a/include/LightGBM/cuda/cuda_utils.h +++ b/include/LightGBM/cuda/cuda_utils.h @@ -1,17 +1,7 @@ -/* - * ibmGBT: IBM CUDA Accelerated LightGBM - * - * IBM Confidential - * (C) Copyright IBM Corp. 2019. All Rights Reserved. - * - * The source code for this program is not published or otherwise - * divested of its trade secrets, irrespective of what has been - * deposited with the U.S. Copyright Office. - * - * US Government Users Restricted Rights - Use, duplication or - * disclosure restricted by GSA ADP Schedule Contract with IBM Corp. +/*! + * Copyright (c) 2020 IBM Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. */ - #ifndef LGBM_CUDA_UTILS_H #define LGBM_CUDA_UTILS_H diff --git a/include/LightGBM/cuda/vector_cudahost.h b/include/LightGBM/cuda/vector_cudahost.h index b1a235e8a22..5159a01f030 100644 --- a/include/LightGBM/cuda/vector_cudahost.h +++ b/include/LightGBM/cuda/vector_cudahost.h @@ -1,17 +1,7 @@ -/* - * ibmGBT: IBM CUDA Accelerated LightGBM - * - * IBM Confidential - * (C) Copyright IBM Corp. 2019. All Rights Reserved. - * - * The source code for this program is not published or otherwise - * divested of its trade secrets, irrespective of what has been - * deposited with the U.S. Copyright Office. - * - * US Government Users Restricted Rights - Use, duplication or - * disclosure restricted by GSA ADP Schedule Contract with IBM Corp. +/*! + * Copyright (c) 2020 IBM Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. 
*/ - #ifndef LGBM_CUDA_VECTOR_CH_H #define LGBM_CUDA_VECTOR_CH_H From 3c83274fdcf80c48a579fee868112049503e8e82 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Fri, 3 Apr 2020 17:08:49 +0000 Subject: [PATCH 016/119] Initial CUDA work --- src/treelearner/cuda_tree_learner.cpp | 73 ++++++++++++++------------- 1 file changed, 39 insertions(+), 34 deletions(-) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 05c440a8e25..df8cefd5cea 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -81,6 +81,7 @@ void CUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, } // some functions used for debugging the GPU histogram construction +#if GPU_DEBUG > 0 void PrintHistograms(hist_t* h, size_t size) { size_t total = 0; @@ -103,7 +104,7 @@ union Float_t }; -void CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id) { +void CompareHistograms(HistogramBinEntry* h1, HistogramBinEntry* h2, size_t size, int feature_id) { size_t i; Float_t a, b; @@ -137,6 +138,8 @@ void CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id) { Log::Warning("Mismatched histograms found for feature %d at location %lu.", feature_id, i); } +#endif + int CUDATreeLearner::GetNumWorkgroupsPerFeature(data_size_t leaf_num_data) { // we roughly want 256 workgroups per device, and we have num_dense_feature4_ feature tuples. @@ -230,6 +233,7 @@ void CUDATreeLearner::WaitAndGetHistograms(hist_t* histograms) { // when the output is ready, the computation is done CUDASUCCESS_OR_FATAL(cudaEventSynchronize(histograms_wait_obj_[device_id])); + // LGBM_CUDA } #pragma omp parallel for schedule(static) @@ -242,12 +246,10 @@ void CUDATreeLearner::WaitAndGetHistograms(hist_t* histograms) { int bin_size = train_data_->FeatureGroupNumBin(dense_group_index); for (int j = 0; j < bin_size; ++j) { - old_histogram_array[j].sum_gradients = hist_outputs[i * device_bin_size_+ j].sum_gradients; - old_histogram_array[j].sum_hessians = hist_outputs[i * device_bin_size_ + j].sum_hessians; - old_histogram_array[j].cnt = (data_size_t)hist_outputs[i * device_bin_size_ + j].cnt; + GET_GRAD(old_histogram_array, j) = GET_GRAD(hist_outputs, i * device_bin_size_+ j); + GET_HESS(old_histogram_array, j) = GET_HESS(hist_outputs, i * device_bin_size_+ j); } } - } // LGBM_CUDA @@ -256,7 +258,7 @@ void CUDATreeLearner::CountDenseFeatureGroups() { num_dense_feature_groups_ = 0; for (int i = 0; i < num_feature_groups_; ++i) { - if (ordered_bins_[i] == nullptr) { + if (!train_data_->IsMultiGroup(i)) { num_dense_feature_groups_++; } } @@ -435,7 +437,7 @@ void CUDATreeLearner::copyDenseFeature() { for (int i = 0; i < num_feature_groups_; ++i) { // looking for dword_features_ non-sparse feature-groups - if (ordered_bins_[i] == nullptr) { + if (!train_data_->IsMultiGroup(i)) { dense_feature_group_map_.push_back(i); auto sizes_in_byte = train_data_->FeatureGroupSizesInByte(i); void* tmp_data = train_data_->FeatureGroupData(i); @@ -548,6 +550,7 @@ void CUDATreeLearner::InitGPU(int num_gpu) { // for debuging kernel_time_.resize(num_gpu_, 0); kernel_input_wait_time_.resize(num_gpu_, std::chrono::milliseconds(0)); + //kernel_output_wait_time_.resize(num_gpu_, std::chrono::milliseconds(0)); for(int i = 0; i < num_gpu_; ++i) { CUDASUCCESS_OR_FATAL(cudaSetDevice(i)); @@ -865,7 +868,7 @@ bool CUDATreeLearner::ConstructGPUHistogramsAsync( void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_used, bool use_subtract) { - //LGBM_CUDA 
+ // LGBM_CUDA auto start_time = std::chrono::steady_clock::now(); std::vector is_sparse_feature_used(num_features_, 0); @@ -875,9 +878,8 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ #pragma omp parallel for schedule(static) for (int feature_index = 0; feature_index < num_features_; ++feature_index) { - if (!is_feature_used_[feature_index]) continue; if (!is_feature_used[feature_index]) continue; - if (ordered_bins_[train_data_->Feature2Group(feature_index)]) { + if (train_data_->IsMultiGroup(train_data_->Feature2Group(feature_index))) { is_sparse_feature_used[feature_index] = 1; num_sparse_features++; } @@ -905,12 +907,19 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ // then construct sparse features on CPU // We set data_indices to null to avoid rebuilding ordered gradients/hessians if (num_sparse_features > 0){ +// train_data_->ConstructHistograms(is_sparse_feature_used, +// nullptr, smaller_leaf_splits_->num_data_in_leaf(), +// smaller_leaf_splits_->leaf_index(), +// ordered_bins_, gradients_, hessians_, +// ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_, +// ptr_smaller_leaf_hist_data); train_data_->ConstructHistograms(is_sparse_feature_used, - nullptr, smaller_leaf_splits_->num_data_in_leaf(), - smaller_leaf_splits_->LeafIndex(), - ordered_bins_, gradients_, hessians_, - ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_, + smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(), + gradients_, hessians_, + ordered_gradients_.data(), ordered_hessians_.data(), + share_state_.get(), ptr_smaller_leaf_hist_data); + } // wait for GPU to finish, only if GPU is actually used @@ -935,13 +944,13 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ continue; int dense_feature_group_index = dense_feature_group_map_[i]; size_t size = train_data_->FeatureGroupNumBin(dense_feature_group_index); - hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - 1; - hist_t* current_histogram = ptr_smaller_leaf_hist_data + train_data_->GroupBinBoundary(dense_feature_group_index); - hist_t* gpu_histogram = new hist_t[size]; + HistogramBinEntry* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - 1; + HistogramBinEntry* current_histogram = ptr_smaller_leaf_hist_data + train_data_->GroupBinBoundary(dense_feature_group_index); + HistogramBinEntry* gpu_histogram = new HistogramBinEntry[size]; data_size_t num_data = smaller_leaf_splits_->num_data_in_leaf(); std::copy(current_histogram, current_histogram + size, gpu_histogram); - std::memset(current_histogram, 0, train_data_->FeatureGroupNumBin(dense_feature_group_index) * sizeof(hist_t)); + std::memset(current_histogram, 0, train_data_->FeatureGroupNumBin(dense_feature_group_index) * sizeof(HistogramBinEntry)); if ( num_data == num_data_ ) { if ( is_constant_hessian_ ) { printf("ConstructHistogram(): num_data == num_data_ is_constant_hessian_"); @@ -999,12 +1008,18 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ // We set data_indices to null to avoid rebuilding ordered gradients/hessians if (num_sparse_features > 0){ + //train_data_->ConstructHistograms(is_sparse_feature_used, + // nullptr, larger_leaf_splits_->num_data_in_leaf(), + // larger_leaf_splits_->leaf_index(), + // ordered_bins_, gradients_, hessians_, + // ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_, + // ptr_larger_leaf_hist_data); 
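+    // larger leaf: histograms for the sparse feature groups are still built on the CPU here, while the GPU kernel launched earlier works on the dense groups asynchronously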
train_data_->ConstructHistograms(is_sparse_feature_used, - nullptr, larger_leaf_splits_->num_data_in_leaf(), - larger_leaf_splits_->LeafIndex(), - ordered_bins_, gradients_, hessians_, - ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_, - ptr_larger_leaf_hist_data); + smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(), + gradients_, hessians_, + ordered_gradients_.data(), ordered_hessians_.data(), + share_state_.get(), + ptr_smaller_leaf_hist_data); } // wait for GPU to finish, only if GPU is actually used @@ -1037,7 +1052,7 @@ void CUDATreeLearner::FindBestSplits() { size_t bin_size = train_data_->FeatureNumBin(feature_index) + 1; printf("CUDATreeLearner::FindBestSplits() Feature %d bin_size=%d smaller leaf:\n", feature_index, bin_size); PrintHistograms(smaller_leaf_histogram_array_[feature_index].RawData() - 1, bin_size); - if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->LeafIndex() < 0) { continue; } + if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->leaf_index() < 0) { continue; } printf("CUDATreeLearner::FindBestSplits() Feature %d bin_size=%d larger leaf:\n", feature_index, bin_size); PrintHistograms(larger_leaf_histogram_array_[feature_index].RawData() - 1, bin_size); @@ -1047,13 +1062,10 @@ void CUDATreeLearner::FindBestSplits() { void CUDATreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) { const SplitInfo& best_split_info = best_split_per_leaf_[best_Leaf]; - #if GPU_DEBUG >= 2 printf("Splitting leaf %d with feature %d thresh %d gain %f stat %f %f %f %f\n", best_Leaf, best_split_info.feature, best_split_info.threshold, best_split_info.gain, best_split_info.left_sum_gradient, best_split_info.right_sum_gradient, best_split_info.left_sum_hessian, best_split_info.right_sum_hessian); #endif - SerialTreeLearner::Split(tree, best_Leaf, left_leaf, right_leaf); - if (Network::num_machines() == 1) { // do some sanity check for the GPU algorithm if (best_split_info.left_count < best_split_info.right_count) { @@ -1062,21 +1074,14 @@ void CUDATreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* righ Log::Fatal("1 Bug in GPU histogram! split %d: %d, smaller_leaf: %d, larger_leaf: %d\n", best_split_info.left_count, best_split_info.right_count, smaller_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf()); } } else { - double smaller_min = smaller_leaf_splits_->min_constraint(); - double smaller_max = smaller_leaf_splits_->max_constraint(); - double larger_min = larger_leaf_splits_->min_constraint(); - double larger_max = larger_leaf_splits_->max_constraint(); smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), best_split_info.right_sum_gradient, best_split_info.right_sum_hessian); larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, best_split_info.left_sum_hessian); - smaller_leaf_splits_->SetValueConstraint(smaller_min, smaller_max); - larger_leaf_splits_->SetValueConstraint(larger_min, larger_max); if ((best_split_info.left_count != larger_leaf_splits_->num_data_in_leaf()) || (best_split_info.right_count!= smaller_leaf_splits_->num_data_in_leaf())) { Log::Fatal("2 Bug in GPU histogram! 
split %d: %d, smaller_leaf: %d, larger_leaf: %d\n", best_split_info.left_count, best_split_info.right_count, smaller_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf()); } } } - } } // namespace LightGBM From 5b3f36acf6a7bbb7deb3fe14776ae3d12d7f90a0 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Mon, 6 Apr 2020 02:11:44 +0000 Subject: [PATCH 017/119] Initial CUDA work --- src/boosting/gbdt.cpp | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 580c52fd889..06af0fcb1f1 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -93,20 +93,6 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective is_constant_hessian_ = GetIsConstHessian(objective_function); - tree_learner_ = std::unique_ptr(TreeLearner::CreateTreeLearner(config_->tree_learner, config_->device_type, config_.get())); - - // init tree learner - // LGBM_CUDA do not copy feature is is_use_subset for initialization - // LGBM_CUDA initialize training data info with bagging data size (tmp_subset_) - - if (config_->device_type == std::string("cuda")) { - tree_learner_->Init(tmp_subset_.get(), is_constant_hessian_, is_use_subset_); - } else { - tree_learner_->Init(train_data_, is_constant_hessian_, is_use_subset_); - } - - tree_learner_->SetForcedSplit(&forced_splits_json_); - // push training metrics training_metrics_.clear(); for (const auto& metric : training_metrics) { @@ -132,24 +118,31 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective feature_infos_ = train_data_->feature_infos(); monotone_constraints_ = config->monotone_constraints; - // if need bagging, create buffer - // LGBM_CUDA: for CUDA implementation, this function sets up the is_use_subset_ flag as TRUE and tmp_subset_ is allocated. - ResetBaggingConfig(config_.get(), true); - // LGBM_CUDA // Two key changes: position of the initializer is moved from the original code, and init() uses is_use_subset_ flag tree_learner_ = std::unique_ptr(TreeLearner::CreateTreeLearner(config_->tree_learner, config_->device_type, config_.get())); + // if need bagging, create buffer + // LGBM_CUDA: for CUDA implementation, this function sets up the is_use_subset_ flag as TRUE and tmp_subset_ is allocated. 
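+  // (Below, the CUDA path hands tmp_subset_ to the tree learner only when is_use_subset_ ends up TRUE; otherwise it falls back to the full train_data_, like the CPU path.)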
+ ResetBaggingConfig(config_.get(), true); + // init tree learner // LGBM_CUDA do not copy feature is is_use_subset for initialization // LGBM_CUDA initialize training data info with bagging data size (tmp_subset_) if (config_->device_type == std::string("cuda")) { - tree_learner_->Init(tmp_subset_.get(), is_constant_hessian_, is_use_subset_); + if (is_use_subset_) { + tree_learner_->Init(tmp_subset_.get(), is_constant_hessian_, is_use_subset_); + } + else { + tree_learner_->Init(train_data_, is_constant_hessian_, is_use_subset_); + } } else { tree_learner_->Init(train_data_, is_constant_hessian_, is_use_subset_); } + tree_learner_->SetForcedSplit(&forced_splits_json_); + class_need_train_ = std::vector(num_tree_per_iteration_, true); if (objective_function_ != nullptr && objective_function_->SkipEmptyClass()) { CHECK_EQ(num_tree_per_iteration_, num_class_); From 91a312fab265253e198dd7d9f90ecc48d52ca439 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Tue, 7 Apr 2020 14:52:32 +0000 Subject: [PATCH 018/119] Initial CUDA work --- tests/python_package_test/test_engine.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 37894815f4a..51ab32a239b 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1138,8 +1138,6 @@ def is_correctly_constrained(learner, x3_to_category=True): "monotone_constraints_method": monotone_constraints_method, "use_missing": False, } - if lgb.get_device_type() == 2: - params["device"] = "cuda" constrained_model = lgb.train(params, trainset) self.assertTrue(is_correctly_constrained(constrained_model, test_with_categorical_variable)) From b05afeb5754c54217722477c57eea052371bef56 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Wed, 15 Apr 2020 13:56:31 +0000 Subject: [PATCH 019/119] Initial CUDA work --- include/LightGBM/cuda/vector_cudahost.h | 1 + src/boosting/gbdt.cpp | 8 +++++++- src/io/config.cpp | 2 +- src/treelearner/cuda_tree_learner.cpp | 25 +++++++++++-------------- 4 files changed, 20 insertions(+), 16 deletions(-) diff --git a/include/LightGBM/cuda/vector_cudahost.h b/include/LightGBM/cuda/vector_cudahost.h index 5159a01f030..d95a353c246 100644 --- a/include/LightGBM/cuda/vector_cudahost.h +++ b/include/LightGBM/cuda/vector_cudahost.h @@ -43,6 +43,7 @@ struct CHAllocator { if (LightGBM::LGBM_config_::current_device == lgbm_device_cuda){ cudaError_t ret= cudaHostAlloc(&ptr, n*sizeof(T), cudaHostAllocPortable); if (ret != cudaSuccess){ +fprintf(stderr, " TROUBLE: defaulting to malloc in CHAllocator!!!\n"); fflush(stderr); ptr = (T*) malloc(n*sizeof(T)); } } diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 06af0fcb1f1..02f659e9d83 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -425,6 +425,12 @@ bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { auto hess = hessians + offset; // LGBM_CUDA + if (((tmp_gradients_.data() == 0) || (tmp_hessians_.data() == 0)) && (config_->device_type == std::string("cuda"))) { + size_t bag_gh_size = static_cast(bag_data_cnt_) * num_tree_per_iteration_; + tmp_gradients_.resize(bag_gh_size); + tmp_hessians_.resize(bag_gh_size); + } + auto tmp_grad = tmp_gradients_.data(); auto tmp_hess = tmp_hessians_.data(); @@ -436,7 +442,7 @@ bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { tmp_grad[i] = grad[bag_data_indices_[i]]; // LGBM_CUDA tmp_hess[i] = hess[bag_data_indices_[i]]; // LGBM_CUDA } - } + } // 
LGBM_CUDA new_tree.reset(tree_learner_->Train(tmp_grad, tmp_hess, is_constant_hessian_, forced_splits_json_)); diff --git a/src/io/config.cpp b/src/io/config.cpp index 5d2faba6133..963ef084578 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -322,7 +322,7 @@ void Config::CheckParamConflict() { } } // force col-wise for gpu - if (device_type == std::string("gpu")) { // GCF maybe need to add some cuda here? + if (device_type == std::string("gpu")) { force_col_wise = true; force_row_wise = false; } diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index df8cefd5cea..6ec4bf8b359 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -63,7 +63,6 @@ CUDATreeLearner::~CUDATreeLearner() { void CUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) { - // initialize SerialTreeLearner SerialTreeLearner::Init(train_data, is_constant_hessian, is_use_subset); @@ -212,13 +211,9 @@ void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featu size_t output_size = num_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_; size_t host_output_offset = offset_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_; - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync((char*)host_histogram_outputs_ + host_output_offset, device_histogram_outputs_[device_id], output_size, cudaMemcpyDeviceToHost, stream_[device_id])); - - CUDASUCCESS_OR_FATAL(cudaEventRecord(histograms_wait_obj_[device_id], stream_[device_id])); } - } @@ -337,7 +332,6 @@ void CUDATreeLearner::prevAllocateGPUMemory() { // LGBM_CUDA: allocate GPU memory for each GPU void CUDATreeLearner::AllocateGPUMemory() { - #pragma omp parallel for schedule(static, num_gpu_) for(int device_id = 0; device_id < num_gpu_; ++device_id) { @@ -400,7 +394,7 @@ void CUDATreeLearner::AllocateGPUMemory() { // create atomic counters for inter-group coordination CUDASUCCESS_OR_FATAL(cudaFree(sync_counters_[device_id])); CUDASUCCESS_OR_FATAL(cudaMalloc(&(sync_counters_[device_id]), (size_t) num_gpu_feature_groups * sizeof(int))); - CUDASUCCESS_OR_FATAL(cudaMemsetAsync(sync_counters_[device_id], 0, num_gpu_feature_groups * sizeof(int))); + CUDASUCCESS_OR_FATAL(cudaMemsetAsync(sync_counters_[device_id], 0, num_gpu_feature_groups * sizeof(int), stream_[device_id])); // The output buffer is allocated to host directly, to overlap compute and data transfer CUDASUCCESS_OR_FATAL(cudaFree(device_histogram_outputs_[device_id])); @@ -452,7 +446,7 @@ void CUDATreeLearner::copyDenseFeature() { copied_feature = 0; if(device_id < num_gpu_) { device_features = device_features_[device_id]; - //CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); + CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); } } } @@ -726,18 +720,22 @@ void CUDATreeLearner::BeforeTrain() { if (!is_constant_hessian_) { - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_hessians_[device_id], hessians_, num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); +void *foo = malloc(num_data_ * sizeof(score_t)); +memcpy(foo, &(hessians_[0]), num_data_ * sizeof(score_t)); + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_hessians_[device_id], foo, num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); +free(foo); CUDASUCCESS_OR_FATAL(cudaEventRecord(hessians_future_[device_id], stream_[device_id])); } - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_gradients_[device_id], gradients_, num_data_ * 
sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); +void *foo = malloc(num_data_ * sizeof(score_t)); +memcpy(foo, &(gradients_[0]), num_data_ * sizeof(score_t)); + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_gradients_[device_id], foo, num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); +free(foo); CUDASUCCESS_OR_FATAL(cudaEventRecord(gradients_future_[device_id], stream_[device_id])); } - } } - } bool CUDATreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) { @@ -855,8 +853,7 @@ bool CUDATreeLearner::ConstructGPUHistogramsAsync( //#pragma omp parallel for schedule(static, num_gpu_) for(int device_id = 0; device_id < num_gpu_; ++device_id) { int offset = offset_gpu_feature_groups_[device_id]; - CUDASUCCESS_OR_FATAL(cudaMemcpy(device_feature_masks_[device_id], ptr_pinned_feature_masks_ + offset, num_gpu_feature_groups_[device_id] , cudaMemcpyHostToDevice)); - //CUDASUCCESS_OR_FATAL(cudaMemcpy(device_feature_masks_[device_id], ptr_pinned_feature_masks_ + offset, num_gpu_feature_groups_[device_id] , cudaMemcpyHostToDevice, stream_[device_id])); + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_feature_masks_[device_id], ptr_pinned_feature_masks_ + offset, num_gpu_feature_groups_[device_id] , cudaMemcpyHostToDevice, stream_[device_id])); } // All data have been prepared, now run the GPU kernel From 013631787a246e8177e21f3279ab0c88609c1f39 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Thu, 16 Apr 2020 19:40:26 +0000 Subject: [PATCH 020/119] Initial CUDA work --- src/boosting/gbdt.cpp | 18 +++++++++--------- src/treelearner/cuda_tree_learner.cpp | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 02f659e9d83..b2fb636ac4e 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -270,9 +270,9 @@ void GBDT::Bagging(int iter) { bool resized= tmp_subset_->ReSize(bag_data_cnt_); if (resized && (config_->device_type == std::string("cuda"))) { - size_t bag_gh_size = static_cast(bag_data_cnt_) * num_tree_per_iteration_; - tmp_gradients_.resize(bag_gh_size); - tmp_hessians_.resize(bag_gh_size); + size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; + tmp_gradients_.resize(total_size); + tmp_hessians_.resize(total_size); } tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(), bag_data_cnt_, false); @@ -426,9 +426,9 @@ bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { // LGBM_CUDA if (((tmp_gradients_.data() == 0) || (tmp_hessians_.data() == 0)) && (config_->device_type == std::string("cuda"))) { - size_t bag_gh_size = static_cast(bag_data_cnt_) * num_tree_per_iteration_; - tmp_gradients_.resize(bag_gh_size); - tmp_hessians_.resize(bag_gh_size); + size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; + tmp_gradients_.resize(total_size); + tmp_hessians_.resize(total_size); } auto tmp_grad = tmp_gradients_.data(); @@ -971,9 +971,9 @@ void GBDT::ResetBaggingConfig(const Config* config, bool is_change_dataset) { if (tmp_subset_ == nullptr){ tmp_subset_.reset(new Dataset(bag_data_cnt_)); tmp_subset_->CopyFeatureMapperFrom(train_data_); - size_t bag_gh_size = static_cast(bag_data_cnt_) * num_tree_per_iteration_; - tmp_gradients_.resize(bag_gh_size); - tmp_hessians_.resize(bag_gh_size); + size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; + tmp_gradients_.resize(total_size); + tmp_hessians_.resize(total_size); is_use_subset_ = false; bag_data_indices_.clear(); } diff --git 
a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 6ec4bf8b359..f2ca4717598 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -317,7 +317,7 @@ void CUDATreeLearner::prevAllocateGPUMemory() { memset(ptr_pinned_feature_masks_, 0, num_dense_feature_groups_); // histogram bin entry size depends on the precision (single/double) - hist_bin_entry_sz_ = config_->gpu_use_dp ? sizeof(hist_t) : sizeof(gpu_hist_t); + hist_bin_entry_sz_ = 2 * (config_->gpu_use_dp ? sizeof(hist_t) : sizeof(gpu_hist_t)); // two elements in this "size" // host_size histogram outputs // host_histogram_outputs_ = malloc(num_dense_feature_groups_ * device_bin_size_ * hist_bin_entry_sz_); From f4b10571bb539103e0d9711c918c053af0ec211f Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Mon, 20 Apr 2020 14:59:54 +0000 Subject: [PATCH 021/119] Initial CUDA work --- src/treelearner/cuda_tree_learner.cpp | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index f2ca4717598..e789462fe13 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -720,18 +720,12 @@ void CUDATreeLearner::BeforeTrain() { if (!is_constant_hessian_) { -void *foo = malloc(num_data_ * sizeof(score_t)); -memcpy(foo, &(hessians_[0]), num_data_ * sizeof(score_t)); - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_hessians_[device_id], foo, num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); -free(foo); + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_hessians_[device_id], (void *) &(hessians_[0]), num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); CUDASUCCESS_OR_FATAL(cudaEventRecord(hessians_future_[device_id], stream_[device_id])); } -void *foo = malloc(num_data_ * sizeof(score_t)); -memcpy(foo, &(gradients_[0]), num_data_ * sizeof(score_t)); - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_gradients_[device_id], foo, num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); -free(foo); + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_gradients_[device_id], (void *) &(gradients_[0]), num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); CUDASUCCESS_OR_FATAL(cudaEventRecord(gradients_future_[device_id], stream_[device_id])); } } From 82e496877456ca2a57d5592448d96e3952b32c2b Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Mon, 20 Apr 2020 17:27:15 +0000 Subject: [PATCH 022/119] Initial CUDA work --- CMakeLists.txt | 2 +- src/treelearner/cuda_kernel_launcher.h | 2 +- .../kernels/{histogram256.cu => histogram_16_64_256.cu} | 2 +- .../kernels/{histogram256.hu => histogram_16_64_256.hu} | 0 4 files changed, 3 insertions(+), 3 deletions(-) rename src/treelearner/kernels/{histogram256.cu => histogram_16_64_256.cu} (99%) rename src/treelearner/kernels/{histogram256.hu => histogram_16_64_256.hu} (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index dc87f65bd67..7bcd068f3ca 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -194,7 +194,7 @@ if(USE_CUDA) endfunction() #foreach (hsize 16 64 256) - foreach (hsize 256) + foreach (hsize _16_64_256) add_histogram("${hsize}" "_sp_const" "True" "1" "${BASE_DEFINES}") add_histogram("${hsize}" "_sp" "True" "0" "${BASE_DEFINES}") add_histogram("${hsize}" "-allfeats_sp_const" "False" "1" "${ALLFEATS_DEFINES}") diff --git a/src/treelearner/cuda_kernel_launcher.h b/src/treelearner/cuda_kernel_launcher.h index ae7d3498e83..f63cfa97a08 
100644 --- a/src/treelearner/cuda_kernel_launcher.h +++ b/src/treelearner/cuda_kernel_launcher.h @@ -3,7 +3,7 @@ #ifdef USE_CUDA // what should I include?? -#include "kernels/histogram256.hu" // kernel, acc_type, data_size_t, uchar, score_t +#include "kernels/histogram_16_64_256.hu" // kernel, acc_type, data_size_t, uchar, score_t #include struct ThreadData { diff --git a/src/treelearner/kernels/histogram256.cu b/src/treelearner/kernels/histogram_16_64_256.cu similarity index 99% rename from src/treelearner/kernels/histogram256.cu rename to src/treelearner/kernels/histogram_16_64_256.cu index 5d659f8e2cf..08195d855de 100644 --- a/src/treelearner/kernels/histogram256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -12,7 +12,7 @@ * disclosure restricted by GSA ADP Schedule Contract with IBM Corp. */ -#include "histogram256.hu" +#include "histogram_16_64_256.hu" #include "stdio.h" #define PRINT(b,t,fmt,...) \ diff --git a/src/treelearner/kernels/histogram256.hu b/src/treelearner/kernels/histogram_16_64_256.hu similarity index 100% rename from src/treelearner/kernels/histogram256.hu rename to src/treelearner/kernels/histogram_16_64_256.hu From 512a0a3600e81991a2b19db759bcb8964748100c Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Tue, 21 Apr 2020 14:06:54 +0000 Subject: [PATCH 023/119] Initial CUDA work --- src/treelearner/cuda_kernel_launcher.cu | 292 ++++---- src/treelearner/cuda_kernel_launcher.h | 2 + src/treelearner/cuda_tree_learner.cpp | 9 +- src/treelearner/cuda_tree_learner.h | 4 +- .../kernels/histogram_16_64_256.cu | 707 +++++++++++++++++- .../kernels/histogram_16_64_256.hu | 21 +- 6 files changed, 877 insertions(+), 158 deletions(-) diff --git a/src/treelearner/cuda_kernel_launcher.cu b/src/treelearner/cuda_kernel_launcher.cu index d084abe4f23..4906ca7e02d 100644 --- a/src/treelearner/cuda_kernel_launcher.cu +++ b/src/treelearner/cuda_kernel_launcher.cu @@ -1,13 +1,14 @@ -#ifdef USE_CUDA - -#include "cuda_kernel_launcher.h" -#include -#include -#include - -using namespace LightGBM; - -void cuda_histogram( + #ifdef USE_CUDA + + #include "cuda_kernel_launcher.h" + #include + #include + #include + + using namespace LightGBM; + + void cuda_histogram( + int histogram_size, data_size_t leaf_num_data, data_size_t num_data, bool use_all_features, @@ -26,141 +27,148 @@ void cuda_histogram( volatile int* arg8, void* arg9, size_t exp_workgroups_per_feature) { - - - if (leaf_num_data == num_data) { - - if (use_all_features){ - if (!is_constant_hessian) { - histogram256<<>>( - arg0, - arg1, - arg2, - reinterpret_cast(arg3), - arg4, - arg5, - static_cast(arg6), - arg7, - arg8, - static_cast(arg9), - exp_workgroups_per_feature - ); - } - else { - histogram256<<>>( - arg0, - arg1, - arg2, - reinterpret_cast(arg3), - arg4, - arg5, - arg6_const, - arg7, - arg8, - static_cast(arg9), - exp_workgroups_per_feature - ); - } + + if (histogram_size == 16) { + if (leaf_num_data == num_data) { + if (use_all_features) { + if (!is_constant_hessian) + histogram16<<>>( arg0, arg1, arg2, + reinterpret_cast(arg3), arg4, arg5, + static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + else + histogram16<<>>( arg0, arg1, arg2, + reinterpret_cast(arg3), arg4, arg5, + arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + } + else { + if (!is_constant_hessian) + histogram16_fulldata<<>>( arg0, arg1, arg2, + reinterpret_cast(arg3), arg4, arg5, + static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + else + histogram16_fulldata<<>>( 
arg0, arg1, arg2, + reinterpret_cast(arg3), arg4, arg5, + arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + } + } + else { + if (use_all_features) { + // seems all features is always enabled, so this should be the same as fulldata + if (!is_constant_hessian) + histogram16<<>>( arg0, arg1, arg2, + reinterpret_cast(arg3), arg4, arg5, + static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + else + histogram16<<>>( arg0, arg1, arg2, + reinterpret_cast(arg3), arg4, arg5, + arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + } + else { + if (!is_constant_hessian) + histogram16<<>>( arg0, arg1, arg2, + reinterpret_cast(arg3), arg4, arg5, + static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + else + histogram16<<>>( arg0, arg1, arg2, + reinterpret_cast(arg3), arg4, arg5, + arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + } + } } - else{ - if (!is_constant_hessian) { - histogram256_fulldata<<>>( - arg0, - arg1, - arg2, - reinterpret_cast(arg3), - arg4, - arg5, - static_cast(arg6), - arg7, - arg8, - static_cast(arg9), - exp_workgroups_per_feature); - } - else { - histogram256_fulldata<<>>( - arg0, - arg1, - arg2, - reinterpret_cast(arg3), - arg4, - arg5, - arg6_const, - arg7, - arg8, - static_cast(arg9), - exp_workgroups_per_feature); - } - } - } - else { - if (use_all_features) { - // seems all features is always enabled, so this should be the same as fulldata - if (!is_constant_hessian) { - - histogram256<<>>( - arg0, - arg1, - arg2, - reinterpret_cast(arg3), - arg4, - arg5, - static_cast(arg6), - arg7, - arg8, - static_cast(arg9), - exp_workgroups_per_feature - ); - } - else { - histogram256<<>>( - arg0, - arg1, - arg2, - reinterpret_cast(arg3), - arg4, - arg5, - arg6_const, - arg7, - arg8, - static_cast(arg9), - exp_workgroups_per_feature - ); - } + else if (histogram_size == 64) { + if (leaf_num_data == num_data) { + if (use_all_features) { + if (!is_constant_hessian) + histogram64<<>>( arg0, arg1, arg2, + reinterpret_cast(arg3), arg4, arg5, + static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + else + histogram64<<>>( arg0, arg1, arg2, + reinterpret_cast(arg3), arg4, arg5, + arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + } + else { + if (!is_constant_hessian) + histogram64_fulldata<<>>( arg0, arg1, arg2, + reinterpret_cast(arg3), arg4, arg5, + static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + else + histogram64_fulldata<<>>( arg0, arg1, arg2, + reinterpret_cast(arg3), arg4, arg5, + arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + } + } + else { + if (use_all_features) { + // seems all features is always enabled, so this should be the same as fulldata + if (!is_constant_hessian) + histogram64<<>>( arg0, arg1, arg2, + reinterpret_cast(arg3), arg4, arg5, + static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + else + histogram64<<>>( arg0, arg1, arg2, + reinterpret_cast(arg3), arg4, arg5, + arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + } + else { + if (!is_constant_hessian) + histogram64<<>>( arg0, arg1, arg2, + reinterpret_cast(arg3), arg4, arg5, + static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + else + histogram64<<>>( arg0, arg1, arg2, + reinterpret_cast(arg3), arg4, arg5, + arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + } + } } else { - if 
(!is_constant_hessian) { - histogram256<<>>( - arg0, - arg1, - arg2, - reinterpret_cast(arg3), - arg4, - arg5, - static_cast(arg6), - arg7, - arg8, - static_cast(arg9), - exp_workgroups_per_feature - ); - } - else { - histogram256<<>>( - arg0, - arg1, - arg2, - reinterpret_cast(arg3), - arg4, - arg5, - arg6_const, - arg7, - arg8, - static_cast(arg9), - exp_workgroups_per_feature - ); - } + if (leaf_num_data == num_data) { + if (use_all_features) { + if (!is_constant_hessian) + histogram256<<>>( arg0, arg1, arg2, + reinterpret_cast(arg3), arg4, arg5, + static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + else + histogram256<<>>( arg0, arg1, arg2, + reinterpret_cast(arg3), arg4, arg5, + arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + } + else { + if (!is_constant_hessian) + histogram256_fulldata<<>>( arg0, arg1, arg2, + reinterpret_cast(arg3), arg4, arg5, + static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + else + histogram256_fulldata<<>>( arg0, arg1, arg2, + reinterpret_cast(arg3), arg4, arg5, + arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + } + } + else { + if (use_all_features) { + // seems all features is always enabled, so this should be the same as fulldata + if (!is_constant_hessian) + histogram256<<>>( arg0, arg1, arg2, + reinterpret_cast(arg3), arg4, arg5, + static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + else + histogram256<<>>( arg0, arg1, arg2, + reinterpret_cast(arg3), arg4, arg5, + arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + } + else { + if (!is_constant_hessian) + histogram256<<>>( arg0, arg1, arg2, + reinterpret_cast(arg3), arg4, arg5, + static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + else + histogram256<<>>( arg0, arg1, arg2, + reinterpret_cast(arg3), arg4, arg5, + arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + } + } } - } } - + #endif // USE_CUDA diff --git a/src/treelearner/cuda_kernel_launcher.h b/src/treelearner/cuda_kernel_launcher.h index f63cfa97a08..1241a9cafb9 100644 --- a/src/treelearner/cuda_kernel_launcher.h +++ b/src/treelearner/cuda_kernel_launcher.h @@ -10,6 +10,7 @@ struct ThreadData { // device id int device_id; // parameters for cuda_histogram + int histogram_size; data_size_t leaf_num_data; data_size_t num_data; bool use_all_features; @@ -40,6 +41,7 @@ struct ThreadData { void cuda_histogram( + int histogram_size, data_size_t leaf_num_data, data_size_t num_data, bool use_all_features, diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index e789462fe13..0e630389490 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -24,7 +24,8 @@ static void *launch_cuda_histogram(void *thread_data) { CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); // launch cuda kernel - cuda_histogram(td.leaf_num_data, td.num_data, td.use_all_features, + cuda_histogram(td.histogram_size, + td.leaf_num_data, td.num_data, td.use_all_features, td.is_constant_hessian, td.num_workgroups, td.stream, td.device_features, td.device_feature_masks, @@ -183,7 +184,7 @@ void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featu cudaMalloc(&(device_subhistograms_[device_id]), (size_t) num_workgroups * dword_features_ * device_bin_size_ * hist_bin_entry_sz_); } //set thread_data - SetThreadData(thread_data, device_id, leaf_num_data, use_all_features, + 
SetThreadData(thread_data, device_id, histogram_size_, leaf_num_data, use_all_features, num_workgroups, exp_workgroups_per_feature); } @@ -479,17 +480,21 @@ void CUDATreeLearner::InitGPU(int num_gpu) { max_num_bin_ = std::max(max_num_bin_, train_data_->FeatureGroupNumBin(i)); } + // GCF XXX: resolving device_bin_size_ and histogram_size_ is the remaining work if (max_num_bin_ <= 16) { device_bin_size_ = 256; //LGBM_CUDA + histogram_size_ = 16; dword_features_ = 1; // LGBM_CUDA } else if (max_num_bin_ <= 64) { device_bin_size_ = 256; //LGBM_CUDA + histogram_size_ = 64; dword_features_ = 1; // LGBM_CUDA } else if ( max_num_bin_ <= 256) { Log::Debug("device_bin_size_ = 256"); device_bin_size_ = 256; + histogram_size_ = 256; dword_features_ = 1; // LGBM_CUDA } else { diff --git a/src/treelearner/cuda_tree_learner.h b/src/treelearner/cuda_tree_learner.h index 93bbdc483b7..a84d6b6662f 100644 --- a/src/treelearner/cuda_tree_learner.h +++ b/src/treelearner/cuda_tree_learner.h @@ -109,11 +109,12 @@ class CUDATreeLearner: public SerialTreeLearner { */ void GPUHistogram(data_size_t leaf_num_data, bool use_all_features); - void SetThreadData(ThreadData* thread_data, int device_id, + void SetThreadData(ThreadData* thread_data, int device_id, int histogram_size, int leaf_num_data, bool use_all_features, int num_workgroups, int exp_workgroups_per_feature) { ThreadData* td = &thread_data[device_id]; td->device_id = device_id; + td->histogram_size = histogram_size; td->leaf_num_data = leaf_num_data; td->num_data = num_data_; td->use_all_features = use_all_features; @@ -208,6 +209,7 @@ class CUDATreeLearner: public SerialTreeLearner { * which GPU kernel to use */ int max_num_bin_; /*! \brief Used GPU kernel bin size (64, 256) */ + int histogram_size_; int device_bin_size_; /*! 
\brief Size of histogram bin entry, depending if single or double precision is used */ size_t hist_bin_entry_sz_; diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index 08195d855de..33761a2c2a4 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -20,7 +20,703 @@ if (b == gtid && t == ltid) { \ printf(fmt, __VA_ARGS__); \ } +// atomic add for float number in local memory +inline __device__ void atomic_local_add_f(acc_type *addr, const float val) +{ + atomicAdd(addr, static_cast(val)); +} + +// histogram16 stuff +#ifdef ENABLE_ALL_FEATURES +#ifdef IGNORE_INDICES +#define KERNEL_NAME histogram16_fulldata +#else // IGNORE_INDICES +#define KERNEL_NAME histogram16 // seems like ENABLE_ALL_FEATURES is set to 1 in the header if its disabled +//#define KERNEL_NAME histogram16_allfeats +#endif // IGNORE_INDICES +#else // ENABLE_ALL_FEATURES +#error "ENABLE_ALL_FEATURES should always be 1" +#define KERNEL_NAME histogram16 +#endif // ENABLE_ALL_FEATURES + +// this function will be called by histogram16 +// we have one sub-histogram of one feature in local memory, and need to read others +inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ feature_sub_hist, + const uint skip_id, + const uint old_val_cont_bin0, + const ushort num_sub_hist, + acc_type* __restrict__ output_buf, + acc_type* __restrict__ local_hist, + const size_t power_feature_workgroups) { + const ushort ltid = threadIdx.x; + // TODO: try to avoid bank conflict here + acc_type grad_bin = local_hist[ltid * 2]; + acc_type hess_bin = local_hist[ltid * 2 + 1]; + uint* __restrict__ local_cnt = (uint *)(local_hist + 2 * NUM_BINS); + + uint cont_bin; + if (power_feature_workgroups != 0) { + cont_bin = ltid ? 
local_cnt[ltid] : old_val_cont_bin0; + } else { + cont_bin = local_cnt[ltid]; + } + ushort i; + + if (power_feature_workgroups != 0) { + // add all sub-histograms for feature + const acc_type* __restrict__ p = feature_sub_hist + ltid; + for (i = 0; i < skip_id; ++i) { + grad_bin += *p; p += NUM_BINS; + hess_bin += *p; p += NUM_BINS; + cont_bin += as_acc_int_type(*p); p += NUM_BINS; + } + + // skip the counters we already have + p += 3 * NUM_BINS; + + for (i = i + 1; i < num_sub_hist; ++i) { + grad_bin += *p; p += NUM_BINS; + hess_bin += *p; p += NUM_BINS; + cont_bin += as_acc_int_type(*p); p += NUM_BINS; + } + } + __syncthreads(); + + + output_buf[ltid * 3 + 0] = grad_bin; + output_buf[ltid * 3 + 1] = hess_bin; + output_buf[ltid * 3 + 2] = as_acc_type((acc_int_type)cont_bin); +} + +#if USE_CONSTANT_BUF == 1 +__kernel void KERNEL_NAME(__global const uchar* restrict feature_data_base, + __constant const uchar* restrict feature_masks __attribute__((max_constant_size(65536))), + const data_size_t feature_size, + __constant const data_size_t* restrict data_indices __attribute__((max_constant_size(65536))), + const data_size_t num_data, + __constant const score_t* restrict ordered_gradients __attribute__((max_constant_size(65536))), +#if CONST_HESSIAN == 0 + __constant const score_t* restrict ordered_hessians __attribute__((max_constant_size(65536))), +#else + const score_t const_hessian, +#endif + char* __restrict__ output_buf, + volatile int * sync_counters, + acc_type* __restrict__ hist_buf_base, + const size_t power_feature_workgroups) { +#else +__global__ void KERNEL_NAME(const uchar* feature_data_base, + // FIXME: how to handle this __constant + const uchar* __restrict__ feature_masks, + const data_size_t feature_size, + const data_size_t* data_indices, + const data_size_t num_data, + const score_t* ordered_gradients, +#if CONST_HESSIAN == 0 + const score_t* ordered_hessians, +#else + const score_t const_hessian, +#endif + char* __restrict__ output_buf, + volatile int * sync_counters, + acc_type* __restrict__ hist_buf_base, + const size_t power_feature_workgroups) { +#endif + // allocate the local memory array aligned with float2, to guarantee correct alignment on NVIDIA platforms + // otherwise a "Misaligned Address" exception may occur + __shared__ float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)]; + const uint gtid = blockIdx.x * blockDim.x + threadIdx.x; + const ushort ltid = threadIdx.x; + const ushort lsize = NUM_BINS; // get_local_size(0); + const ushort group_id = blockIdx.x; + + // local memory per workgroup is 3 KB + // clear local memory + uint *ptr = (uint *) shared_array; + for (int i = ltid; i < LOCAL_MEM_SIZE/sizeof(uint); i += lsize) { + ptr[i] = 0; + } + __syncthreads(); + // gradient/hessian histograms + // assume this starts at 32 * 4 = 128-byte boundary // LGBM_CUDA: What does it mean? boundary?? + // total size: 2 * 256 * size_of(float) = 2 KB + // organization: each feature/grad/hessian is at a different bank, + // as indepedent of the feature value as possible + acc_type *gh_hist = (acc_type *)shared_array; + + // counter histogram + // total size: 256 * size_of(uint) = 1 KB + uint *cnt_hist = (uint *)(gh_hist + 2 * NUM_BINS); + + // odd threads (1, 3, ...) compute histograms for hessians first + // even thread (0, 2, ...) compute histograms for gradients first + // etc. 
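+  // A rough sketch of the interleaving driven by is_hessian_first = ltid & 1 below:
+  //   ltid 0: gradient slot first, then hessian slot, ...
+  //   ltid 1: hessian slot first, then gradient slot, ...
+  // so adjacent threads touch different shared-memory words on each flush, which
+  // seems intended to spread the atomic_local_add_f traffic across banks.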
+ uchar is_hessian_first = ltid & 1; + + ushort feature_id = group_id >> power_feature_workgroups; + + // each 2^POWER_FEATURE_WORKGROUPS workgroups process on one feature (compile-time constant) + // feature_size is the number of examples per feature + const uchar *feature_data = feature_data_base + feature_id * feature_size; + + // size of threads that process this feature4 + const uint subglobal_size = lsize * (1 << power_feature_workgroups); + + // equavalent thread ID in this subgroup for this feature4 + const uint subglobal_tid = gtid - feature_id * subglobal_size; + + + data_size_t ind; + data_size_t ind_next; + #ifdef IGNORE_INDICES + ind = subglobal_tid; + #else + ind = data_indices[subglobal_tid]; + #endif + + // extract feature mask, when a byte is set to 0, that feature is disabled + uchar feature_mask = feature_masks[feature_id]; + // exit if the feature is masked + if (!feature_mask) { + return; + } else { + feature_mask = feature_mask - 1; // LGBM_CUDA: feature_mask is used for get feature (1: 4bit feature, 0: 8bit feature) + } + + // STAGE 1: read feature data, and gradient and hessian + // first half of the threads read feature data from global memory + // We will prefetch data into the "next" variable at the beginning of each iteration + uchar feature; + uchar feature_next; + //uint8_t bin; + ushort bin; + + feature = feature_data[ind >> feature_mask]; + if (feature_mask) { + feature = (feature >> ((ind & 1) << 2)) & 0xf; + } + bin = feature; + acc_type grad_bin = 0.0f, hess_bin = 0.0f; + acc_type *addr_bin; + + // store gradient and hessian + score_t grad, hess; + score_t grad_next, hess_next; + // LGBM_CUDA v5.1 + grad = ordered_gradients[ind]; + #if CONST_HESSIAN == 0 + hess = ordered_hessians[ind]; + #endif + + + // there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4 + for (uint i = subglobal_tid; i < num_data; i += subglobal_size) { + // prefetch the next iteration variables + // we don't need bondary check because we have made the buffer large + #ifdef IGNORE_INDICES + // we need to check to bounds here + ind_next = i + subglobal_size < num_data ? i + subglobal_size : i; + #else + ind_next = data_indices[i + subglobal_size]; + #endif + + // imbGBT v5.1 + grad_next = ordered_gradients[ind_next]; + #if CONST_HESSIAN == 0 + hess_next = ordered_hessians[ind_next]; + #endif + + // STAGE 2: accumulate gradient and hessian + if (bin != feature) { + addr_bin = gh_hist + bin * 2 + is_hessian_first; + #if CONST_HESSIAN == 0 + acc_type acc_bin = is_hessian_first? hess_bin : grad_bin; + atomic_local_add_f(addr_bin, acc_bin); + + addr_bin = addr_bin + 1 - 2 * is_hessian_first; + acc_bin = is_hessian_first? grad_bin : hess_bin; + atomic_local_add_f(addr_bin, acc_bin); + + #elif CONST_HESSIAN == 1 + atomic_local_add_f(addr_bin, grad_bin); + #endif + + bin = feature; + grad_bin = grad; + hess_bin = hess; + } + else { + grad_bin += grad; + hess_bin += hess; + } + + // prefetch the next iteration variables + feature_next = feature_data[ind_next >> feature_mask]; + + // STAGE 3: accumulate counter + atomicAdd(cnt_hist + feature, 1); + + // STAGE 4: update next stat + grad = grad_next; + hess = hess_next; + // LGBM_CUDA: v4.2 + if (!feature_mask) { + feature = feature_next; + } else { + feature = (feature_next >> ((ind_next & 1) << 2)) & 0xf; + } + } + + + addr_bin = gh_hist + bin * 2 + is_hessian_first; + #if CONST_HESSIAN == 0 + acc_type acc_bin = is_hessian_first? 
hess_bin : grad_bin; + atomic_local_add_f(addr_bin, acc_bin); + + addr_bin = addr_bin + 1 - 2 * is_hessian_first; + acc_bin = is_hessian_first? grad_bin : hess_bin; + atomic_local_add_f(addr_bin, acc_bin); + + #elif CONST_HESSIAN == 1 + atomic_local_add_f(addr_bin, grad_bin); + #endif + __syncthreads(); + + #if CONST_HESSIAN == 1 + // make a final reduction + gh_hist[ltid * 2] += gh_hist[ltid * 2 + 1]; + gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // LGBM_CUDA: counter move to this position + __syncthreads(); + #endif + +#if POWER_FEATURE_WORKGROUPS != 0 + acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 3 * NUM_BINS; + // write gradients and hessians + acc_type *__restrict__ ptr_f = output; + for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { + // even threads read gradients, odd threads read hessians + // FIXME: 2-way bank conflict + acc_type value = gh_hist[i]; + ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; + } + // write counts + acc_int_type *__restrict__ ptr_i = (acc_int_type *)(output + 2 * NUM_BINS); + for (ushort i = ltid; i < NUM_BINS; i += lsize) { + // FIXME: 2-way bank conflict + uint value = cnt_hist[i]; + ptr_i[i] = value; + } + // FIXME: is this right + __syncthreads(); + __threadfence(); + // To avoid the cost of an extra reducting kernel, we have to deal with some + // gray area in OpenCL. We want the last work group that process this feature to + // make the final reduction, and other threads will just quit. + // This requires that the results written by other workgroups available to the + // last workgroup (memory consistency) + #if NVIDIA == 1 + // this is equavalent to CUDA __threadfence(); + // ensure the writes above goes to main memory and other workgroups can see it + asm volatile("{\n\tmembar.gl;\n\t}\n\t" :::"memory"); + #else + // FIXME: how to do the above on AMD GPUs?? + // GCN ISA says that the all writes will bypass L1 cache (write through), + // however when the last thread is reading sub-histogram data we have to + // make sure that no part of data is modified in local L1 cache of other workgroups. + // Otherwise reading can be a problem (atomic operations to get consistency). + // But in our case, the sub-histogram of this workgroup cannot be in the cache + // of another workgroup, so the following trick will work just fine. + #endif + // Now, we want one workgroup to do the final reduction. + // Other workgroups processing the same feature quit. + // The is done by using an global atomic counter. + // On AMD GPUs ideally this should be done in GDS, + // but currently there is no easy way to access it via OpenCL. + uint * counter_val = cnt_hist; + // backup the old value + uint old_val = *counter_val; + if (ltid == 0) { + // all workgroups processing the same feature add this counter + *counter_val = atomicAdd(const_cast(sync_counters + feature_id), 1); + } + // make sure everyone in this workgroup is here + __syncthreads(); + // everyone in this wrokgroup: if we are the last workgroup, then do reduction! 
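+  // In other words (sketch of the protocol used here): each of the
+  // 2^power_feature_workgroups workgroups for this feature atomically increments
+  // sync_counters[feature_id] once; atomicAdd returns the old value, so only the
+  // last workgroup to arrive reads back (1 << power_feature_workgroups) - 1. That
+  // workgroup resets the counter and performs within_kernel_reduction16x4 below,
+  // while every other workgroup skips the block and exits.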
+ if (*counter_val == (1 << power_feature_workgroups) - 1) { + if (ltid == 0) { + sync_counters[feature_id] = 0; + } + //} + #else + } + // only 1 work group, no need to increase counter + // the reduction will become a simple copy + if (1) { + uint old_val; // dummy + #endif + // locate our feature's block in output memory + uint output_offset = (feature_id << power_feature_workgroups); + acc_type const * __restrict__ feature_subhists = + (acc_type *)output_buf + output_offset * 3 * NUM_BINS; + // skip reading the data already in local memory + //uint skip_id = feature_id ^ output_offset; + uint skip_id = group_id - output_offset; + // locate output histogram location for this feature4 + acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 3 * NUM_BINS; + + + within_kernel_reduction16x4(feature_subhists, skip_id, old_val, 1 << power_feature_workgroups, hist_buf, (acc_type *)shared_array, power_feature_workgroups); + } +} + +// end of histogram16 stuff + +// histogram64 stuff +#ifdef ENABLE_ALL_FEATURES +#ifdef IGNORE_INDICES +#define KERNEL_NAME histogram64_fulldata +#else // IGNORE_INDICES +#define KERNEL_NAME histogram64 // seems like ENABLE_ALL_FEATURES is set to 1 in the header if its disabled +//#define KERNEL_NAME histogram64_allfeats +#endif // IGNORE_INDICES +#else // ENABLE_ALL_FEATURES +#error "ENABLE_ALL_FEATURES should always be 1" +#define KERNEL_NAME histogram64 +#endif // ENABLE_ALL_FEATURES + +// this function will be called by histogram64 +// we have one sub-histogram of one feature in local memory, and need to read others +inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ feature_sub_hist, + const uint skip_id, + const uint old_val_cont_bin0, + const ushort num_sub_hist, + acc_type* __restrict__ output_buf, + acc_type* __restrict__ local_hist, + const size_t power_feature_workgroups) { + const ushort ltid = threadIdx.x; + // TODO: try to avoid bank conflict here + acc_type grad_bin = local_hist[ltid * 2]; + acc_type hess_bin = local_hist[ltid * 2 + 1]; + uint* __restrict__ local_cnt = (uint *)(local_hist + 2 * NUM_BINS); + + uint cont_bin; + if (power_feature_workgroups != 0) { + cont_bin = ltid ? 
local_cnt[ltid] : old_val_cont_bin0; + } else { + cont_bin = local_cnt[ltid]; + } + ushort i; + + if (power_feature_workgroups != 0) { + // add all sub-histograms for feature + const acc_type* __restrict__ p = feature_sub_hist + ltid; + for (i = 0; i < skip_id; ++i) { + grad_bin += *p; p += NUM_BINS; + hess_bin += *p; p += NUM_BINS; + cont_bin += as_acc_int_type(*p); p += NUM_BINS; + } + + // skip the counters we already have + p += 3 * NUM_BINS; + + for (i = i + 1; i < num_sub_hist; ++i) { + grad_bin += *p; p += NUM_BINS; + hess_bin += *p; p += NUM_BINS; + cont_bin += as_acc_int_type(*p); p += NUM_BINS; + } + } + __syncthreads(); + + + output_buf[ltid * 3 + 0] = grad_bin; + output_buf[ltid * 3 + 1] = hess_bin; + output_buf[ltid * 3 + 2] = as_acc_type((acc_int_type)cont_bin); +} + +#if USE_CONSTANT_BUF == 1 +__kernel void KERNEL_NAME(__global const uchar* restrict feature_data_base, + __constant const uchar* restrict feature_masks __attribute__((max_constant_size(65536))), + const data_size_t feature_size, + __constant const data_size_t* restrict data_indices __attribute__((max_constant_size(65536))), + const data_size_t num_data, + __constant const score_t* restrict ordered_gradients __attribute__((max_constant_size(65536))), +#if CONST_HESSIAN == 0 + __constant const score_t* restrict ordered_hessians __attribute__((max_constant_size(65536))), +#else + const score_t const_hessian, +#endif + char* __restrict__ output_buf, + volatile int * sync_counters, + acc_type* __restrict__ hist_buf_base, + const size_t power_feature_workgroups) { +#else +__global__ void KERNEL_NAME(const uchar* feature_data_base, + // FIXME: how to handle this __constant + const uchar* __restrict__ feature_masks, + const data_size_t feature_size, + const data_size_t* data_indices, + const data_size_t num_data, + const score_t* ordered_gradients, +#if CONST_HESSIAN == 0 + const score_t* ordered_hessians, +#else + const score_t const_hessian, +#endif + char* __restrict__ output_buf, + volatile int * sync_counters, + acc_type* __restrict__ hist_buf_base, + const size_t power_feature_workgroups) { +#endif + // allocate the local memory array aligned with float2, to guarantee correct alignment on NVIDIA platforms + // otherwise a "Misaligned Address" exception may occur + __shared__ float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)]; + const uint gtid = blockIdx.x * blockDim.x + threadIdx.x; + const ushort ltid = threadIdx.x; + const ushort lsize = NUM_BINS; // get_local_size(0); + const ushort group_id = blockIdx.x; + + // local memory per workgroup is 3 KB + // clear local memory + uint *ptr = (uint *) shared_array; + for (int i = ltid; i < LOCAL_MEM_SIZE/sizeof(uint); i += lsize) { + ptr[i] = 0; + } + __syncthreads(); + // gradient/hessian histograms + // assume this starts at 32 * 4 = 128-byte boundary // LGBM_CUDA: What does it mean? boundary?? + // total size: 2 * 256 * size_of(float) = 2 KB + // organization: each feature/grad/hessian is at a different bank, + // as indepedent of the feature value as possible + acc_type *gh_hist = (acc_type *)shared_array; + + // counter histogram + // total size: 256 * size_of(uint) = 1 KB + uint *cnt_hist = (uint *)(gh_hist + 2 * NUM_BINS); + + // odd threads (1, 3, ...) compute histograms for hessians first + // even thread (0, 2, ...) compute histograms for gradients first + // etc. 
+ uchar is_hessian_first = ltid & 1; + + ushort feature_id = group_id >> power_feature_workgroups; + + // each 2^POWER_FEATURE_WORKGROUPS workgroups process on one feature (compile-time constant) + // feature_size is the number of examples per feature + const uchar *feature_data = feature_data_base + feature_id * feature_size; + + // size of threads that process this feature4 + const uint subglobal_size = lsize * (1 << power_feature_workgroups); + + // equavalent thread ID in this subgroup for this feature4 + const uint subglobal_tid = gtid - feature_id * subglobal_size; + + + data_size_t ind; + data_size_t ind_next; + #ifdef IGNORE_INDICES + ind = subglobal_tid; + #else + ind = data_indices[subglobal_tid]; + #endif + + // extract feature mask, when a byte is set to 0, that feature is disabled + uchar feature_mask = feature_masks[feature_id]; + // exit if the feature is masked + if (!feature_mask) { + return; + } else { + feature_mask = feature_mask - 1; // LGBM_CUDA: feature_mask is used for get feature (1: 4bit feature, 0: 8bit feature) + } + + // STAGE 1: read feature data, and gradient and hessian + // first half of the threads read feature data from global memory + // We will prefetch data into the "next" variable at the beginning of each iteration + uchar feature; + uchar feature_next; + //uint8_t bin; + ushort bin; + + feature = feature_data[ind >> feature_mask]; + if (feature_mask) { + feature = (feature >> ((ind & 1) << 2)) & 0xf; + } + bin = feature; + acc_type grad_bin = 0.0f, hess_bin = 0.0f; + acc_type *addr_bin; + + // store gradient and hessian + score_t grad, hess; + score_t grad_next, hess_next; + // LGBM_CUDA v5.1 + grad = ordered_gradients[ind]; + #if CONST_HESSIAN == 0 + hess = ordered_hessians[ind]; + #endif + + + // there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4 + for (uint i = subglobal_tid; i < num_data; i += subglobal_size) { + // prefetch the next iteration variables + // we don't need bondary check because we have made the buffer large + #ifdef IGNORE_INDICES + // we need to check to bounds here + ind_next = i + subglobal_size < num_data ? i + subglobal_size : i; + #else + ind_next = data_indices[i + subglobal_size]; + #endif + + // imbGBT v5.1 + grad_next = ordered_gradients[ind_next]; + #if CONST_HESSIAN == 0 + hess_next = ordered_hessians[ind_next]; + #endif + + // STAGE 2: accumulate gradient and hessian + if (bin != feature) { + addr_bin = gh_hist + bin * 2 + is_hessian_first; + #if CONST_HESSIAN == 0 + acc_type acc_bin = is_hessian_first? hess_bin : grad_bin; + atomic_local_add_f(addr_bin, acc_bin); + + addr_bin = addr_bin + 1 - 2 * is_hessian_first; + acc_bin = is_hessian_first? grad_bin : hess_bin; + atomic_local_add_f(addr_bin, acc_bin); + + #elif CONST_HESSIAN == 1 + atomic_local_add_f(addr_bin, grad_bin); + #endif + + bin = feature; + grad_bin = grad; + hess_bin = hess; + } + else { + grad_bin += grad; + hess_bin += hess; + } + + // prefetch the next iteration variables + feature_next = feature_data[ind_next >> feature_mask]; + + // STAGE 3: accumulate counter + atomicAdd(cnt_hist + feature, 1); + + // STAGE 4: update next stat + grad = grad_next; + hess = hess_next; + // LGBM_CUDA: v4.2 + if (!feature_mask) { + feature = feature_next; + } else { + feature = (feature_next >> ((ind_next & 1) << 2)) & 0xf; + } + } + + + addr_bin = gh_hist + bin * 2 + is_hessian_first; + #if CONST_HESSIAN == 0 + acc_type acc_bin = is_hessian_first? 
hess_bin : grad_bin; + atomic_local_add_f(addr_bin, acc_bin); + + addr_bin = addr_bin + 1 - 2 * is_hessian_first; + acc_bin = is_hessian_first? grad_bin : hess_bin; + atomic_local_add_f(addr_bin, acc_bin); + + #elif CONST_HESSIAN == 1 + atomic_local_add_f(addr_bin, grad_bin); + #endif + __syncthreads(); + + #if CONST_HESSIAN == 1 + // make a final reduction + gh_hist[ltid * 2] += gh_hist[ltid * 2 + 1]; + gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // LGBM_CUDA: counter move to this position + __syncthreads(); + #endif + +#if POWER_FEATURE_WORKGROUPS != 0 + acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 3 * NUM_BINS; + // write gradients and hessians + acc_type *__restrict__ ptr_f = output; + for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { + // even threads read gradients, odd threads read hessians + // FIXME: 2-way bank conflict + acc_type value = gh_hist[i]; + ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; + } + // write counts + acc_int_type *__restrict__ ptr_i = (acc_int_type *)(output + 2 * NUM_BINS); + for (ushort i = ltid; i < NUM_BINS; i += lsize) { + // FIXME: 2-way bank conflict + uint value = cnt_hist[i]; + ptr_i[i] = value; + } + // FIXME: is this right + __syncthreads(); + __threadfence(); + // To avoid the cost of an extra reducting kernel, we have to deal with some + // gray area in OpenCL. We want the last work group that process this feature to + // make the final reduction, and other threads will just quit. + // This requires that the results written by other workgroups available to the + // last workgroup (memory consistency) + #if NVIDIA == 1 + // this is equavalent to CUDA __threadfence(); + // ensure the writes above goes to main memory and other workgroups can see it + asm volatile("{\n\tmembar.gl;\n\t}\n\t" :::"memory"); + #else + // FIXME: how to do the above on AMD GPUs?? + // GCN ISA says that the all writes will bypass L1 cache (write through), + // however when the last thread is reading sub-histogram data we have to + // make sure that no part of data is modified in local L1 cache of other workgroups. + // Otherwise reading can be a problem (atomic operations to get consistency). + // But in our case, the sub-histogram of this workgroup cannot be in the cache + // of another workgroup, so the following trick will work just fine. + #endif + // Now, we want one workgroup to do the final reduction. + // Other workgroups processing the same feature quit. + // The is done by using an global atomic counter. + // On AMD GPUs ideally this should be done in GDS, + // but currently there is no easy way to access it via OpenCL. + uint * counter_val = cnt_hist; + // backup the old value + uint old_val = *counter_val; + if (ltid == 0) { + // all workgroups processing the same feature add this counter + *counter_val = atomicAdd(const_cast(sync_counters + feature_id), 1); + } + // make sure everyone in this workgroup is here + __syncthreads(); + // everyone in this wrokgroup: if we are the last workgroup, then do reduction! 
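+  // Note on old_val (sketch, same pattern as the 16-bin kernel above): counter_val
+  // aliases cnt_hist[0], so the atomicAdd result written by ltid 0 clobbers the bin-0
+  // counter. The original cnt_hist[0] is therefore snapshotted into old_val first and
+  // passed to within_kernel_reduction64x4 as old_val_cont_bin0, so bin 0 still sees
+  // its true count during the final reduction.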
+ if (*counter_val == (1 << power_feature_workgroups) - 1) { + if (ltid == 0) { + sync_counters[feature_id] = 0; + } + //} + #else + } + // only 1 work group, no need to increase counter + // the reduction will become a simple copy + if (1) { + uint old_val; // dummy + #endif + // locate our feature's block in output memory + uint output_offset = (feature_id << power_feature_workgroups); + acc_type const * __restrict__ feature_subhists = + (acc_type *)output_buf + output_offset * 3 * NUM_BINS; + // skip reading the data already in local memory + //uint skip_id = feature_id ^ output_offset; + uint skip_id = group_id - output_offset; + // locate output histogram location for this feature4 + acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 3 * NUM_BINS; + + + within_kernel_reduction64x4(feature_subhists, skip_id, old_val, 1 << power_feature_workgroups, hist_buf, (acc_type *)shared_array, power_feature_workgroups); + } +} + +// end of histogram64 stuff +// histogram256 stuff #ifdef ENABLE_ALL_FEATURES #ifdef IGNORE_INDICES #define KERNEL_NAME histogram256_fulldata @@ -33,13 +729,6 @@ if (b == gtid && t == ltid) { \ #define KERNEL_NAME histogram256 #endif // ENABLE_ALL_FEATURES - -// atomic add for float number in local memory -inline __device__ void atomic_local_add_f(acc_type *addr, const float val) -{ - atomicAdd(addr, static_cast(val)); -} - // this function will be called by histogram256 // we have one sub-histogram of one feature in local memory, and need to read others inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ feature_sub_hist, @@ -128,7 +817,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, __shared__ float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)]; const uint gtid = blockIdx.x * blockDim.x + threadIdx.x; const ushort ltid = threadIdx.x; - const ushort lsize = LOCAL_SIZE_0; // get_local_size(0); + const ushort lsize = NUM_BINS; // get_local_size(0); const ushort group_id = blockIdx.x; // local memory per workgroup is 3 KB @@ -370,3 +1059,5 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, } } +// end of histogram256 stuff + diff --git a/src/treelearner/kernels/histogram_16_64_256.hu b/src/treelearner/kernels/histogram_16_64_256.hu index 145a85367f0..eff3e51c460 100644 --- a/src/treelearner/kernels/histogram_16_64_256.hu +++ b/src/treelearner/kernels/histogram_16_64_256.hu @@ -12,8 +12,8 @@ * disclosure restricted by GSA ADP Schedule Contract with IBM Corp. */ -#ifndef _HISTOGRAM_256_KERNEL_ -#define _HISTOGRAM_256_KERNEL_ +#ifndef _HISTOGRAM_16_64_256_KERNEL_ +#define _HISTOGRAM_16_64_256_KERNEL_ //#pragma once @@ -65,8 +65,6 @@ __device__ uchar4 as_uchar4(const T t) { return u; } - -#define LOCAL_SIZE_0 256 #define NUM_BINS 256 #if USE_DP_FLOAT == 1 typedef double acc_type; @@ -79,7 +77,6 @@ typedef uint acc_int_type; #define as_acc_type as_float #define as_acc_int_type as_uint #endif -//#define LOCAL_MEM_SIZE (4 * (sizeof(uint) + 2 * sizeof(acc_type)) * NUM_BINS) #define LOCAL_MEM_SIZE ((sizeof(uint) + 2 * sizeof(acc_type)) * NUM_BINS) // unroll the atomic operation for a few times. 
Takes more code space, @@ -169,6 +166,20 @@ __global__ void name(const uchar* feature_data_base, \ const size_t power_feature_workgroups); +DECLARE_CONST_HES(histogram16_allfeats); +DECLARE_CONST_HES(histogram16_fulldata); +DECLARE_CONST_HES(histogram16); +DECLARE(histogram16_allfeats); +DECLARE(histogram16_fulldata); +DECLARE(histogram16); + +DECLARE_CONST_HES(histogram64_allfeats); +DECLARE_CONST_HES(histogram64_fulldata); +DECLARE_CONST_HES(histogram64); +DECLARE(histogram64_allfeats); +DECLARE(histogram64_fulldata); +DECLARE(histogram64); + DECLARE_CONST_HES(histogram256_allfeats); DECLARE_CONST_HES(histogram256_fulldata); DECLARE_CONST_HES(histogram256); From 29f6979b40aee2e7700a38eef9de1a7c13679d27 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Tue, 21 Apr 2020 21:35:43 +0000 Subject: [PATCH 024/119] Initial CUDA work --- src/treelearner/kernels/histogram_16_64_256.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index 33761a2c2a4..020ca5453af 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -372,6 +372,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // end of histogram16 stuff // histogram64 stuff +#undef KERNEL_NAME #ifdef ENABLE_ALL_FEATURES #ifdef IGNORE_INDICES #define KERNEL_NAME histogram64_fulldata @@ -717,6 +718,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // end of histogram64 stuff // histogram256 stuff +#undef KERNEL_NAME #ifdef ENABLE_ALL_FEATURES #ifdef IGNORE_INDICES #define KERNEL_NAME histogram256_fulldata From 4d89fd7534023b4fd25377409709473f949ff39c Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Wed, 22 Apr 2020 18:00:07 +0000 Subject: [PATCH 025/119] Initial CUDA work --- tests/python_package_test/test_consistency.py | 4 ++++ tests/python_package_test/test_plotting.py | 2 ++ tests/python_package_test/test_sklearn.py | 6 ++++++ 3 files changed, 12 insertions(+) diff --git a/tests/python_package_test/test_consistency.py b/tests/python_package_test/test_consistency.py index 63a5834cf61..f6e955ee48d 100644 --- a/tests/python_package_test/test_consistency.py +++ b/tests/python_package_test/test_consistency.py @@ -68,6 +68,8 @@ class TestEngine(unittest.TestCase): def test_binary(self): fd = FileLoader('../../examples/binary_classification', 'binary') + if lgb.get_device_type() == 2: + fd.params["device"] = "cuda" X_train, y_train, _ = fd.load_dataset('.train') X_test, _, X_test_fn = fd.load_dataset('.test') weight_train = fd.load_field('.train.weight') @@ -91,6 +93,8 @@ def test_multiclass(self): def test_regression(self): fd = FileLoader('../../examples/regression', 'regression') + if lgb.get_device_type() == 2: + fd.params["device"] = "cuda" X_train, y_train, _ = fd.load_dataset('.train') X_test, _, X_test_fn = fd.load_dataset('.test') init_score_train = fd.load_field('.train.init') diff --git a/tests/python_package_test/test_plotting.py b/tests/python_package_test/test_plotting.py index 72915914fe1..13ba9859d97 100644 --- a/tests/python_package_test/test_plotting.py +++ b/tests/python_package_test/test_plotting.py @@ -24,6 +24,8 @@ def setUp(self): "verbose": -1, "num_leaves": 3 } + if lgb.get_device_type() == 2: + self.params["device"] = "cuda" @unittest.skipIf(not MATPLOTLIB_INSTALLED, 'matplotlib is not installed') def test_plot_importance(self): diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index 
cd50805a70b..350f3c8f486 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -453,6 +453,8 @@ def test_evaluate_train_set(self): def test_metrics(self): X, y = load_boston(True) params = {'n_estimators': 2, 'verbose': -1} + if lgb.get_device_type() == 2: + params["device"] = "cuda" params_fit = {'X': X, 'y': y, 'eval_set': (X, y), 'verbose': False} # no custom objective, no custom metric @@ -709,6 +711,8 @@ def test_inf_handle(self): y = np.random.randn(nrows) + np.full(nrows, 1e30) weight = np.full(nrows, 1e10) params = {'n_estimators': 20, 'verbose': -1} + if lgb.get_device_type() == 2: + params["device"] = "cuda" params_fit = {'X': X, 'y': y, 'sample_weight': weight, 'eval_set': (X, y), 'verbose': False, 'early_stopping_rounds': 5} gbm = lgb.LGBMRegressor(**params).fit(**params_fit) @@ -721,6 +725,8 @@ def test_nan_handle(self): y = np.random.randn(nrows) + np.full(nrows, 1e30) weight = np.zeros(nrows) params = {'n_estimators': 20, 'verbose': -1} + if lgb.get_device_type() == 2: + params["device"] = "cuda" params_fit = {'X': X, 'y': y, 'sample_weight': weight, 'eval_set': (X, y), 'verbose': False, 'early_stopping_rounds': 5} gbm = lgb.LGBMRegressor(**params).fit(**params_fit) From e10a46784136099c2400f8957a3b95adf2984ebc Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Wed, 22 Apr 2020 18:03:14 +0000 Subject: [PATCH 026/119] Initial CUDA work --- include/LightGBM/cuda/vector_cudahost.h | 1 + src/boosting/gbdt.cpp | 8 ++++---- src/treelearner/cuda_tree_learner.cpp | 4 ++-- src/treelearner/serial_tree_learner.cpp | 2 ++ 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/include/LightGBM/cuda/vector_cudahost.h b/include/LightGBM/cuda/vector_cudahost.h index d95a353c246..61d6e464970 100644 --- a/include/LightGBM/cuda/vector_cudahost.h +++ b/include/LightGBM/cuda/vector_cudahost.h @@ -58,6 +58,7 @@ fprintf(stderr, " TROUBLE: defaulting to malloc in CHAllocator!!!\n"); fflush( void deallocate(T* p, std::size_t n) { + (void)n; // UNUSED if (p==NULL) return; #ifdef USE_CUDA if (LightGBM::LGBM_config_::current_device == lgbm_device_cuda){ diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index b2fb636ac4e..55e11312235 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -385,7 +385,7 @@ bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { // LGBM_CUDA invoke baggging during the first iteration if ((config_->device_type == std::string("cuda")) && (iter_ == 0)) { - auto start_time = std::chrono::steady_clock::now(); +// auto start_time = std::chrono::steady_clock::now(); Bagging(0); } @@ -399,7 +399,7 @@ bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { } // LGBM_CUDA - auto start_time = std::chrono::steady_clock::now(); +// auto start_time = std::chrono::steady_clock::now(); Boosting(); @@ -414,7 +414,7 @@ bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { // LGBM_CUDA - auto start_time = std::chrono::steady_clock::now(); +// auto start_time = std::chrono::steady_clock::now(); const size_t offset = static_cast(cur_tree_id) * num_data_; std::unique_ptr new_tree(new Tree(2)); @@ -484,7 +484,7 @@ bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { int iter_next = iter_ + 1; if (iter_next < config_->num_iterations) { - auto start_time = std::chrono::steady_clock::now(); +// auto start_time = 
std::chrono::steady_clock::now(); // bagging logic Bagging(iter_next); diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 0e630389490..aa061bc289f 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -225,7 +225,7 @@ void CUDATreeLearner::WaitAndGetHistograms(hist_t* histograms) { //#pragma omp parallel for schedule(static, num_gpu_) for(int device_id = 0; device_id < num_gpu_; ++device_id) { - auto start_time = std::chrono::steady_clock::now(); +// auto start_time = std::chrono::steady_clock::now(); // when the output is ready, the computation is done CUDASUCCESS_OR_FATAL(cudaEventSynchronize(histograms_wait_obj_[device_id])); @@ -865,7 +865,7 @@ bool CUDATreeLearner::ConstructGPUHistogramsAsync( void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_used, bool use_subtract) { // LGBM_CUDA - auto start_time = std::chrono::steady_clock::now(); +// auto start_time = std::chrono::steady_clock::now(); std::vector is_sparse_feature_used(num_features_, 0); std::vector is_dense_feature_used(num_features_, 0); diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 5d2b9afff50..3c9faa84ff5 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -27,6 +27,7 @@ SerialTreeLearner::~SerialTreeLearner() { //LGBM_CUDA void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) { + (void)is_use_subset; // UNUSED train_data_ = train_data; num_data_ = train_data_->num_data(); num_features_ = train_data_->num_features(); @@ -444,6 +445,7 @@ void SerialTreeLearner::FindBestSplitsFromHistograms( int32_t SerialTreeLearner::ForceSplits(Tree* tree, Json& forced_split_json, int* left_leaf, int* right_leaf, int *cur_depth, bool *aborted_last_force_split) { + (void)aborted_last_force_split; bool abort_last_forced_split = false; if (forced_split_json_ == nullptr) { return 0; From 911c1b398da1d2c6f3688fe763121b1b22ce0b93 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Wed, 22 Apr 2020 21:31:20 +0000 Subject: [PATCH 027/119] Initial CUDA work --- src/treelearner/cuda_kernel_launcher.cu | 32 +++++++++---------- src/treelearner/cuda_tree_learner.cpp | 4 +-- .../kernels/histogram_16_64_256.cu | 16 ++++++++++ .../kernels/histogram_16_64_256.hu | 2 -- 4 files changed, 34 insertions(+), 20 deletions(-) diff --git a/src/treelearner/cuda_kernel_launcher.cu b/src/treelearner/cuda_kernel_launcher.cu index 4906ca7e02d..dad8b6c563b 100644 --- a/src/treelearner/cuda_kernel_launcher.cu +++ b/src/treelearner/cuda_kernel_launcher.cu @@ -32,21 +32,21 @@ if (leaf_num_data == num_data) { if (use_all_features) { if (!is_constant_hessian) - histogram16<<>>( arg0, arg1, arg2, + histogram16<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else - histogram16<<>>( arg0, arg1, arg2, + histogram16<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } else { if (!is_constant_hessian) - histogram16_fulldata<<>>( arg0, arg1, arg2, + histogram16_fulldata<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else - histogram16_fulldata<<>>( arg0, arg1, arg2, + 
histogram16_fulldata<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } @@ -55,21 +55,21 @@ if (use_all_features) { // seems all features is always enabled, so this should be the same as fulldata if (!is_constant_hessian) - histogram16<<>>( arg0, arg1, arg2, + histogram16<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else - histogram16<<>>( arg0, arg1, arg2, + histogram16<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } else { if (!is_constant_hessian) - histogram16<<>>( arg0, arg1, arg2, + histogram16<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else - histogram16<<>>( arg0, arg1, arg2, + histogram16<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } @@ -79,21 +79,21 @@ if (leaf_num_data == num_data) { if (use_all_features) { if (!is_constant_hessian) - histogram64<<>>( arg0, arg1, arg2, + histogram64<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else - histogram64<<>>( arg0, arg1, arg2, + histogram64<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } else { if (!is_constant_hessian) - histogram64_fulldata<<>>( arg0, arg1, arg2, + histogram64_fulldata<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else - histogram64_fulldata<<>>( arg0, arg1, arg2, + histogram64_fulldata<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } @@ -102,21 +102,21 @@ if (use_all_features) { // seems all features is always enabled, so this should be the same as fulldata if (!is_constant_hessian) - histogram64<<>>( arg0, arg1, arg2, + histogram64<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else - histogram64<<>>( arg0, arg1, arg2, + histogram64<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } else { if (!is_constant_hessian) - histogram64<<>>( arg0, arg1, arg2, + histogram64<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else - histogram64<<>>( arg0, arg1, arg2, + histogram64<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index aa061bc289f..50d0541ffe9 100644 --- 
a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -482,12 +482,12 @@ void CUDATreeLearner::InitGPU(int num_gpu) { // GCF XXX: resolving device_bin_size_ and histogram_size_ is the remaining work if (max_num_bin_ <= 16) { - device_bin_size_ = 256; //LGBM_CUDA + device_bin_size_ = 16; //LGBM_CUDA histogram_size_ = 16; dword_features_ = 1; // LGBM_CUDA } else if (max_num_bin_ <= 64) { - device_bin_size_ = 256; //LGBM_CUDA + device_bin_size_ = 64; //LGBM_CUDA histogram_size_ = 64; dword_features_ = 1; // LGBM_CUDA } diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index 020ca5453af..a02bc1dd79f 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -38,6 +38,8 @@ inline __device__ void atomic_local_add_f(acc_type *addr, const float val) #error "ENABLE_ALL_FEATURES should always be 1" #define KERNEL_NAME histogram16 #endif // ENABLE_ALL_FEATURES +#define NUM_BINS 16 +#define LOCAL_MEM_SIZE ((sizeof(uint) + 2 * sizeof(acc_type)) * NUM_BINS) // this function will be called by histogram16 // we have one sub-histogram of one feature in local memory, and need to read others @@ -130,6 +132,8 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, const ushort lsize = NUM_BINS; // get_local_size(0); const ushort group_id = blockIdx.x; +//if (gtid == 0) printf("Entering the 16-bucket kernel, NUM_BINS = %d, block size = %d\n", NUM_BINS, blockDim.x); + // local memory per workgroup is 3 KB // clear local memory uint *ptr = (uint *) shared_array; @@ -373,6 +377,8 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // histogram64 stuff #undef KERNEL_NAME +#undef NUM_BINS +#undef LOCAL_MEM_SIZE #ifdef ENABLE_ALL_FEATURES #ifdef IGNORE_INDICES #define KERNEL_NAME histogram64_fulldata @@ -384,6 +390,8 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #error "ENABLE_ALL_FEATURES should always be 1" #define KERNEL_NAME histogram64 #endif // ENABLE_ALL_FEATURES +#define NUM_BINS 64 +#define LOCAL_MEM_SIZE ((sizeof(uint) + 2 * sizeof(acc_type)) * NUM_BINS) // this function will be called by histogram64 // we have one sub-histogram of one feature in local memory, and need to read others @@ -476,6 +484,8 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, const ushort lsize = NUM_BINS; // get_local_size(0); const ushort group_id = blockIdx.x; +//if (gtid == 0) printf("Entering the 64-bucket kernel, NUM_BINS = %d, block size = %d\n", NUM_BINS, blockDim.x); + // local memory per workgroup is 3 KB // clear local memory uint *ptr = (uint *) shared_array; @@ -719,6 +729,8 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // histogram256 stuff #undef KERNEL_NAME +#undef NUM_BINS +#undef LOCAL_MEM_SIZE #ifdef ENABLE_ALL_FEATURES #ifdef IGNORE_INDICES #define KERNEL_NAME histogram256_fulldata @@ -730,6 +742,8 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #error "ENABLE_ALL_FEATURES should always be 1" #define KERNEL_NAME histogram256 #endif // ENABLE_ALL_FEATURES +#define NUM_BINS 256 +#define LOCAL_MEM_SIZE ((sizeof(uint) + 2 * sizeof(acc_type)) * NUM_BINS) // this function will be called by histogram256 // we have one sub-histogram of one feature in local memory, and need to read others @@ -822,6 +836,8 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, const ushort lsize = NUM_BINS; // get_local_size(0); const ushort group_id = blockIdx.x; +//if (gtid == 0) printf("Entering the 
256-bucket kernel, NUM_BINS = %d, block size = %d\n", NUM_BINS, blockDim.x); + // local memory per workgroup is 3 KB // clear local memory uint *ptr = (uint *) shared_array; diff --git a/src/treelearner/kernels/histogram_16_64_256.hu b/src/treelearner/kernels/histogram_16_64_256.hu index eff3e51c460..1a875588cc4 100644 --- a/src/treelearner/kernels/histogram_16_64_256.hu +++ b/src/treelearner/kernels/histogram_16_64_256.hu @@ -65,7 +65,6 @@ __device__ uchar4 as_uchar4(const T t) { return u; } -#define NUM_BINS 256 #if USE_DP_FLOAT == 1 typedef double acc_type; typedef ulong acc_int_type; @@ -77,7 +76,6 @@ typedef uint acc_int_type; #define as_acc_type as_float #define as_acc_int_type as_uint #endif -#define LOCAL_MEM_SIZE ((sizeof(uint) + 2 * sizeof(acc_type)) * NUM_BINS) // unroll the atomic operation for a few times. Takes more code space, // but compiler can generate better code for faster atomics. From 18704804f9f90793731296ba9c97ce960c692406 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Fri, 24 Apr 2020 23:41:52 +0000 Subject: [PATCH 028/119] Initial CUDA work --- src/treelearner/cuda_tree_learner.cpp | 4 +- .../kernels/histogram_16_64_256.cu | 45 ++++++++----------- 2 files changed, 19 insertions(+), 30 deletions(-) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 50d0541ffe9..2c7114aed0a 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -188,7 +188,6 @@ void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featu num_workgroups, exp_workgroups_per_feature); } - for(int device_id = 0; device_id < num_gpu_; ++device_id) { if (pthread_create(cpu_threads_[device_id], NULL, launch_cuda_histogram, (void *)(&thread_data[device_id]))){ fprintf(stderr, "Error in creating threads. 
Exiting\n"); @@ -238,7 +237,7 @@ void CUDATreeLearner::WaitAndGetHistograms(hist_t* histograms) { continue; } int dense_group_index = dense_feature_group_map_[i]; - auto old_histogram_array = histograms + train_data_->GroupBinBoundary(dense_group_index); + auto old_histogram_array = histograms + train_data_->GroupBinBoundary(dense_group_index) * 2; int bin_size = train_data_->FeatureGroupNumBin(dense_group_index); for (int j = 0; j < bin_size; ++j) { @@ -480,7 +479,6 @@ void CUDATreeLearner::InitGPU(int num_gpu) { max_num_bin_ = std::max(max_num_bin_, train_data_->FeatureGroupNumBin(i)); } - // GCF XXX: resolving device_bin_size_ and histogram_size_ is the remaining work if (max_num_bin_ <= 16) { device_bin_size_ = 16; //LGBM_CUDA histogram_size_ = 16; diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index a02bc1dd79f..d8d1b626c8f 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -74,7 +74,7 @@ inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ } // skip the counters we already have - p += 3 * NUM_BINS; + p += 2 * NUM_BINS; for (i = i + 1; i < num_sub_hist; ++i) { grad_bin += *p; p += NUM_BINS; @@ -85,9 +85,8 @@ inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ __syncthreads(); - output_buf[ltid * 3 + 0] = grad_bin; - output_buf[ltid * 3 + 1] = hess_bin; - output_buf[ltid * 3 + 2] = as_acc_type((acc_int_type)cont_bin); + output_buf[ltid * 2 + 0] = grad_bin; + output_buf[ltid * 2 + 1] = hess_bin; } #if USE_CONSTANT_BUF == 1 @@ -132,8 +131,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, const ushort lsize = NUM_BINS; // get_local_size(0); const ushort group_id = blockIdx.x; -//if (gtid == 0) printf("Entering the 16-bucket kernel, NUM_BINS = %d, block size = %d\n", NUM_BINS, blockDim.x); - // local memory per workgroup is 3 KB // clear local memory uint *ptr = (uint *) shared_array; @@ -294,7 +291,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #endif #if POWER_FEATURE_WORKGROUPS != 0 - acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 3 * NUM_BINS; + acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 2 * NUM_BINS; // write gradients and hessians acc_type *__restrict__ ptr_f = output; for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { @@ -361,12 +358,12 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // locate our feature's block in output memory uint output_offset = (feature_id << power_feature_workgroups); acc_type const * __restrict__ feature_subhists = - (acc_type *)output_buf + output_offset * 3 * NUM_BINS; + (acc_type *)output_buf + output_offset * 2 * NUM_BINS; // skip reading the data already in local memory //uint skip_id = feature_id ^ output_offset; uint skip_id = group_id - output_offset; // locate output histogram location for this feature4 - acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 3 * NUM_BINS; + acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS; within_kernel_reduction16x4(feature_subhists, skip_id, old_val, 1 << power_feature_workgroups, hist_buf, (acc_type *)shared_array, power_feature_workgroups); @@ -426,7 +423,7 @@ inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ } // skip the counters we already have - p += 3 * NUM_BINS; + p += 2 * NUM_BINS; for (i = i + 1; i < num_sub_hist; ++i) { grad_bin += *p; p += NUM_BINS; @@ -437,9 
+434,8 @@ inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ __syncthreads(); - output_buf[ltid * 3 + 0] = grad_bin; - output_buf[ltid * 3 + 1] = hess_bin; - output_buf[ltid * 3 + 2] = as_acc_type((acc_int_type)cont_bin); + output_buf[ltid * 2 + 0] = grad_bin; + output_buf[ltid * 2 + 1] = hess_bin; } #if USE_CONSTANT_BUF == 1 @@ -484,8 +480,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, const ushort lsize = NUM_BINS; // get_local_size(0); const ushort group_id = blockIdx.x; -//if (gtid == 0) printf("Entering the 64-bucket kernel, NUM_BINS = %d, block size = %d\n", NUM_BINS, blockDim.x); - // local memory per workgroup is 3 KB // clear local memory uint *ptr = (uint *) shared_array; @@ -646,7 +640,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #endif #if POWER_FEATURE_WORKGROUPS != 0 - acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 3 * NUM_BINS; + acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 2 * NUM_BINS; // write gradients and hessians acc_type *__restrict__ ptr_f = output; for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { @@ -713,12 +707,12 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // locate our feature's block in output memory uint output_offset = (feature_id << power_feature_workgroups); acc_type const * __restrict__ feature_subhists = - (acc_type *)output_buf + output_offset * 3 * NUM_BINS; + (acc_type *)output_buf + output_offset * 2 * NUM_BINS; // skip reading the data already in local memory //uint skip_id = feature_id ^ output_offset; uint skip_id = group_id - output_offset; // locate output histogram location for this feature4 - acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 3 * NUM_BINS; + acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS; within_kernel_reduction64x4(feature_subhists, skip_id, old_val, 1 << power_feature_workgroups, hist_buf, (acc_type *)shared_array, power_feature_workgroups); @@ -778,7 +772,7 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ } // skip the counters we already have - p += 3 * NUM_BINS; + p += 2 * NUM_BINS; for (i = i + 1; i < num_sub_hist; ++i) { grad_bin += *p; p += NUM_BINS; @@ -789,9 +783,8 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ __syncthreads(); - output_buf[ltid * 3 + 0] = grad_bin; - output_buf[ltid * 3 + 1] = hess_bin; - output_buf[ltid * 3 + 2] = as_acc_type((acc_int_type)cont_bin); + output_buf[ltid * 2 + 0] = grad_bin; + output_buf[ltid * 2 + 1] = hess_bin; } #if USE_CONSTANT_BUF == 1 @@ -836,8 +829,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, const ushort lsize = NUM_BINS; // get_local_size(0); const ushort group_id = blockIdx.x; -//if (gtid == 0) printf("Entering the 256-bucket kernel, NUM_BINS = %d, block size = %d\n", NUM_BINS, blockDim.x); - // local memory per workgroup is 3 KB // clear local memory uint *ptr = (uint *) shared_array; @@ -998,7 +989,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #endif #if POWER_FEATURE_WORKGROUPS != 0 - acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 3 * NUM_BINS; + acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 2 * NUM_BINS; // write gradients and hessians acc_type *__restrict__ ptr_f = output; for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { @@ -1065,12 +1056,12 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // locate our feature's block in 
output memory uint output_offset = (feature_id << power_feature_workgroups); acc_type const * __restrict__ feature_subhists = - (acc_type *)output_buf + output_offset * 3 * NUM_BINS; + (acc_type *)output_buf + output_offset * 2 * NUM_BINS; // skip reading the data already in local memory //uint skip_id = feature_id ^ output_offset; uint skip_id = group_id - output_offset; // locate output histogram location for this feature4 - acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 3 * NUM_BINS; + acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS; within_kernel_reduction256x4(feature_subhists, skip_id, old_val, 1 << power_feature_workgroups, hist_buf, (acc_type *)shared_array, power_feature_workgroups); From 37a1a61d9ddad4bc547d51fe69d0447cf3d9311e Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Mon, 27 Apr 2020 01:37:01 +0000 Subject: [PATCH 029/119] Initial CUDA work --- src/treelearner/cuda_tree_learner.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 2c7114aed0a..f1e7241a52f 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -237,7 +237,7 @@ void CUDATreeLearner::WaitAndGetHistograms(hist_t* histograms) { continue; } int dense_group_index = dense_feature_group_map_[i]; - auto old_histogram_array = histograms + train_data_->GroupBinBoundary(dense_group_index) * 2; + auto old_histogram_array = histograms + train_data_->GroupBinBoundary(dense_group_index); int bin_size = train_data_->FeatureGroupNumBin(dense_group_index); for (int j = 0; j < bin_size; ++j) { From fcf031cac8870432054ad8e67491bab6e94034a7 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Mon, 27 Apr 2020 02:26:52 +0000 Subject: [PATCH 030/119] Initial CUDA work --- .../kernels/histogram_16_64_256.cu | 45 ++----------------- 1 file changed, 3 insertions(+), 42 deletions(-) diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index d8d1b626c8f..bb8bed2db79 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -45,7 +45,6 @@ inline __device__ void atomic_local_add_f(acc_type *addr, const float val) // we have one sub-histogram of one feature in local memory, and need to read others inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ feature_sub_hist, const uint skip_id, - const uint old_val_cont_bin0, const ushort num_sub_hist, acc_type* __restrict__ output_buf, acc_type* __restrict__ local_hist, @@ -54,14 +53,7 @@ inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ // TODO: try to avoid bank conflict here acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; - uint* __restrict__ local_cnt = (uint *)(local_hist + 2 * NUM_BINS); - uint cont_bin; - if (power_feature_workgroups != 0) { - cont_bin = ltid ? 
local_cnt[ltid] : old_val_cont_bin0; - } else { - cont_bin = local_cnt[ltid]; - } ushort i; if (power_feature_workgroups != 0) { @@ -70,7 +62,6 @@ inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ for (i = 0; i < skip_id; ++i) { grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; - cont_bin += as_acc_int_type(*p); p += NUM_BINS; } // skip the counters we already have @@ -79,12 +70,10 @@ inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ for (i = i + 1; i < num_sub_hist; ++i) { grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; - cont_bin += as_acc_int_type(*p); p += NUM_BINS; } } __syncthreads(); - output_buf[ltid * 2 + 0] = grad_bin; output_buf[ltid * 2 + 1] = hess_bin; } @@ -335,7 +324,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // but currently there is no easy way to access it via OpenCL. uint * counter_val = cnt_hist; // backup the old value - uint old_val = *counter_val; if (ltid == 0) { // all workgroups processing the same feature add this counter *counter_val = atomicAdd(const_cast(sync_counters + feature_id), 1); @@ -353,7 +341,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // only 1 work group, no need to increase counter // the reduction will become a simple copy if (1) { - uint old_val; // dummy #endif // locate our feature's block in output memory uint output_offset = (feature_id << power_feature_workgroups); @@ -366,7 +353,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS; - within_kernel_reduction16x4(feature_subhists, skip_id, old_val, 1 << power_feature_workgroups, hist_buf, (acc_type *)shared_array, power_feature_workgroups); + within_kernel_reduction16x4(feature_subhists, skip_id, 1 << power_feature_workgroups, hist_buf, (acc_type *)shared_array, power_feature_workgroups); } } @@ -394,7 +381,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // we have one sub-histogram of one feature in local memory, and need to read others inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ feature_sub_hist, const uint skip_id, - const uint old_val_cont_bin0, const ushort num_sub_hist, acc_type* __restrict__ output_buf, acc_type* __restrict__ local_hist, @@ -403,14 +389,7 @@ inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ // TODO: try to avoid bank conflict here acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; - uint* __restrict__ local_cnt = (uint *)(local_hist + 2 * NUM_BINS); - uint cont_bin; - if (power_feature_workgroups != 0) { - cont_bin = ltid ? 
local_cnt[ltid] : old_val_cont_bin0; - } else { - cont_bin = local_cnt[ltid]; - } ushort i; if (power_feature_workgroups != 0) { @@ -419,7 +398,6 @@ inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ for (i = 0; i < skip_id; ++i) { grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; - cont_bin += as_acc_int_type(*p); p += NUM_BINS; } // skip the counters we already have @@ -428,12 +406,10 @@ inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ for (i = i + 1; i < num_sub_hist; ++i) { grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; - cont_bin += as_acc_int_type(*p); p += NUM_BINS; } } __syncthreads(); - output_buf[ltid * 2 + 0] = grad_bin; output_buf[ltid * 2 + 1] = hess_bin; } @@ -684,7 +660,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // but currently there is no easy way to access it via OpenCL. uint * counter_val = cnt_hist; // backup the old value - uint old_val = *counter_val; if (ltid == 0) { // all workgroups processing the same feature add this counter *counter_val = atomicAdd(const_cast(sync_counters + feature_id), 1); @@ -702,7 +677,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // only 1 work group, no need to increase counter // the reduction will become a simple copy if (1) { - uint old_val; // dummy #endif // locate our feature's block in output memory uint output_offset = (feature_id << power_feature_workgroups); @@ -715,7 +689,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS; - within_kernel_reduction64x4(feature_subhists, skip_id, old_val, 1 << power_feature_workgroups, hist_buf, (acc_type *)shared_array, power_feature_workgroups); + within_kernel_reduction64x4(feature_subhists, skip_id, 1 << power_feature_workgroups, hist_buf, (acc_type *)shared_array, power_feature_workgroups); } } @@ -743,7 +717,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // we have one sub-histogram of one feature in local memory, and need to read others inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ feature_sub_hist, const uint skip_id, - const uint old_val_cont_bin0, const ushort num_sub_hist, acc_type* __restrict__ output_buf, acc_type* __restrict__ local_hist, @@ -752,14 +725,7 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ // TODO: try to avoid bank conflict here acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; - uint* __restrict__ local_cnt = (uint *)(local_hist + 2 * NUM_BINS); - uint cont_bin; - if (power_feature_workgroups != 0) { - cont_bin = ltid ? 
local_cnt[ltid] : old_val_cont_bin0; - } else { - cont_bin = local_cnt[ltid]; - } ushort i; if (power_feature_workgroups != 0) { @@ -768,7 +734,6 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ for (i = 0; i < skip_id; ++i) { grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; - cont_bin += as_acc_int_type(*p); p += NUM_BINS; } // skip the counters we already have @@ -777,12 +742,10 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ for (i = i + 1; i < num_sub_hist; ++i) { grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; - cont_bin += as_acc_int_type(*p); p += NUM_BINS; } } __syncthreads(); - output_buf[ltid * 2 + 0] = grad_bin; output_buf[ltid * 2 + 1] = hess_bin; } @@ -1033,7 +996,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // but currently there is no easy way to access it via OpenCL. uint * counter_val = cnt_hist; // backup the old value - uint old_val = *counter_val; if (ltid == 0) { // all workgroups processing the same feature add this counter *counter_val = atomicAdd(const_cast(sync_counters + feature_id), 1); @@ -1051,7 +1013,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // only 1 work group, no need to increase counter // the reduction will become a simple copy if (1) { - uint old_val; // dummy #endif // locate our feature's block in output memory uint output_offset = (feature_id << power_feature_workgroups); @@ -1064,7 +1025,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS; - within_kernel_reduction256x4(feature_subhists, skip_id, old_val, 1 << power_feature_workgroups, hist_buf, (acc_type *)shared_array, power_feature_workgroups); + within_kernel_reduction256x4(feature_subhists, skip_id, 1 << power_feature_workgroups, hist_buf, (acc_type *)shared_array, power_feature_workgroups); } } From 0fb433f9a7ce14c8363ccd03a4e55bac1d6a5a46 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Mon, 27 Apr 2020 03:55:04 +0000 Subject: [PATCH 031/119] Initial CUDA work --- .../kernels/histogram_16_64_256.cu | 21 ------------------- 1 file changed, 21 deletions(-) diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index bb8bed2db79..488b82f20f1 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -289,13 +289,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type value = gh_hist[i]; ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; } - // write counts - acc_int_type *__restrict__ ptr_i = (acc_int_type *)(output + 2 * NUM_BINS); - for (ushort i = ltid; i < NUM_BINS; i += lsize) { - // FIXME: 2-way bank conflict - uint value = cnt_hist[i]; - ptr_i[i] = value; - } // FIXME: is this right __syncthreads(); __threadfence(); @@ -625,13 +618,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type value = gh_hist[i]; ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; } - // write counts - acc_int_type *__restrict__ ptr_i = (acc_int_type *)(output + 2 * NUM_BINS); - for (ushort i = ltid; i < NUM_BINS; i += lsize) { - // FIXME: 2-way bank conflict - uint value = cnt_hist[i]; - ptr_i[i] = value; - } // FIXME: is this right __syncthreads(); __threadfence(); @@ -961,13 +947,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type value = gh_hist[i]; ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; } - // write counts - 
acc_int_type *__restrict__ ptr_i = (acc_int_type *)(output + 2 * NUM_BINS); - for (ushort i = ltid; i < NUM_BINS; i += lsize) { - // FIXME: 2-way bank conflict - uint value = cnt_hist[i]; - ptr_i[i] = value; - } // FIXME: is this right __syncthreads(); __threadfence(); From 44819a4a3360968384fd0b55f7de26ccdf2598fe Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Tue, 28 Apr 2020 14:25:32 +0000 Subject: [PATCH 032/119] Initial CUDA work --- src/treelearner/cuda_tree_learner.cpp | 55 ++++++++++++--------------- 1 file changed, 24 insertions(+), 31 deletions(-) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index f1e7241a52f..f9c309394d7 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -84,60 +84,53 @@ void CUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, #if GPU_DEBUG > 0 void PrintHistograms(hist_t* h, size_t size) { - size_t total = 0; + double total_hess = 0; for (size_t i = 0; i < size; ++i) { - printf("%03lu=%9.3g,%9.3g,%7d\t", i, h[i].sum_gradients, h[i].sum_hessians, h[i].cnt); - total += h[i].cnt; - if ((i & 3) == 3) + printf("%03lu=%9.3g,%9.3g\t", i, GET_GRAD(h, i), GET_HESS(h, i)); + if ((i & 2) == 2) printf("\n"); + total_hess += GET_HESS(h, i); } - printf("\nTotal examples: %lu\n", total); + printf("\nSum hessians: %9.3g\n", total_hess); } -union Float_t -{ +union Float_t { int64_t i; double f; static int64_t ulp_diff(Float_t a, Float_t b) { return abs(a.i - b.i); } }; - -void CompareHistograms(HistogramBinEntry* h1, HistogramBinEntry* h2, size_t size, int feature_id) { +void CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id) { size_t i; Float_t a, b; for (i = 0; i < size; ++i) { - a.f = h1[i].sum_gradients; - b.f = h2[i].sum_gradients; + a.f = GET_GRAD(h1, i); + b.f = GET_GRAD(h2, i); int32_t ulps = Float_t::ulp_diff(a, b); - if (fabs(h1[i].cnt - h2[i].cnt != 0)) { - printf("idx: %lu, %d != %d, (diff: %d, err_rate: %f)\n", i, h1[i].cnt, h2[i].cnt, h1[i].cnt - h2[i].cnt, (float)(h1[i].cnt - h2[i].cnt)/h2[i].cnt); - goto err; - } else { - printf("idx: %lu, %d == %d\n", i, h1[i].cnt, h2[i].cnt); - printf("idx: %lu, pass\n", i); - } if (ulps > 0) { - printf("idx: %ld, grad %g != %g\n", i, h1[i].sum_gradients, h2[i].sum_gradients); - //printf("idx: %ld, grad %g != %g (%d ULPs)\n", i, h1[i].sum_gradients, h2[i].sum_gradients, ulps); - goto err; + // printf("grad %g != %g (%d ULPs)\n", GET_GRAD(h1, i), GET_GRAD(h2, i), ulps); + // goto err; } - a.f = h1[i].sum_hessians; - b.f = h2[i].sum_hessians; + a.f = GET_HESS(h1, i); + b.f = GET_HESS(h2, i); ulps = Float_t::ulp_diff(a, b); - if (ulps > 0) { - printf("idx: %ld, hessian %g != %g\n", i, h1[i].sum_hessians, h2[i].sum_hessians); - //printf("idx: %ld, hessian %g != %g (%d ULPs)\n", i, h1[i].sum_hessians, h2[i].sum_hessians, ulps); - // goto err; + if (std::fabs(a.f - b.f) >= 1e-20) { + printf("hessian %g != %g (%d ULPs)\n", GET_HESS(h1, i), GET_HESS(h2, i), ulps); + goto err; } } return; err: Log::Warning("Mismatched histograms found for feature %d at location %lu.", feature_id, i); + std::cin.get(); + PrintHistograms(h1, size); + printf("\n"); + PrintHistograms(h2, size); + std::cin.get(); } - #endif int CUDATreeLearner::GetNumWorkgroupsPerFeature(data_size_t leaf_num_data) { @@ -1037,17 +1030,17 @@ void CUDATreeLearner::FindBestSplits() { #if GPU_DEBUG >= 3 for (int feature_index = 0; feature_index < num_features_; ++feature_index) { - if (!is_feature_used_[feature_index]) continue; + if 
(!col_sampler_.is_feature_used_bytree()[feature_index]) continue; if (parent_leaf_histogram_array_ != nullptr && !parent_leaf_histogram_array_[feature_index].is_splittable()) { smaller_leaf_histogram_array_[feature_index].set_is_splittable(false); continue; } size_t bin_size = train_data_->FeatureNumBin(feature_index) + 1; - printf("CUDATreeLearner::FindBestSplits() Feature %d bin_size=%d smaller leaf:\n", feature_index, bin_size); + printf("CUDATreeLearner::FindBestSplits() Feature %d bin_size=%zd smaller leaf:\n", feature_index, bin_size); PrintHistograms(smaller_leaf_histogram_array_[feature_index].RawData() - 1, bin_size); if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->leaf_index() < 0) { continue; } - printf("CUDATreeLearner::FindBestSplits() Feature %d bin_size=%d larger leaf:\n", feature_index, bin_size); + printf("CUDATreeLearner::FindBestSplits() Feature %d bin_size=%zd larger leaf:\n", feature_index, bin_size); PrintHistograms(larger_leaf_histogram_array_[feature_index].RawData() - 1, bin_size); } From a668c8e822ca41b419b1107c8d396f8070faec29 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Tue, 28 Apr 2020 16:06:19 +0000 Subject: [PATCH 033/119] Initial CUDA work --- src/treelearner/cuda_tree_learner.cpp | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index f9c309394d7..42a45a55fac 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -931,23 +931,28 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ continue; int dense_feature_group_index = dense_feature_group_map_[i]; size_t size = train_data_->FeatureGroupNumBin(dense_feature_group_index); - HistogramBinEntry* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - 1; - HistogramBinEntry* current_histogram = ptr_smaller_leaf_hist_data + train_data_->GroupBinBoundary(dense_feature_group_index); - HistogramBinEntry* gpu_histogram = new HistogramBinEntry[size]; + hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - kHistOffset; + hist_t* current_histogram = ptr_smaller_leaf_hist_data + train_data_->GroupBinBoundary(dense_feature_group_index) * 2; + hist_t* gpu_histogram = new hist_t[size * 2]; data_size_t num_data = smaller_leaf_splits_->num_data_in_leaf(); - - std::copy(current_histogram, current_histogram + size, gpu_histogram); - std::memset(current_histogram, 0, train_data_->FeatureGroupNumBin(dense_feature_group_index) * sizeof(HistogramBinEntry)); + printf("Comparing histogram for feature %d size %d, %lu bins\n", dense_feature_group_index, num_data, size); + std::copy(current_histogram, current_histogram + size * 2, gpu_histogram); + std::memset(current_histogram, 0, size * sizeof(hist_t) * 2); + if (train_data_->FeatureGroupBin(dense_feature_group_index) == nullptr) { + continue; + } if ( num_data == num_data_ ) { if ( is_constant_hessian_ ) { printf("ConstructHistogram(): num_data == num_data_ is_constant_hessian_"); train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( + 0, num_data, gradients_, current_histogram); } else { printf("ConstructHistogram(): num_data == num_data_ "); train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( + 0, num_data, gradients_, hessians_, current_histogram); @@ -957,6 +962,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ printf("ConstructHistogram(): 
is_constant_hessian_"); train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( smaller_leaf_splits_->data_indices(), + 0, num_data, ordered_gradients_.data(), current_histogram); @@ -964,6 +970,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ printf("ConstructHistogram(): 4"); train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( smaller_leaf_splits_->data_indices(), + 0, num_data, ordered_gradients_.data(), ordered_hessians_.data(), current_histogram); @@ -974,7 +981,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ compare = false; } CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index); - std::copy(gpu_histogram, gpu_histogram + size, current_histogram); + std::copy(gpu_histogram, gpu_histogram + size * 2, current_histogram); delete [] gpu_histogram; //break; // LGBM_CUDA: see only first feature info } From 0e8cd92553f0a0f8ceb4bdf07f39be632402f880 Mon Sep 17 00:00:00 2001 From: ChipKerchner Date: Tue, 28 Apr 2020 16:11:59 -0400 Subject: [PATCH 034/119] Initial CUDA work --- src/treelearner/gpu_tree_learner.cpp | 9 +++++---- src/treelearner/gpu_tree_learner.h | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/treelearner/gpu_tree_learner.cpp b/src/treelearner/gpu_tree_learner.cpp index 43ccadfd176..7f336b075d4 100644 --- a/src/treelearner/gpu_tree_learner.cpp +++ b/src/treelearner/gpu_tree_learner.cpp @@ -36,9 +36,9 @@ GPUTreeLearner::~GPUTreeLearner() { } } -void GPUTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) { +void GPUTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) { // initialize SerialTreeLearner - SerialTreeLearner::Init(train_data, is_constant_hessian); + SerialTreeLearner::Init(train_data, is_constant_hessian, is_use_subset); // some additional variables needed for GPU trainer num_feature_groups_ = train_data_->num_feature_groups(); // Initialize GPU buffers and kernels @@ -734,8 +734,9 @@ void GPUTreeLearner::InitGPU(int platform_id, int device_id) { SetupKernelArguments(); } -Tree* GPUTreeLearner::Train(const score_t* gradients, const score_t *hessians) { - return SerialTreeLearner::Train(gradients, hessians); +Tree* GPUTreeLearner::Train(const score_t* gradients, const score_t *hessians, + bool is_constant_hessian, Json& forced_split_json) { + return SerialTreeLearner::Train(gradients, hessians, is_constant_hessian, forced_split_json); } void GPUTreeLearner::ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) { diff --git a/src/treelearner/gpu_tree_learner.h b/src/treelearner/gpu_tree_learner.h index ba48f030441..598e8d40ac9 100644 --- a/src/treelearner/gpu_tree_learner.h +++ b/src/treelearner/gpu_tree_learner.h @@ -45,7 +45,7 @@ class GPUTreeLearner: public SerialTreeLearner { public: explicit GPUTreeLearner(const Config* tree_config); ~GPUTreeLearner(); - void Init(const Dataset* train_data, bool is_constant_hessian) override; + void Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) override; void ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) override; void ResetIsConstantHessian(bool is_constant_hessian) override; Tree* Train(const score_t* gradients, const score_t *hessians, From 0cbe79d70e0fb9f96fa1d1eb33b174237201f904 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Sat, 2 May 2020 02:21:12 +0000 Subject: [PATCH 
035/119] Initial CUDA work --- src/treelearner/cuda_tree_learner.cpp | 88 ++++++++++++++----------- src/treelearner/feature_histogram.hpp | 3 +- src/treelearner/serial_tree_learner.cpp | 6 +- 3 files changed, 54 insertions(+), 43 deletions(-) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 42a45a55fac..a11255eba9e 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -103,33 +103,41 @@ union Float_t { }; -void CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id) { - size_t i; - Float_t a, b; - for (i = 0; i < size; ++i) { - a.f = GET_GRAD(h1, i); - b.f = GET_GRAD(h2, i); - int32_t ulps = Float_t::ulp_diff(a, b); - if (ulps > 0) { - // printf("grad %g != %g (%d ULPs)\n", GET_GRAD(h1, i), GET_GRAD(h2, i), ulps); - // goto err; +void CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int dp_flag) { + int i; + printf("Comparing Histograms, feature_id = %d, size = %d\n", feature_id, (int) size); + if (dp_flag) { // double precision + double a, b; + for (i = 0; i < (int) size; ++i) { + a = GET_GRAD(h1, i); + b = GET_GRAD(h2, i); + if (((std::fabs(a - b))/a) >= 1e-6) { + printf("i = %5d, h1.grad %13.6lf, h2.grad %13.6lf\n", i, a, b); + } + a = GET_HESS(h1, i); + b = (double) GET_HESS(((long long int *) h2), i); // GCF HACK becuse CPU hessians are apparently stored as long long ints + if (((std::fabs(a - b))/a) >= 1e-6) { + printf("i = %5d, h1.hess %13.6lf, h2.hess %13.6lf\n", i, a, b); + } } - a.f = GET_HESS(h1, i); - b.f = GET_HESS(h2, i); - ulps = Float_t::ulp_diff(a, b); - if (std::fabs(a.f - b.f) >= 1e-20) { - printf("hessian %g != %g (%d ULPs)\n", GET_HESS(h1, i), GET_HESS(h2, i), ulps); - goto err; + } + else { // single precision + float a, b; + for (i = 0; i < (int) size; ++i) { + a = GET_GRAD(h1, i); + b = GET_GRAD(h2, i); + if (((std::fabs(a - b))/a) >= 1e-5) { + printf("i = %5d, h1.grad %13.6f, h2.grad %13.6f\n", i, a, b); + } + a = GET_HESS(h1, i); + b = GET_HESS(h2, i); + if (((std::fabs(a - b))/a) >= 1e-5) { + printf("i = %5d, h1.hess %13.6f, h2.hess %13.6f\n", i, a, b); + } } } + printf("DONE Comparing Histograms...\n"); return; -err: - Log::Warning("Mismatched histograms found for feature %d at location %lu.", feature_id, i); - std::cin.get(); - PrintHistograms(h1, size); - printf("\n"); - PrintHistograms(h2, size); - std::cin.get(); } #endif @@ -204,7 +212,8 @@ void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featu size_t output_size = num_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_; size_t host_output_offset = offset_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_; - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync((char*)host_histogram_outputs_ + host_output_offset, device_histogram_outputs_[device_id], output_size, cudaMemcpyDeviceToHost, stream_[device_id])); + //CUDASUCCESS_OR_FATAL(cudaMemcpyAsync((char*)host_histogram_outputs_ + host_output_offset, device_histogram_outputs_[device_id], output_size, cudaMemcpyDeviceToHost, stream_[device_id])); + CUDASUCCESS_OR_FATAL(cudaMemcpy((char*)host_histogram_outputs_ + host_output_offset, device_histogram_outputs_[device_id], output_size, cudaMemcpyDeviceToHost)); CUDASUCCESS_OR_FATAL(cudaEventRecord(histograms_wait_obj_[device_id], stream_[device_id])); } } @@ -230,7 +239,7 @@ void CUDATreeLearner::WaitAndGetHistograms(hist_t* histograms) { continue; } int dense_group_index = dense_feature_group_map_[i]; 
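The `* 2` applied to `GroupBinBoundary()` just below reflects the histogram layout this patch series assumes: each bin occupies two consecutive `hist_t` slots, gradient sum then hessian sum, so a feature group's block starts at `GroupBinBoundary(group) * 2`. A minimal host-side sketch of that indexing; the names `kHistEntriesPerBin`, `GradAt` and `HessAt` are hypothetical and only mirror the arithmetic, they are not part of the patch:

    // Illustrative sketch of the interleaved hist_t layout; names are
    // hypothetical and only mirror the indexing used in WaitAndGetHistograms.
    #include <cstddef>
    #include <vector>

    typedef double hist_t;
    const int kHistEntriesPerBin = 2;  // one gradient slot + one hessian slot per bin

    // Start of a feature group's block, mirroring
    // "histograms + train_data_->GroupBinBoundary(group) * 2".
    inline std::size_t GroupHistOffset(std::size_t group_bin_boundary) {
      return group_bin_boundary * kHistEntriesPerBin;
    }

    inline hist_t GradAt(const std::vector<hist_t>& h, std::size_t base, int bin) {
      return h[base + kHistEntriesPerBin * bin];      // even slot: gradient sum
    }
    inline hist_t HessAt(const std::vector<hist_t>& h, std::size_t base, int bin) {
      return h[base + kHistEntriesPerBin * bin + 1];  // odd slot: hessian sum
    }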
- auto old_histogram_array = histograms + train_data_->GroupBinBoundary(dense_group_index); + auto old_histogram_array = histograms + train_data_->GroupBinBoundary(dense_group_index) * 2; int bin_size = train_data_->FeatureGroupNumBin(dense_group_index); for (int j = 0; j < bin_size; ++j) { @@ -471,6 +480,9 @@ void CUDATreeLearner::InitGPU(int num_gpu) { #endif max_num_bin_ = std::max(max_num_bin_, train_data_->FeatureGroupNumBin(i)); } + #if GPU_DEBUG >= 1 + printf("\n"); + #endif if (max_num_bin_ <= 16) { device_bin_size_ = 16; //LGBM_CUDA @@ -831,7 +843,7 @@ bool CUDATreeLearner::ConstructGPUHistogramsAsync( printf("%d ", feature_masks_[i]); } printf("\n"); - printf("CudaTreeLearner::ConstructGPUHistogramsAsync() %d feature groups, %d used, %d\n", num_dense_feature_groups_, used_dense_feature_groups, use_all_features); + printf("CudaTreeLearner::ConstructGPUHistogramsAsync() %d feature groups, %d used, %d use_all_features\n", num_dense_feature_groups_, used_dense_feature_groups, use_all_features); #endif // if not all feature groups are used, we need to transfer the feature mask to GPU @@ -877,7 +889,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ } // construct smaller leaf - hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - 1; + hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - kHistOffset; // Check workgroups per feature4 tuple.. int exp_workgroups_per_feature = GetNumWorkgroupsPerFeature(smaller_leaf_splits_->num_data_in_leaf()); @@ -924,7 +936,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ // Compare GPU histogram with CPU histogram, useful for debuggin GPU code problem // #define GPU_DEBUG_COMPARE #ifdef GPU_DEBUG_COMPARE - printf("Start Comparing_Histogram between GPU and CPU num_dense_feature_groups_=%d\n",num_dense_feature_groups_); + printf("Start Comparing_Histogram between GPU and CPU, num_dense_feature_groups_ = %d\n",num_dense_feature_groups_); bool compare = true; for (int i = 0; i < num_dense_feature_groups_; ++i) { if (!feature_masks_[i]) @@ -935,7 +947,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ hist_t* current_histogram = ptr_smaller_leaf_hist_data + train_data_->GroupBinBoundary(dense_feature_group_index) * 2; hist_t* gpu_histogram = new hist_t[size * 2]; data_size_t num_data = smaller_leaf_splits_->num_data_in_leaf(); - printf("Comparing histogram for feature %d size %d, %lu bins\n", dense_feature_group_index, num_data, size); + printf("Comparing histogram for feature %d, size %d, %lu bins\n", dense_feature_group_index, num_data, size); std::copy(current_histogram, current_histogram + size * 2, gpu_histogram); std::memset(current_histogram, 0, size * sizeof(hist_t) * 2); if (train_data_->FeatureGroupBin(dense_feature_group_index) == nullptr) { @@ -943,14 +955,14 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ } if ( num_data == num_data_ ) { if ( is_constant_hessian_ ) { - printf("ConstructHistogram(): num_data == num_data_ is_constant_hessian_"); + printf("ConstructHistogram(): num_data == num_data_ is_constant_hessian_\n"); train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( 0, num_data, gradients_, current_histogram); } else { - printf("ConstructHistogram(): num_data == num_data_ "); + printf("ConstructHistogram(): num_data == num_data_\n"); train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( 0, num_data, @@ -959,7 +971,7 @@ void 
CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ } } else { if ( is_constant_hessian_ ) { - printf("ConstructHistogram(): is_constant_hessian_"); + printf("ConstructHistogram(): is_constant_hessian_\n"); train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( smaller_leaf_splits_->data_indices(), 0, @@ -967,7 +979,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ ordered_gradients_.data(), current_histogram); } else { - printf("ConstructHistogram(): 4"); + printf("ConstructHistogram(): 4\n"); train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( smaller_leaf_splits_->data_indices(), 0, @@ -977,10 +989,10 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ } } if ( (num_data != num_data_) && compare ) { - CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index); + CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index, config_->gpu_use_dp); compare = false; } - CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index); + CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index, config_->gpu_use_dp); std::copy(gpu_histogram, gpu_histogram + size * 2, current_histogram); delete [] gpu_histogram; //break; // LGBM_CUDA: see only first feature info @@ -993,7 +1005,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ // construct larger leaf - hist_t* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - 1; + hist_t* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - kHistOffset; is_gpu_used = ConstructGPUHistogramsAsync(is_feature_used, larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf()); @@ -1009,11 +1021,11 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ // ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_, // ptr_larger_leaf_hist_data); train_data_->ConstructHistograms(is_sparse_feature_used, - smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(), + larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(), gradients_, hessians_, ordered_gradients_.data(), ordered_hessians_.data(), share_state_.get(), - ptr_smaller_leaf_hist_data); + ptr_larger_leaf_hist_data); } // wait for GPU to finish, only if GPU is actually used diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp index 8916ee48fd4..bf3d81c53d8 100644 --- a/src/treelearner/feature_histogram.hpp +++ b/src/treelearner/feature_histogram.hpp @@ -1201,7 +1201,8 @@ class HistogramPool { for (int i = old_cache_size; i < cache_size; ++i) { OMP_LOOP_EX_BEGIN(); pool_[i].reset(new FeatureHistogram[train_data->num_features()]); - data_[i].resize(num_total_bin * 2); + //data_[i].resize(num_total_bin * 2); + data_[i].resize(num_total_bin * 3); // GCF HACK to avoid mysterious core dumps for (int j = 0; j < train_data->num_features(); ++j) { pool_[i][j].Init(data_[i].data() + offsets[j] * 2, &feature_metas_[j]); } diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 3c9faa84ff5..e5d2a64ceaf 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -353,8 +353,7 @@ void SerialTreeLearner::ConstructHistograms( Common::FunctionTimer fun_timer("SerialTreeLearner::ConstructHistograms", global_timer); // construct smaller 
leaf - hist_t* ptr_smaller_leaf_hist_data = - smaller_leaf_histogram_array_[0].RawData() - kHistOffset; + hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - kHistOffset; train_data_->ConstructHistograms( is_feature_used, smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(), gradients_, hessians_, @@ -363,8 +362,7 @@ void SerialTreeLearner::ConstructHistograms( if (larger_leaf_histogram_array_ != nullptr && !use_subtract) { // construct larger leaf - hist_t* ptr_larger_leaf_hist_data = - larger_leaf_histogram_array_[0].RawData() - kHistOffset; + hist_t* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - kHistOffset; train_data_->ConstructHistograms( is_feature_used, larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(), gradients_, hessians_, From 252f465fbcb61a5458fe126ede4e0f76be6aaadb Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 4 May 2020 16:44:32 +0000 Subject: [PATCH 036/119] Initial CUDA work --- src/treelearner/cuda_tree_learner.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index a11255eba9e..85e90e402e5 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -877,6 +877,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ #pragma omp parallel for schedule(static) for (int feature_index = 0; feature_index < num_features_; ++feature_index) { + if (!col_sampler_.is_feature_used_bytree()[feature_index]) continue; if (!is_feature_used[feature_index]) continue; if (train_data_->IsMultiGroup(train_data_->Feature2Group(feature_index))) { is_sparse_feature_used[feature_index] = 1; From f7d8fb4400257426a646f1736260cbe412e5992e Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Mon, 4 May 2020 21:24:11 +0000 Subject: [PATCH 037/119] Initial CUDA work --- src/treelearner/cuda_tree_learner.cpp | 88 +++++++++++++------ .../kernels/histogram_16_64_256.cu | 83 ++++++++++++++++- 2 files changed, 140 insertions(+), 31 deletions(-) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 85e90e402e5..405cbc89022 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -102,42 +102,68 @@ union Float_t { } }; - -void CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int dp_flag) { +int CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int dp_flag, int const_flag) { int i; + int retval = 0; printf("Comparing Histograms, feature_id = %d, size = %d\n", feature_id, (int) size); if (dp_flag) { // double precision - double a, b; + double af, bf; + long long int ai, bi; for (i = 0; i < (int) size; ++i) { - a = GET_GRAD(h1, i); - b = GET_GRAD(h2, i); - if (((std::fabs(a - b))/a) >= 1e-6) { - printf("i = %5d, h1.grad %13.6lf, h2.grad %13.6lf\n", i, a, b); + af = GET_GRAD(h1, i); + bf = GET_GRAD(h2, i); + if (((std::fabs(af - bf))/af) >= 1e-6) { + printf("i = %5d, h1.grad %13.6lf, h2.grad %13.6lf\n", i, af, bf); + ++retval; + } + if (const_flag) { + ai = GET_HESS(((long long int *) h1), i); + bi = GET_HESS(((long long int *) h2), i); + if (ai != bi) { + printf("i = %5d, h1.hess %lld, h2.hess %lld\n", i, ai, bi); + ++retval; + } } - a = GET_HESS(h1, i); - b = (double) GET_HESS(((long long int *) h2), i); // GCF HACK becuse CPU hessians are apparently stored as long long ints - if (((std::fabs(a - b))/a) >= 1e-6) { - printf("i = %5d, 
h1.hess %13.6lf, h2.hess %13.6lf\n", i, a, b); + else { + af = GET_HESS(h1, i); + bf = GET_HESS(h2, i); + if (((std::fabs(af - bf))/af) >= 1e-6) { + printf("i = %5d, h1.hess %13.6lf, h2.hess %13.6lf\n", i, af, bf); + ++retval; + } } } } else { // single precision - float a, b; + float af, bf; + int ai, bi; for (i = 0; i < (int) size; ++i) { - a = GET_GRAD(h1, i); - b = GET_GRAD(h2, i); - if (((std::fabs(a - b))/a) >= 1e-5) { - printf("i = %5d, h1.grad %13.6f, h2.grad %13.6f\n", i, a, b); + af = GET_GRAD(h1, i); + bf = GET_GRAD(h2, i); + if (((std::fabs(af - bf))/af) >= 1e-5) { + printf("i = %5d, h1.grad %13.6f, h2.grad %13.6f\n", i, af, bf); + ++retval; + } + if (const_flag) { + ai = GET_HESS(h1, i); + bi = GET_HESS(h2, i); + if (ai != bi) { + printf("i = %5d, h1.hess %d, h2.hess %d\n", i, ai, bi); + ++retval; + } } - a = GET_HESS(h1, i); - b = GET_HESS(h2, i); - if (((std::fabs(a - b))/a) >= 1e-5) { - printf("i = %5d, h1.hess %13.6f, h2.hess %13.6f\n", i, a, b); + else { + af = GET_HESS(h1, i); + bf = GET_HESS(h2, i); + if (((std::fabs(af - bf))/af) >= 1e-5) { + printf("i = %5d, h1.hess %13.6f, h2.hess %13.6f\n", i, af, bf); + ++retval; + } } } } printf("DONE Comparing Histograms...\n"); - return; + return retval; } #endif @@ -948,7 +974,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ hist_t* current_histogram = ptr_smaller_leaf_hist_data + train_data_->GroupBinBoundary(dense_feature_group_index) * 2; hist_t* gpu_histogram = new hist_t[size * 2]; data_size_t num_data = smaller_leaf_splits_->num_data_in_leaf(); - printf("Comparing histogram for feature %d, size %d, %lu bins\n", dense_feature_group_index, num_data, size); + printf("Comparing histogram for feature %d, num_data %d, num_data_ = %d, %lu bins\n", dense_feature_group_index, num_data, num_data_, size); std::copy(current_histogram, current_histogram + size * 2, gpu_histogram); std::memset(current_histogram, 0, size * sizeof(hist_t) * 2); if (train_data_->FeatureGroupBin(dense_feature_group_index) == nullptr) { @@ -980,7 +1006,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ ordered_gradients_.data(), current_histogram); } else { - printf("ConstructHistogram(): 4\n"); + printf("ConstructHistogram(): 4, num_data = %d, num_data_ = %d\n", num_data, num_data_); train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( smaller_leaf_splits_->data_indices(), 0, @@ -989,11 +1015,19 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ current_histogram); } } + int retval; if ( (num_data != num_data_) && compare ) { - CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index, config_->gpu_use_dp); + retval = CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index, config_->gpu_use_dp, is_constant_hessian_); + if (retval < 4) printf("CompareHistograms reports only %d errors\n", retval); compare = false; } - CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index, config_->gpu_use_dp); + retval = CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index, config_->gpu_use_dp, is_constant_hessian_); + if (num_data == num_data_) { + if (retval > 1) printf("CompareHistograms reports %d errors\n", retval); + } + else { + if (retval < 3) printf("CompareHistograms reports only %d errors\n", retval); + } std::copy(gpu_histogram, gpu_histogram + size * 2, current_histogram); delete [] gpu_histogram; //break; // LGBM_CUDA: see only first feature 
info @@ -1058,11 +1092,11 @@ void CUDATreeLearner::FindBestSplits() { } size_t bin_size = train_data_->FeatureNumBin(feature_index) + 1; printf("CUDATreeLearner::FindBestSplits() Feature %d bin_size=%zd smaller leaf:\n", feature_index, bin_size); - PrintHistograms(smaller_leaf_histogram_array_[feature_index].RawData() - 1, bin_size); + PrintHistograms(smaller_leaf_histogram_array_[feature_index].RawData() - kHistOffset, bin_size); if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->leaf_index() < 0) { continue; } printf("CUDATreeLearner::FindBestSplits() Feature %d bin_size=%zd larger leaf:\n", feature_index, bin_size); - PrintHistograms(larger_leaf_histogram_array_[feature_index].RawData() - 1, bin_size); + PrintHistograms(larger_leaf_histogram_array_[feature_index].RawData() - kHistOffset1, bin_size); } #endif } diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index 488b82f20f1..5ea8721cf99 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -45,6 +45,7 @@ inline __device__ void atomic_local_add_f(acc_type *addr, const float val) // we have one sub-histogram of one feature in local memory, and need to read others inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ feature_sub_hist, const uint skip_id, + const uint old_val_cont_bin0, const ushort num_sub_hist, acc_type* __restrict__ output_buf, acc_type* __restrict__ local_hist, @@ -53,7 +54,14 @@ inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ // TODO: try to avoid bank conflict here acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; + uint* __restrict__ local_cnt = (uint *)(local_hist + 2 * NUM_BINS); + uint cont_bin; + if (power_feature_workgroups != 0) { + cont_bin = ltid ? local_cnt[ltid] : old_val_cont_bin0; + } else { + cont_bin = local_cnt[ltid]; + } ushort i; if (power_feature_workgroups != 0) { @@ -62,6 +70,7 @@ inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ for (i = 0; i < skip_id; ++i) { grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; + cont_bin += as_acc_int_type(*p); p += NUM_BINS; } // skip the counters we already have @@ -70,12 +79,19 @@ inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ for (i = i + 1; i < num_sub_hist; ++i) { grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; + cont_bin += as_acc_int_type(*p); p += NUM_BINS; } } __syncthreads(); + output_buf[ltid * 2 + 0] = grad_bin; +#if CONST_HESSIAN == 0 output_buf[ltid * 2 + 1] = hess_bin; +#else + output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); +#endif + output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); } #if USE_CONSTANT_BUF == 1 @@ -289,6 +305,13 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type value = gh_hist[i]; ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; } + // write counts + acc_int_type *__restrict__ ptr_i = (acc_int_type *)(output + 2 * NUM_BINS); + for (ushort i = ltid; i < NUM_BINS; i += lsize) { + // FIXME: 2-way bank conflict + uint value = cnt_hist[i]; + ptr_i[i] = value; + } // FIXME: is this right __syncthreads(); __threadfence(); @@ -317,6 +340,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // but currently there is no easy way to access it via OpenCL. 
uint * counter_val = cnt_hist; // backup the old value + uint old_val = *counter_val; if (ltid == 0) { // all workgroups processing the same feature add this counter *counter_val = atomicAdd(const_cast(sync_counters + feature_id), 1); @@ -334,6 +358,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // only 1 work group, no need to increase counter // the reduction will become a simple copy if (1) { + uint old_val; // dummy #endif // locate our feature's block in output memory uint output_offset = (feature_id << power_feature_workgroups); @@ -346,7 +371,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS; - within_kernel_reduction16x4(feature_subhists, skip_id, 1 << power_feature_workgroups, hist_buf, (acc_type *)shared_array, power_feature_workgroups); + within_kernel_reduction16x4(feature_subhists, skip_id, old_val, 1 << power_feature_workgroups, hist_buf, (acc_type *)shared_array, power_feature_workgroups); } } @@ -374,6 +399,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // we have one sub-histogram of one feature in local memory, and need to read others inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ feature_sub_hist, const uint skip_id, + const uint old_val_cont_bin0, const ushort num_sub_hist, acc_type* __restrict__ output_buf, acc_type* __restrict__ local_hist, @@ -382,7 +408,14 @@ inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ // TODO: try to avoid bank conflict here acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; + uint* __restrict__ local_cnt = (uint *)(local_hist + 2 * NUM_BINS); + uint cont_bin; + if (power_feature_workgroups != 0) { + cont_bin = ltid ? local_cnt[ltid] : old_val_cont_bin0; + } else { + cont_bin = local_cnt[ltid]; + } ushort i; if (power_feature_workgroups != 0) { @@ -391,6 +424,7 @@ inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ for (i = 0; i < skip_id; ++i) { grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; + cont_bin += as_acc_int_type(*p); p += NUM_BINS; } // skip the counters we already have @@ -399,12 +433,19 @@ inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ for (i = i + 1; i < num_sub_hist; ++i) { grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; + cont_bin += as_acc_int_type(*p); p += NUM_BINS; } } __syncthreads(); + output_buf[ltid * 2 + 0] = grad_bin; - output_buf[ltid * 2 + 1] = hess_bin; +#if CONST_HESSIAN == 0 + output_buf[ltid * 2 + 1] = hess_bin; +#else + output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); +#endif + output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); } #if USE_CONSTANT_BUF == 1 @@ -618,6 +659,13 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type value = gh_hist[i]; ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; } + // write counts + acc_int_type *__restrict__ ptr_i = (acc_int_type *)(output + 2 * NUM_BINS); + for (ushort i = ltid; i < NUM_BINS; i += lsize) { + // FIXME: 2-way bank conflict + uint value = cnt_hist[i]; + ptr_i[i] = value; + } // FIXME: is this right __syncthreads(); __threadfence(); @@ -646,6 +694,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // but currently there is no easy way to access it via OpenCL. 
uint * counter_val = cnt_hist; // backup the old value + uint old_val = *counter_val; if (ltid == 0) { // all workgroups processing the same feature add this counter *counter_val = atomicAdd(const_cast(sync_counters + feature_id), 1); @@ -663,6 +712,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // only 1 work group, no need to increase counter // the reduction will become a simple copy if (1) { + uint old_val; // dummy #endif // locate our feature's block in output memory uint output_offset = (feature_id << power_feature_workgroups); @@ -675,7 +725,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS; - within_kernel_reduction64x4(feature_subhists, skip_id, 1 << power_feature_workgroups, hist_buf, (acc_type *)shared_array, power_feature_workgroups); + within_kernel_reduction64x4(feature_subhists, skip_id, old_val, 1 << power_feature_workgroups, hist_buf, (acc_type *)shared_array, power_feature_workgroups); } } @@ -703,6 +753,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // we have one sub-histogram of one feature in local memory, and need to read others inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ feature_sub_hist, const uint skip_id, + const uint old_val_cont_bin0, const ushort num_sub_hist, acc_type* __restrict__ output_buf, acc_type* __restrict__ local_hist, @@ -711,7 +762,14 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ // TODO: try to avoid bank conflict here acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; + uint* __restrict__ local_cnt = (uint *)(local_hist + 2 * NUM_BINS); + uint cont_bin; + if (power_feature_workgroups != 0) { + cont_bin = ltid ? local_cnt[ltid] : old_val_cont_bin0; + } else { + cont_bin = local_cnt[ltid]; + } ushort i; if (power_feature_workgroups != 0) { @@ -720,6 +778,7 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ for (i = 0; i < skip_id; ++i) { grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; + cont_bin += as_acc_int_type(*p); p += NUM_BINS; } // skip the counters we already have @@ -728,12 +787,19 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ for (i = i + 1; i < num_sub_hist; ++i) { grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; + cont_bin += as_acc_int_type(*p); p += NUM_BINS; } } __syncthreads(); + output_buf[ltid * 2 + 0] = grad_bin; +#if CONST_HESSIAN == 0 output_buf[ltid * 2 + 1] = hess_bin; +#else + output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); +#endif + output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); } #if USE_CONSTANT_BUF == 1 @@ -947,6 +1013,13 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type value = gh_hist[i]; ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; } + // write counts + acc_int_type *__restrict__ ptr_i = (acc_int_type *)(output + 2 * NUM_BINS); + for (ushort i = ltid; i < NUM_BINS; i += lsize) { + // FIXME: 2-way bank conflict + uint value = cnt_hist[i]; + ptr_i[i] = value; + } // FIXME: is this right __syncthreads(); __threadfence(); @@ -975,6 +1048,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // but currently there is no easy way to access it via OpenCL. 
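The CONST_HESSIAN branches above bit-cast the per-bin counter into the odd hist_t slot instead of a hessian sum; when every sample has the same hessian, the hessian column can be rebuilt afterwards as count times that constant. A host-side sketch of such a reconstruction for the double-precision (USE_DP_FLOAT) case; the function name, and the assumption that the count was bit-cast from a 64-bit integer, are illustrative rather than taken from the patch:

    // Sketch: turn bit-cast bin counts back into hessian sums when the
    // hessian is constant (CONST_HESSIAN == 1). Double-precision case only.
    #include <cstdint>
    #include <cstring>

    typedef double hist_t;

    void ReconstructConstHessian(hist_t* hist, int num_bins, double const_hessian) {
      for (int bin = 0; bin < num_bins; ++bin) {
        // The device stored the raw count bit-cast into the odd slot, i.e.
        // output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin).
        std::uint64_t cnt_bits;
        std::memcpy(&cnt_bits, &hist[2 * bin + 1], sizeof(cnt_bits));
        hist[2 * bin + 1] = static_cast<double>(cnt_bits) * const_hessian;
      }
    }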
uint * counter_val = cnt_hist; // backup the old value + uint old_val = *counter_val; if (ltid == 0) { // all workgroups processing the same feature add this counter *counter_val = atomicAdd(const_cast(sync_counters + feature_id), 1); @@ -992,6 +1066,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // only 1 work group, no need to increase counter // the reduction will become a simple copy if (1) { + uint old_val; // dummy #endif // locate our feature's block in output memory uint output_offset = (feature_id << power_feature_workgroups); @@ -1004,7 +1079,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS; - within_kernel_reduction256x4(feature_subhists, skip_id, 1 << power_feature_workgroups, hist_buf, (acc_type *)shared_array, power_feature_workgroups); + within_kernel_reduction256x4(feature_subhists, skip_id, old_val, 1 << power_feature_workgroups, hist_buf, (acc_type *)shared_array, power_feature_workgroups); } } From be09b8f64e71db9466b5014eba7430b564196544 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Tue, 5 May 2020 03:55:33 +0000 Subject: [PATCH 038/119] Initial CUDA work --- src/treelearner/cuda_tree_learner.cpp | 4 ++-- .../kernels/histogram_16_64_256.cu | 24 +++++++++---------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 405cbc89022..4c1f57cf998 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -208,7 +208,7 @@ void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featu if (num_workgroups > preallocd_max_num_wg_[device_id]) { preallocd_max_num_wg_.at(device_id) = num_workgroups; CUDASUCCESS_OR_FATAL(cudaFree(device_subhistograms_[device_id])); - cudaMalloc(&(device_subhistograms_[device_id]), (size_t) num_workgroups * dword_features_ * device_bin_size_ * hist_bin_entry_sz_); + cudaMalloc(&(device_subhistograms_[device_id]), (size_t) num_workgroups * dword_features_ * device_bin_size_ * (3 * hist_bin_entry_sz_ / 2)); } //set thread_data SetThreadData(thread_data, device_id, histogram_size_, leaf_num_data, use_all_features, @@ -413,7 +413,7 @@ void CUDATreeLearner::AllocateGPUMemory() { if (!device_subhistograms_[device_id]) { // only initialize once here, as this will not need to change when ResetTrainingData() is called - CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_subhistograms_[device_id]), (size_t) preallocd_max_num_wg_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_)); + CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_subhistograms_[device_id]), (size_t) preallocd_max_num_wg_[device_id] * dword_features_ * device_bin_size_ * (3 * hist_bin_entry_sz_ / 2))); Log::Debug("created device_subhistograms_: %p", device_subhistograms_[device_id]); diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index 5ea8721cf99..02a02fed745 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -74,7 +74,7 @@ inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ } // skip the counters we already have - p += 2 * NUM_BINS; + p += 3 * NUM_BINS; for (i = i + 1; i < num_sub_hist; ++i) { grad_bin += *p; p += NUM_BINS; @@ -91,7 +91,7 @@ inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ #else output_buf[ltid * 2 + 1] = 
as_acc_type((acc_int_type)cont_bin); #endif - output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); + //output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); } #if USE_CONSTANT_BUF == 1 @@ -296,7 +296,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #endif #if POWER_FEATURE_WORKGROUPS != 0 - acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 2 * NUM_BINS; + acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 3 * NUM_BINS; // write gradients and hessians acc_type *__restrict__ ptr_f = output; for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { @@ -363,7 +363,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // locate our feature's block in output memory uint output_offset = (feature_id << power_feature_workgroups); acc_type const * __restrict__ feature_subhists = - (acc_type *)output_buf + output_offset * 2 * NUM_BINS; + (acc_type *)output_buf + output_offset * 3 * NUM_BINS; // skip reading the data already in local memory //uint skip_id = feature_id ^ output_offset; uint skip_id = group_id - output_offset; @@ -428,7 +428,7 @@ inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ } // skip the counters we already have - p += 2 * NUM_BINS; + p += 3 * NUM_BINS; for (i = i + 1; i < num_sub_hist; ++i) { grad_bin += *p; p += NUM_BINS; @@ -445,7 +445,7 @@ inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ #else output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); #endif - output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); +// output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); } #if USE_CONSTANT_BUF == 1 @@ -650,7 +650,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #endif #if POWER_FEATURE_WORKGROUPS != 0 - acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 2 * NUM_BINS; + acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 3 * NUM_BINS; // write gradients and hessians acc_type *__restrict__ ptr_f = output; for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { @@ -717,7 +717,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // locate our feature's block in output memory uint output_offset = (feature_id << power_feature_workgroups); acc_type const * __restrict__ feature_subhists = - (acc_type *)output_buf + output_offset * 2 * NUM_BINS; + (acc_type *)output_buf + output_offset * 3 * NUM_BINS; // skip reading the data already in local memory //uint skip_id = feature_id ^ output_offset; uint skip_id = group_id - output_offset; @@ -782,7 +782,7 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ } // skip the counters we already have - p += 2 * NUM_BINS; + p += 3 * NUM_BINS; for (i = i + 1; i < num_sub_hist; ++i) { grad_bin += *p; p += NUM_BINS; @@ -799,7 +799,7 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ #else output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); #endif - output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); +// output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); } #if USE_CONSTANT_BUF == 1 @@ -1004,7 +1004,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #endif #if POWER_FEATURE_WORKGROUPS != 0 - acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 2 * NUM_BINS; + acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 3 * NUM_BINS; // write gradients and hessians acc_type *__restrict__ 
ptr_f = output; for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { @@ -1071,7 +1071,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // locate our feature's block in output memory uint output_offset = (feature_id << power_feature_workgroups); acc_type const * __restrict__ feature_subhists = - (acc_type *)output_buf + output_offset * 2 * NUM_BINS; + (acc_type *)output_buf + output_offset * 3 * NUM_BINS; // skip reading the data already in local memory //uint skip_id = feature_id ^ output_offset; uint skip_id = group_id - output_offset; From 3d6addd279c7990c0a06b3d18264ee28b6b91754 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Tue, 5 May 2020 15:46:05 +0000 Subject: [PATCH 039/119] Initial CUDA work --- src/treelearner/cuda_tree_learner.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 4c1f57cf998..d84175f3377 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -1115,8 +1115,8 @@ void CUDATreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* righ Log::Fatal("1 Bug in GPU histogram! split %d: %d, smaller_leaf: %d, larger_leaf: %d\n", best_split_info.left_count, best_split_info.right_count, smaller_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf()); } } else { - smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), best_split_info.right_sum_gradient, best_split_info.right_sum_hessian); - larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, best_split_info.left_sum_hessian); + smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), best_split_info.right_sum_gradient, best_split_info.right_sum_hessian, best_split_info.right_output); + larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, best_split_info.left_sum_hessian, best_split_info.left_output); if ((best_split_info.left_count != larger_leaf_splits_->num_data_in_leaf()) || (best_split_info.right_count!= smaller_leaf_splits_->num_data_in_leaf())) { Log::Fatal("2 Bug in GPU histogram! 
split %d: %d, smaller_leaf: %d, larger_leaf: %d\n", best_split_info.left_count, best_split_info.right_count, smaller_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf()); From 0025bed6b90de37e5d1c6b5b60911c7c5309a8fb Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Fri, 8 May 2020 00:36:07 +0000 Subject: [PATCH 040/119] Initial CUDA work --- src/treelearner/cuda_tree_learner.cpp | 8 +- .../kernels/histogram_16_64_256.cu | 105 ++++-------------- 2 files changed, 28 insertions(+), 85 deletions(-) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index d84175f3377..7e2d7418a2b 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -208,7 +208,7 @@ void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featu if (num_workgroups > preallocd_max_num_wg_[device_id]) { preallocd_max_num_wg_.at(device_id) = num_workgroups; CUDASUCCESS_OR_FATAL(cudaFree(device_subhistograms_[device_id])); - cudaMalloc(&(device_subhistograms_[device_id]), (size_t) num_workgroups * dword_features_ * device_bin_size_ * (3 * hist_bin_entry_sz_ / 2)); + cudaMalloc(&(device_subhistograms_[device_id]), (size_t) num_workgroups * dword_features_ * device_bin_size_ * hist_bin_entry_sz_); } //set thread_data SetThreadData(thread_data, device_id, histogram_size_, leaf_num_data, use_all_features, @@ -413,7 +413,7 @@ void CUDATreeLearner::AllocateGPUMemory() { if (!device_subhistograms_[device_id]) { // only initialize once here, as this will not need to change when ResetTrainingData() is called - CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_subhistograms_[device_id]), (size_t) preallocd_max_num_wg_[device_id] * dword_features_ * device_bin_size_ * (3 * hist_bin_entry_sz_ / 2))); + CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_subhistograms_[device_id]), (size_t) preallocd_max_num_wg_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_)); Log::Debug("created device_subhistograms_: %p", device_subhistograms_[device_id]); @@ -1003,7 +1003,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ smaller_leaf_splits_->data_indices(), 0, num_data, - ordered_gradients_.data(), + gradients_, current_histogram); } else { printf("ConstructHistogram(): 4, num_data = %d, num_data_ = %d\n", num_data, num_data_); @@ -1011,7 +1011,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ smaller_leaf_splits_->data_indices(), 0, num_data, - ordered_gradients_.data(), ordered_hessians_.data(), + gradients_, hessians_, current_histogram); } } diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index 02a02fed745..e0ac3abfc2d 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -54,14 +54,6 @@ inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ // TODO: try to avoid bank conflict here acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; - uint* __restrict__ local_cnt = (uint *)(local_hist + 2 * NUM_BINS); - - uint cont_bin; - if (power_feature_workgroups != 0) { - cont_bin = ltid ? 
local_cnt[ltid] : old_val_cont_bin0; - } else { - cont_bin = local_cnt[ltid]; - } ushort i; if (power_feature_workgroups != 0) { @@ -70,28 +62,20 @@ inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ for (i = 0; i < skip_id; ++i) { grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; - cont_bin += as_acc_int_type(*p); p += NUM_BINS; } // skip the counters we already have - p += 3 * NUM_BINS; + p += 2 * NUM_BINS; for (i = i + 1; i < num_sub_hist; ++i) { grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; - cont_bin += as_acc_int_type(*p); p += NUM_BINS; } } __syncthreads(); - output_buf[ltid * 2 + 0] = grad_bin; -#if CONST_HESSIAN == 0 output_buf[ltid * 2 + 1] = hess_bin; -#else - output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); -#endif - //output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); } #if USE_CONSTANT_BUF == 1 @@ -152,7 +136,9 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // counter histogram // total size: 256 * size_of(uint) = 1 KB + #if CONST_HESSIAN == 1 uint *cnt_hist = (uint *)(gh_hist + 2 * NUM_BINS); + #endif // odd threads (1, 3, ...) compute histograms for hessians first // even thread (0, 2, ...) compute histograms for gradients first @@ -259,8 +245,10 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // prefetch the next iteration variables feature_next = feature_data[ind_next >> feature_mask]; + #if CONST_HESSIAN == 1 // STAGE 3: accumulate counter atomicAdd(cnt_hist + feature, 1); + #endif // STAGE 4: update next stat grad = grad_next; @@ -296,7 +284,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #endif #if POWER_FEATURE_WORKGROUPS != 0 - acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 3 * NUM_BINS; + acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 2 * NUM_BINS; // write gradients and hessians acc_type *__restrict__ ptr_f = output; for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { @@ -305,13 +293,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type value = gh_hist[i]; ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; } - // write counts - acc_int_type *__restrict__ ptr_i = (acc_int_type *)(output + 2 * NUM_BINS); - for (ushort i = ltid; i < NUM_BINS; i += lsize) { - // FIXME: 2-way bank conflict - uint value = cnt_hist[i]; - ptr_i[i] = value; - } // FIXME: is this right __syncthreads(); __threadfence(); @@ -338,7 +319,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // The is done by using an global atomic counter. // On AMD GPUs ideally this should be done in GDS, // but currently there is no easy way to access it via OpenCL. 
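// Note on the CONST_HESSIAN split above: when the hessian is constant, the
// per-bin hessian sum is fully determined by the per-bin data count
// (sum_hess = count * const_hessian), so the dedicated count histogram and the
// integer count bit-cast into the hessian slot are only needed in that case;
// with a real hessian the second accumulator already carries the information.
// A hedged host-side sketch of how such a count slot could later be rescaled
// back into hessian sums (the double/int64 layout and this free function are
// illustrative, not the actual learner code):
#include <cstdint>
#include <cstring>
static void rescale_const_hessian_example(double* hist,  // [2 * num_bins] as (grad, hess-or-count) pairs
                                          int num_bins,
                                          double const_hessian) {
  for (int b = 0; b < num_bins; ++b) {
    std::int64_t cnt;
    std::memcpy(&cnt, &hist[2 * b + 1], sizeof(cnt));  // slot holds a bit-cast integer count
    hist[2 * b + 1] = static_cast<double>(cnt) * const_hessian;
  }
}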
- uint * counter_val = cnt_hist; + uint * counter_val = (uint *)(gh_hist + 2 * NUM_BINS); // backup the old value uint old_val = *counter_val; if (ltid == 0) { @@ -363,7 +344,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // locate our feature's block in output memory uint output_offset = (feature_id << power_feature_workgroups); acc_type const * __restrict__ feature_subhists = - (acc_type *)output_buf + output_offset * 3 * NUM_BINS; + (acc_type *)output_buf + output_offset * 2 * NUM_BINS; // skip reading the data already in local memory //uint skip_id = feature_id ^ output_offset; uint skip_id = group_id - output_offset; @@ -408,14 +389,6 @@ inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ // TODO: try to avoid bank conflict here acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; - uint* __restrict__ local_cnt = (uint *)(local_hist + 2 * NUM_BINS); - - uint cont_bin; - if (power_feature_workgroups != 0) { - cont_bin = ltid ? local_cnt[ltid] : old_val_cont_bin0; - } else { - cont_bin = local_cnt[ltid]; - } ushort i; if (power_feature_workgroups != 0) { @@ -424,28 +397,20 @@ inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ for (i = 0; i < skip_id; ++i) { grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; - cont_bin += as_acc_int_type(*p); p += NUM_BINS; } // skip the counters we already have - p += 3 * NUM_BINS; + p += 2 * NUM_BINS; for (i = i + 1; i < num_sub_hist; ++i) { grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; - cont_bin += as_acc_int_type(*p); p += NUM_BINS; } } __syncthreads(); - output_buf[ltid * 2 + 0] = grad_bin; -#if CONST_HESSIAN == 0 output_buf[ltid * 2 + 1] = hess_bin; -#else - output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); -#endif -// output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); } #if USE_CONSTANT_BUF == 1 @@ -506,7 +471,9 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // counter histogram // total size: 256 * size_of(uint) = 1 KB + #if CONST_HESSIAN == 1 uint *cnt_hist = (uint *)(gh_hist + 2 * NUM_BINS); + #endif // odd threads (1, 3, ...) compute histograms for hessians first // even thread (0, 2, ...) 
compute histograms for gradients first @@ -613,8 +580,10 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // prefetch the next iteration variables feature_next = feature_data[ind_next >> feature_mask]; + #if CONST_HESSIAN == 1 // STAGE 3: accumulate counter atomicAdd(cnt_hist + feature, 1); + #endif // STAGE 4: update next stat grad = grad_next; @@ -650,7 +619,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #endif #if POWER_FEATURE_WORKGROUPS != 0 - acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 3 * NUM_BINS; + acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 2 * NUM_BINS; // write gradients and hessians acc_type *__restrict__ ptr_f = output; for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { @@ -659,13 +628,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type value = gh_hist[i]; ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; } - // write counts - acc_int_type *__restrict__ ptr_i = (acc_int_type *)(output + 2 * NUM_BINS); - for (ushort i = ltid; i < NUM_BINS; i += lsize) { - // FIXME: 2-way bank conflict - uint value = cnt_hist[i]; - ptr_i[i] = value; - } // FIXME: is this right __syncthreads(); __threadfence(); @@ -692,7 +654,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // The is done by using an global atomic counter. // On AMD GPUs ideally this should be done in GDS, // but currently there is no easy way to access it via OpenCL. - uint * counter_val = cnt_hist; + uint * counter_val = (uint *)(gh_hist + 2 * NUM_BINS); // backup the old value uint old_val = *counter_val; if (ltid == 0) { @@ -717,7 +679,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // locate our feature's block in output memory uint output_offset = (feature_id << power_feature_workgroups); acc_type const * __restrict__ feature_subhists = - (acc_type *)output_buf + output_offset * 3 * NUM_BINS; + (acc_type *)output_buf + output_offset * 2 * NUM_BINS; // skip reading the data already in local memory //uint skip_id = feature_id ^ output_offset; uint skip_id = group_id - output_offset; @@ -762,14 +724,6 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ // TODO: try to avoid bank conflict here acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; - uint* __restrict__ local_cnt = (uint *)(local_hist + 2 * NUM_BINS); - - uint cont_bin; - if (power_feature_workgroups != 0) { - cont_bin = ltid ? 
local_cnt[ltid] : old_val_cont_bin0; - } else { - cont_bin = local_cnt[ltid]; - } ushort i; if (power_feature_workgroups != 0) { @@ -778,28 +732,20 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ for (i = 0; i < skip_id; ++i) { grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; - cont_bin += as_acc_int_type(*p); p += NUM_BINS; } // skip the counters we already have - p += 3 * NUM_BINS; + p += 2 * NUM_BINS; for (i = i + 1; i < num_sub_hist; ++i) { grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; - cont_bin += as_acc_int_type(*p); p += NUM_BINS; } } __syncthreads(); - output_buf[ltid * 2 + 0] = grad_bin; -#if CONST_HESSIAN == 0 output_buf[ltid * 2 + 1] = hess_bin; -#else - output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); -#endif -// output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); } #if USE_CONSTANT_BUF == 1 @@ -860,7 +806,9 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // counter histogram // total size: 256 * size_of(uint) = 1 KB + #if CONST_HESSIAN == 1 uint *cnt_hist = (uint *)(gh_hist + 2 * NUM_BINS); + #endif // odd threads (1, 3, ...) compute histograms for hessians first // even thread (0, 2, ...) compute histograms for gradients first @@ -967,8 +915,10 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // prefetch the next iteration variables feature_next = feature_data[ind_next >> feature_mask]; + #if CONST_HESSIAN == 1 // STAGE 3: accumulate counter atomicAdd(cnt_hist + feature, 1); + #endif // STAGE 4: update next stat grad = grad_next; @@ -1004,7 +954,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #endif #if POWER_FEATURE_WORKGROUPS != 0 - acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 3 * NUM_BINS; + acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 2 * NUM_BINS; // write gradients and hessians acc_type *__restrict__ ptr_f = output; for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { @@ -1013,13 +963,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type value = gh_hist[i]; ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; } - // write counts - acc_int_type *__restrict__ ptr_i = (acc_int_type *)(output + 2 * NUM_BINS); - for (ushort i = ltid; i < NUM_BINS; i += lsize) { - // FIXME: 2-way bank conflict - uint value = cnt_hist[i]; - ptr_i[i] = value; - } // FIXME: is this right __syncthreads(); __threadfence(); @@ -1046,7 +989,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // The is done by using an global atomic counter. // On AMD GPUs ideally this should be done in GDS, // but currently there is no easy way to access it via OpenCL. 
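// Note on buffer sizing: each workgroup's slot in output_buf is 2 * NUM_BINS
// accumulators in this revision (3 * NUM_BINS in the revisions that also spill
// per-bin counts), and the device_subhistograms_ allocation on the host has to
// match, which is where the hist_bin_entry_sz_ versus 3 * hist_bin_entry_sz_ / 2
// factor in cuda_tree_learner.cpp comes from (hist_bin_entry_sz_ appears to
// cover the gradient + hessian pair per bin). A sketch of that size computation
// (a free function for illustration, not the learner's member):
#include <cstddef>
static std::size_t subhistogram_bytes_example(std::size_t num_workgroups,
                                              std::size_t dword_features,
                                              std::size_t device_bin_size,
                                              std::size_t hist_bin_entry_sz,
                                              bool counts_spilled) {
  const std::size_t per_bin_bytes = counts_spilled ? (3 * hist_bin_entry_sz / 2)  // grad + hess + count
                                                   : hist_bin_entry_sz;           // grad + hess only
  return num_workgroups * dword_features * device_bin_size * per_bin_bytes;
}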
- uint * counter_val = cnt_hist; + uint * counter_val = (uint *)(gh_hist + 2 * NUM_BINS); // backup the old value uint old_val = *counter_val; if (ltid == 0) { @@ -1071,7 +1014,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // locate our feature's block in output memory uint output_offset = (feature_id << power_feature_workgroups); acc_type const * __restrict__ feature_subhists = - (acc_type *)output_buf + output_offset * 3 * NUM_BINS; + (acc_type *)output_buf + output_offset * 2 * NUM_BINS; // skip reading the data already in local memory //uint skip_id = feature_id ^ output_offset; uint skip_id = group_id - output_offset; From b70604d5395dfc3472b394caeec17d892aaaa536 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Fri, 8 May 2020 01:31:18 +0000 Subject: [PATCH 041/119] Initial CUDA work --- src/treelearner/cuda_tree_learner.cpp | 48 ++------------------------- 1 file changed, 3 insertions(+), 45 deletions(-) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 7e2d7418a2b..e12b15075d0 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -102,9 +102,8 @@ union Float_t { } }; -int CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int dp_flag, int const_flag) { +void CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int dp_flag, int const_flag) { int i; - int retval = 0; printf("Comparing Histograms, feature_id = %d, size = %d\n", feature_id, (int) size); if (dp_flag) { // double precision double af, bf; @@ -114,14 +113,12 @@ int CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int d bf = GET_GRAD(h2, i); if (((std::fabs(af - bf))/af) >= 1e-6) { printf("i = %5d, h1.grad %13.6lf, h2.grad %13.6lf\n", i, af, bf); - ++retval; } if (const_flag) { ai = GET_HESS(((long long int *) h1), i); bi = GET_HESS(((long long int *) h2), i); if (ai != bi) { printf("i = %5d, h1.hess %lld, h2.hess %lld\n", i, ai, bi); - ++retval; } } else { @@ -129,7 +126,6 @@ int CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int d bf = GET_HESS(h2, i); if (((std::fabs(af - bf))/af) >= 1e-6) { printf("i = %5d, h1.hess %13.6lf, h2.hess %13.6lf\n", i, af, bf); - ++retval; } } } @@ -142,14 +138,12 @@ int CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int d bf = GET_GRAD(h2, i); if (((std::fabs(af - bf))/af) >= 1e-5) { printf("i = %5d, h1.grad %13.6f, h2.grad %13.6f\n", i, af, bf); - ++retval; } if (const_flag) { ai = GET_HESS(h1, i); bi = GET_HESS(h2, i); if (ai != bi) { printf("i = %5d, h1.hess %d, h2.hess %d\n", i, ai, bi); - ++retval; } } else { @@ -157,13 +151,11 @@ int CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int d bf = GET_HESS(h2, i); if (((std::fabs(af - bf))/af) >= 1e-5) { printf("i = %5d, h1.hess %13.6f, h2.hess %13.6f\n", i, af, bf); - ++retval; } } } } printf("DONE Comparing Histograms...\n"); - return retval; } #endif @@ -962,9 +954,8 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ // Compare GPU histogram with CPU histogram, useful for debuggin GPU code problem // #define GPU_DEBUG_COMPARE -#ifdef GPU_DEBUG_COMPARE + #ifdef GPU_DEBUG_COMPARE printf("Start Comparing_Histogram between GPU and CPU, num_dense_feature_groups_ = %d\n",num_dense_feature_groups_); - bool compare = true; for (int i = 0; i < num_dense_feature_groups_; ++i) { if (!feature_masks_[i]) continue; @@ -981,31 +972,13 @@ void CUDATreeLearner::ConstructHistograms(const 
std::vector& is_feature_ continue; } if ( num_data == num_data_ ) { - if ( is_constant_hessian_ ) { - printf("ConstructHistogram(): num_data == num_data_ is_constant_hessian_\n"); - train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( - 0, - num_data, - gradients_, - current_histogram); - } else { printf("ConstructHistogram(): num_data == num_data_\n"); train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( 0, num_data, gradients_, hessians_, current_histogram); - } } else { - if ( is_constant_hessian_ ) { - printf("ConstructHistogram(): is_constant_hessian_\n"); - train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( - smaller_leaf_splits_->data_indices(), - 0, - num_data, - gradients_, - current_histogram); - } else { printf("ConstructHistogram(): 4, num_data = %d, num_data_ = %d\n", num_data, num_data_); train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( smaller_leaf_splits_->data_indices(), @@ -1013,27 +986,12 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ num_data, gradients_, hessians_, current_histogram); - } - } - int retval; - if ( (num_data != num_data_) && compare ) { - retval = CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index, config_->gpu_use_dp, is_constant_hessian_); - if (retval < 4) printf("CompareHistograms reports only %d errors\n", retval); - compare = false; - } - retval = CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index, config_->gpu_use_dp, is_constant_hessian_); - if (num_data == num_data_) { - if (retval > 1) printf("CompareHistograms reports %d errors\n", retval); - } - else { - if (retval < 3) printf("CompareHistograms reports only %d errors\n", retval); } + CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index, config_->gpu_use_dp, is_constant_hessian_); std::copy(gpu_histogram, gpu_histogram + size * 2, current_histogram); delete [] gpu_histogram; - //break; // LGBM_CUDA: see only first feature info } printf("End Comparing Histogram between GPU and CPU\n"); -// #endif #endif if (larger_leaf_histogram_array_ != nullptr && !use_subtract) { From d92739df3052b3df6bedf66940575e87afb739ac Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Mon, 11 May 2020 18:25:59 +0000 Subject: [PATCH 042/119] Initial CUDA work --- src/treelearner/cuda_tree_learner.cpp | 52 +++++++- .../kernels/histogram_16_64_256.cu | 117 +++++++++++++----- 2 files changed, 134 insertions(+), 35 deletions(-) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index e12b15075d0..53eec14fbc2 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -102,8 +102,9 @@ union Float_t { } }; -void CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int dp_flag, int const_flag) { +int CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int dp_flag, int const_flag) { int i; + int retval = 0; printf("Comparing Histograms, feature_id = %d, size = %d\n", feature_id, (int) size); if (dp_flag) { // double precision double af, bf; @@ -113,12 +114,14 @@ void CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int bf = GET_GRAD(h2, i); if (((std::fabs(af - bf))/af) >= 1e-6) { printf("i = %5d, h1.grad %13.6lf, h2.grad %13.6lf\n", i, af, bf); + ++retval; } if (const_flag) { ai = GET_HESS(((long long int *) h1), i); bi = GET_HESS(((long long int *) h2), i); if (ai 
!= bi) { printf("i = %5d, h1.hess %lld, h2.hess %lld\n", i, ai, bi); + ++retval; } } else { @@ -126,6 +129,7 @@ void CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int bf = GET_HESS(h2, i); if (((std::fabs(af - bf))/af) >= 1e-6) { printf("i = %5d, h1.hess %13.6lf, h2.hess %13.6lf\n", i, af, bf); + ++retval; } } } @@ -138,12 +142,14 @@ void CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int bf = GET_GRAD(h2, i); if (((std::fabs(af - bf))/af) >= 1e-5) { printf("i = %5d, h1.grad %13.6f, h2.grad %13.6f\n", i, af, bf); + ++retval; } if (const_flag) { ai = GET_HESS(h1, i); bi = GET_HESS(h2, i); if (ai != bi) { printf("i = %5d, h1.hess %d, h2.hess %d\n", i, ai, bi); + ++retval; } } else { @@ -151,11 +157,13 @@ void CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int bf = GET_HESS(h2, i); if (((std::fabs(af - bf))/af) >= 1e-5) { printf("i = %5d, h1.hess %13.6f, h2.hess %13.6f\n", i, af, bf); + ++retval; } } } } printf("DONE Comparing Histograms...\n"); + return retval; } #endif @@ -200,7 +208,7 @@ void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featu if (num_workgroups > preallocd_max_num_wg_[device_id]) { preallocd_max_num_wg_.at(device_id) = num_workgroups; CUDASUCCESS_OR_FATAL(cudaFree(device_subhistograms_[device_id])); - cudaMalloc(&(device_subhistograms_[device_id]), (size_t) num_workgroups * dword_features_ * device_bin_size_ * hist_bin_entry_sz_); + cudaMalloc(&(device_subhistograms_[device_id]), (size_t) num_workgroups * dword_features_ * device_bin_size_ * (3 * hist_bin_entry_sz_ / 2)); } //set thread_data SetThreadData(thread_data, device_id, histogram_size_, leaf_num_data, use_all_features, @@ -405,7 +413,7 @@ void CUDATreeLearner::AllocateGPUMemory() { if (!device_subhistograms_[device_id]) { // only initialize once here, as this will not need to change when ResetTrainingData() is called - CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_subhistograms_[device_id]), (size_t) preallocd_max_num_wg_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_)); + CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_subhistograms_[device_id]), (size_t) preallocd_max_num_wg_[device_id] * dword_features_ * device_bin_size_ * (3 * hist_bin_entry_sz_ / 2))); Log::Debug("created device_subhistograms_: %p", device_subhistograms_[device_id]); @@ -954,8 +962,9 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ // Compare GPU histogram with CPU histogram, useful for debuggin GPU code problem // #define GPU_DEBUG_COMPARE - #ifdef GPU_DEBUG_COMPARE +#ifdef GPU_DEBUG_COMPARE printf("Start Comparing_Histogram between GPU and CPU, num_dense_feature_groups_ = %d\n",num_dense_feature_groups_); + bool compare = true; for (int i = 0; i < num_dense_feature_groups_; ++i) { if (!feature_masks_[i]) continue; @@ -972,13 +981,31 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ continue; } if ( num_data == num_data_ ) { + if ( is_constant_hessian_ ) { + printf("ConstructHistogram(): num_data == num_data_ is_constant_hessian_\n"); + train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( + 0, + num_data, + gradients_, + current_histogram); + } else { printf("ConstructHistogram(): num_data == num_data_\n"); train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( 0, num_data, gradients_, hessians_, current_histogram); + } } else { + if ( is_constant_hessian_ ) { + printf("ConstructHistogram(): is_constant_hessian_\n"); + 
train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( + smaller_leaf_splits_->data_indices(), + 0, + num_data, + gradients_, + current_histogram); + } else { printf("ConstructHistogram(): 4, num_data = %d, num_data_ = %d\n", num_data, num_data_); train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( smaller_leaf_splits_->data_indices(), @@ -986,12 +1013,27 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ num_data, gradients_, hessians_, current_histogram); + } + } + int retval; + if ( (num_data != num_data_) && compare ) { + retval = CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index, config_->gpu_use_dp, is_constant_hessian_); + printf("CompareHistograms reports %d errors\n", retval); + compare = false; + } + retval = CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index, config_->gpu_use_dp, is_constant_hessian_); + if (num_data == num_data_) { + printf("CompareHistograms reports %d errors\n", retval); + } + else { + printf("CompareHistograms reports %d errors\n", retval); } - CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index, config_->gpu_use_dp, is_constant_hessian_); std::copy(gpu_histogram, gpu_histogram + size * 2, current_histogram); delete [] gpu_histogram; + //break; // LGBM_CUDA: see only first feature info } printf("End Comparing Histogram between GPU and CPU\n"); +// #endif #endif if (larger_leaf_histogram_array_ != nullptr && !use_subtract) { diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index e0ac3abfc2d..3c194a22ddf 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -54,6 +54,14 @@ inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ // TODO: try to avoid bank conflict here acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; + uint* __restrict__ local_cnt = (uint *)(local_hist + 2 * NUM_BINS); + + uint cont_bin; + if (power_feature_workgroups != 0) { + cont_bin = ltid ? local_cnt[ltid] : old_val_cont_bin0; + } else { + cont_bin = local_cnt[ltid]; + } ushort i; if (power_feature_workgroups != 0) { @@ -62,20 +70,28 @@ inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ for (i = 0; i < skip_id; ++i) { grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; + cont_bin += as_acc_int_type(*p); p += NUM_BINS; } // skip the counters we already have - p += 2 * NUM_BINS; + p += 3 * NUM_BINS; for (i = i + 1; i < num_sub_hist; ++i) { grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; + cont_bin += as_acc_int_type(*p); p += NUM_BINS; } } __syncthreads(); + output_buf[ltid * 2 + 0] = grad_bin; +#if CONST_HESSIAN == 0 output_buf[ltid * 2 + 1] = hess_bin; +#else + output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); +#endif + //output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); } #if USE_CONSTANT_BUF == 1 @@ -136,9 +152,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // counter histogram // total size: 256 * size_of(uint) = 1 KB - #if CONST_HESSIAN == 1 uint *cnt_hist = (uint *)(gh_hist + 2 * NUM_BINS); - #endif // odd threads (1, 3, ...) compute histograms for hessians first // even thread (0, 2, ...) 
compute histograms for gradients first @@ -195,9 +209,9 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, score_t grad, hess; score_t grad_next, hess_next; // LGBM_CUDA v5.1 - grad = ordered_gradients[ind]; + grad = ordered_gradients[subglobal_tid]; #if CONST_HESSIAN == 0 - hess = ordered_hessians[ind]; + hess = ordered_hessians[subglobal_tid]; #endif @@ -245,10 +259,8 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // prefetch the next iteration variables feature_next = feature_data[ind_next >> feature_mask]; - #if CONST_HESSIAN == 1 // STAGE 3: accumulate counter atomicAdd(cnt_hist + feature, 1); - #endif // STAGE 4: update next stat grad = grad_next; @@ -284,7 +296,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #endif #if POWER_FEATURE_WORKGROUPS != 0 - acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 2 * NUM_BINS; + acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 3 * NUM_BINS; // write gradients and hessians acc_type *__restrict__ ptr_f = output; for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { @@ -293,6 +305,13 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type value = gh_hist[i]; ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; } + // write counts + acc_int_type *__restrict__ ptr_i = (acc_int_type *)(output + 2 * NUM_BINS); + for (ushort i = ltid; i < NUM_BINS; i += lsize) { + // FIXME: 2-way bank conflict + uint value = cnt_hist[i]; + ptr_i[i] = value; + } // FIXME: is this right __syncthreads(); __threadfence(); @@ -319,7 +338,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // The is done by using an global atomic counter. // On AMD GPUs ideally this should be done in GDS, // but currently there is no easy way to access it via OpenCL. - uint * counter_val = (uint *)(gh_hist + 2 * NUM_BINS); + uint * counter_val = cnt_hist; // backup the old value uint old_val = *counter_val; if (ltid == 0) { @@ -344,7 +363,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // locate our feature's block in output memory uint output_offset = (feature_id << power_feature_workgroups); acc_type const * __restrict__ feature_subhists = - (acc_type *)output_buf + output_offset * 2 * NUM_BINS; + (acc_type *)output_buf + output_offset * 3 * NUM_BINS; // skip reading the data already in local memory //uint skip_id = feature_id ^ output_offset; uint skip_id = group_id - output_offset; @@ -389,6 +408,14 @@ inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ // TODO: try to avoid bank conflict here acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; + uint* __restrict__ local_cnt = (uint *)(local_hist + 2 * NUM_BINS); + + uint cont_bin; + if (power_feature_workgroups != 0) { + cont_bin = ltid ? 
local_cnt[ltid] : old_val_cont_bin0; + } else { + cont_bin = local_cnt[ltid]; + } ushort i; if (power_feature_workgroups != 0) { @@ -397,20 +424,28 @@ inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ for (i = 0; i < skip_id; ++i) { grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; + cont_bin += as_acc_int_type(*p); p += NUM_BINS; } // skip the counters we already have - p += 2 * NUM_BINS; + p += 3 * NUM_BINS; for (i = i + 1; i < num_sub_hist; ++i) { grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; + cont_bin += as_acc_int_type(*p); p += NUM_BINS; } } __syncthreads(); + output_buf[ltid * 2 + 0] = grad_bin; +#if CONST_HESSIAN == 0 output_buf[ltid * 2 + 1] = hess_bin; +#else + output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); +#endif +// output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); } #if USE_CONSTANT_BUF == 1 @@ -471,9 +506,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // counter histogram // total size: 256 * size_of(uint) = 1 KB - #if CONST_HESSIAN == 1 uint *cnt_hist = (uint *)(gh_hist + 2 * NUM_BINS); - #endif // odd threads (1, 3, ...) compute histograms for hessians first // even thread (0, 2, ...) compute histograms for gradients first @@ -530,9 +563,9 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, score_t grad, hess; score_t grad_next, hess_next; // LGBM_CUDA v5.1 - grad = ordered_gradients[ind]; + grad = ordered_gradients[subglobal_tid]; #if CONST_HESSIAN == 0 - hess = ordered_hessians[ind]; + hess = ordered_hessians[subglobal_tid]; #endif @@ -580,10 +613,8 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // prefetch the next iteration variables feature_next = feature_data[ind_next >> feature_mask]; - #if CONST_HESSIAN == 1 // STAGE 3: accumulate counter atomicAdd(cnt_hist + feature, 1); - #endif // STAGE 4: update next stat grad = grad_next; @@ -619,7 +650,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #endif #if POWER_FEATURE_WORKGROUPS != 0 - acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 2 * NUM_BINS; + acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 3 * NUM_BINS; // write gradients and hessians acc_type *__restrict__ ptr_f = output; for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { @@ -628,6 +659,13 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type value = gh_hist[i]; ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; } + // write counts + acc_int_type *__restrict__ ptr_i = (acc_int_type *)(output + 2 * NUM_BINS); + for (ushort i = ltid; i < NUM_BINS; i += lsize) { + // FIXME: 2-way bank conflict + uint value = cnt_hist[i]; + ptr_i[i] = value; + } // FIXME: is this right __syncthreads(); __threadfence(); @@ -654,7 +692,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // The is done by using an global atomic counter. // On AMD GPUs ideally this should be done in GDS, // but currently there is no easy way to access it via OpenCL. 
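// Note on old_val / old_val_cont_bin0 in these kernels: the first slot of the
// shared counter histogram (cnt_hist[0], reached through counter_val) is reused
// to hold the arrival ticket returned by atomicAdd, so its original bin-0 count
// is saved into old_val beforehand and handed to the reduction, where
// "cont_bin = ltid ? local_cnt[ltid] : old_val_cont_bin0" restores it for
// thread 0. In the single-workgroup case (POWER_FEATURE_WORKGROUPS == 0) no
// ticket is taken, hence the dummy old_val declared in that branch.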
- uint * counter_val = (uint *)(gh_hist + 2 * NUM_BINS); + uint * counter_val = cnt_hist; // backup the old value uint old_val = *counter_val; if (ltid == 0) { @@ -679,7 +717,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // locate our feature's block in output memory uint output_offset = (feature_id << power_feature_workgroups); acc_type const * __restrict__ feature_subhists = - (acc_type *)output_buf + output_offset * 2 * NUM_BINS; + (acc_type *)output_buf + output_offset * 3 * NUM_BINS; // skip reading the data already in local memory //uint skip_id = feature_id ^ output_offset; uint skip_id = group_id - output_offset; @@ -724,6 +762,14 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ // TODO: try to avoid bank conflict here acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; + uint* __restrict__ local_cnt = (uint *)(local_hist + 2 * NUM_BINS); + + uint cont_bin; + if (power_feature_workgroups != 0) { + cont_bin = ltid ? local_cnt[ltid] : old_val_cont_bin0; + } else { + cont_bin = local_cnt[ltid]; + } ushort i; if (power_feature_workgroups != 0) { @@ -732,20 +778,28 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ for (i = 0; i < skip_id; ++i) { grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; + cont_bin += as_acc_int_type(*p); p += NUM_BINS; } // skip the counters we already have - p += 2 * NUM_BINS; + p += 3 * NUM_BINS; for (i = i + 1; i < num_sub_hist; ++i) { grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; + cont_bin += as_acc_int_type(*p); p += NUM_BINS; } } __syncthreads(); + output_buf[ltid * 2 + 0] = grad_bin; +#if CONST_HESSIAN == 0 output_buf[ltid * 2 + 1] = hess_bin; +#else + output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); +#endif +// output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); } #if USE_CONSTANT_BUF == 1 @@ -806,9 +860,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // counter histogram // total size: 256 * size_of(uint) = 1 KB - #if CONST_HESSIAN == 1 uint *cnt_hist = (uint *)(gh_hist + 2 * NUM_BINS); - #endif // odd threads (1, 3, ...) compute histograms for hessians first // even thread (0, 2, ...) 
compute histograms for gradients first @@ -865,9 +917,9 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, score_t grad, hess; score_t grad_next, hess_next; // LGBM_CUDA v5.1 - grad = ordered_gradients[ind]; + grad = ordered_gradients[subglobal_tid]; #if CONST_HESSIAN == 0 - hess = ordered_hessians[ind]; + hess = ordered_hessians[subglobal_tid]; #endif @@ -915,10 +967,8 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // prefetch the next iteration variables feature_next = feature_data[ind_next >> feature_mask]; - #if CONST_HESSIAN == 1 // STAGE 3: accumulate counter atomicAdd(cnt_hist + feature, 1); - #endif // STAGE 4: update next stat grad = grad_next; @@ -954,7 +1004,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #endif #if POWER_FEATURE_WORKGROUPS != 0 - acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 2 * NUM_BINS; + acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 3 * NUM_BINS; // write gradients and hessians acc_type *__restrict__ ptr_f = output; for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { @@ -963,6 +1013,13 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type value = gh_hist[i]; ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; } + // write counts + acc_int_type *__restrict__ ptr_i = (acc_int_type *)(output + 2 * NUM_BINS); + for (ushort i = ltid; i < NUM_BINS; i += lsize) { + // FIXME: 2-way bank conflict + uint value = cnt_hist[i]; + ptr_i[i] = value; + } // FIXME: is this right __syncthreads(); __threadfence(); @@ -989,7 +1046,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // The is done by using an global atomic counter. // On AMD GPUs ideally this should be done in GDS, // but currently there is no easy way to access it via OpenCL. 
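// Note on output_offset / skip_id in these kernels: workgroups are laid out
// feature-major, so a feature's 2^power_feature_workgroups sub-histograms
// occupy consecutive group ids; output_offset recovers the feature's first
// workgroup and skip_id the position of this workgroup within the feature (its
// own copy is skipped because it is still held in shared memory). A small
// sketch of the index arithmetic (a free function for illustration; the kernels
// do the equivalent inline):
static inline void decode_group_example(unsigned int group_id,
                                        unsigned int power_feature_workgroups,
                                        unsigned int* feature_id,
                                        unsigned int* sub_id) {
  *feature_id = group_id >> power_feature_workgroups;                   // feature served by this workgroup
  *sub_id = group_id - (*feature_id << power_feature_workgroups);       // == skip_id in the kernel
}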
- uint * counter_val = (uint *)(gh_hist + 2 * NUM_BINS); + uint * counter_val = cnt_hist; // backup the old value uint old_val = *counter_val; if (ltid == 0) { @@ -1014,7 +1071,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // locate our feature's block in output memory uint output_offset = (feature_id << power_feature_workgroups); acc_type const * __restrict__ feature_subhists = - (acc_type *)output_buf + output_offset * 2 * NUM_BINS; + (acc_type *)output_buf + output_offset * 3 * NUM_BINS; // skip reading the data already in local memory //uint skip_id = feature_id ^ output_offset; uint skip_id = group_id - output_offset; From 429e752269c98d283a51f95853c476480124c1c1 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Tue, 12 May 2020 02:48:55 +0000 Subject: [PATCH 043/119] Initial CUDA work --- src/treelearner/cuda_tree_learner.cpp | 6 ++-- .../kernels/histogram_16_64_256.cu | 33 +++++++++---------- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 53eec14fbc2..d87e7addef1 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -112,7 +112,7 @@ int CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int d for (i = 0; i < (int) size; ++i) { af = GET_GRAD(h1, i); bf = GET_GRAD(h2, i); - if (((std::fabs(af - bf))/af) >= 1e-6) { + if ((((std::fabs(af - bf))/af) >= 1e-6) && ((std::fabs(af - bf)) >= 1e-6)) { printf("i = %5d, h1.grad %13.6lf, h2.grad %13.6lf\n", i, af, bf); ++retval; } @@ -140,7 +140,7 @@ int CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int d for (i = 0; i < (int) size; ++i) { af = GET_GRAD(h1, i); bf = GET_GRAD(h2, i); - if (((std::fabs(af - bf))/af) >= 1e-5) { + if ((((std::fabs(af - bf))/af) >= 1e-6) && ((std::fabs(af - bf)) >= 1e-6)) { printf("i = %5d, h1.grad %13.6f, h2.grad %13.6f\n", i, af, bf); ++retval; } @@ -1033,6 +1033,8 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ //break; // LGBM_CUDA: see only first feature info } printf("End Comparing Histogram between GPU and CPU\n"); + fflush(stderr); + fflush(stdout); // #endif #endif diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index 3c194a22ddf..a85918cc3c6 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -91,7 +91,6 @@ inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ #else output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); #endif - //output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); } #if USE_CONSTANT_BUF == 1 @@ -219,17 +218,18 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, for (uint i = subglobal_tid; i < num_data; i += subglobal_size) { // prefetch the next iteration variables // we don't need bondary check because we have made the buffer large + int i_next = i + subglobal_size; #ifdef IGNORE_INDICES // we need to check to bounds here - ind_next = i + subglobal_size < num_data ? i + subglobal_size : i; + ind_next = i_next < num_data ? 
i_next : i; #else - ind_next = data_indices[i + subglobal_size]; + ind_next = data_indices[i_next]; #endif // imbGBT v5.1 - grad_next = ordered_gradients[ind_next]; + grad_next = ordered_gradients[i_next]; #if CONST_HESSIAN == 0 - hess_next = ordered_hessians[ind_next]; + hess_next = ordered_hessians[i_next]; #endif // STAGE 2: accumulate gradient and hessian @@ -441,11 +441,10 @@ inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ output_buf[ltid * 2 + 0] = grad_bin; #if CONST_HESSIAN == 0 - output_buf[ltid * 2 + 1] = hess_bin; + output_buf[ltid * 2 + 1] = hess_bin; #else output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); #endif -// output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); } #if USE_CONSTANT_BUF == 1 @@ -573,17 +572,18 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, for (uint i = subglobal_tid; i < num_data; i += subglobal_size) { // prefetch the next iteration variables // we don't need bondary check because we have made the buffer large + int i_next = i + subglobal_size; #ifdef IGNORE_INDICES // we need to check to bounds here - ind_next = i + subglobal_size < num_data ? i + subglobal_size : i; + ind_next = i_next < num_data ? i_next : i; #else - ind_next = data_indices[i + subglobal_size]; + ind_next = data_indices[i_next]; #endif // imbGBT v5.1 - grad_next = ordered_gradients[ind_next]; + grad_next = ordered_gradients[i_next]; #if CONST_HESSIAN == 0 - hess_next = ordered_hessians[ind_next]; + hess_next = ordered_hessians[i_next]; #endif // STAGE 2: accumulate gradient and hessian @@ -792,14 +792,12 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ } __syncthreads(); - output_buf[ltid * 2 + 0] = grad_bin; #if CONST_HESSIAN == 0 output_buf[ltid * 2 + 1] = hess_bin; #else output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); #endif -// output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); } #if USE_CONSTANT_BUF == 1 @@ -927,17 +925,18 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, for (uint i = subglobal_tid; i < num_data; i += subglobal_size) { // prefetch the next iteration variables // we don't need bondary check because we have made the buffer large + int i_next = i + subglobal_size; #ifdef IGNORE_INDICES // we need to check to bounds here - ind_next = i + subglobal_size < num_data ? i + subglobal_size : i; + ind_next = i_next < num_data ? 
i_next : i; #else - ind_next = data_indices[i + subglobal_size]; + ind_next = data_indices[i_next]; #endif // imbGBT v5.1 - grad_next = ordered_gradients[ind_next]; + grad_next = ordered_gradients[i_next]; #if CONST_HESSIAN == 0 - hess_next = ordered_hessians[ind_next]; + hess_next = ordered_hessians[i_next]; #endif // STAGE 2: accumulate gradient and hessian From c7c22a57dc0e55f2697870766716c50409ff3736 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Fri, 15 May 2020 17:39:29 +0000 Subject: [PATCH 044/119] Initial CUDA work --- src/treelearner/cuda_tree_learner.cpp | 4 ++-- src/treelearner/gpu_tree_learner.cpp | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index d87e7addef1..3ea95268032 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -87,7 +87,7 @@ void PrintHistograms(hist_t* h, size_t size) { double total_hess = 0; for (size_t i = 0; i < size; ++i) { printf("%03lu=%9.3g,%9.3g\t", i, GET_GRAD(h, i), GET_HESS(h, i)); - if ((i & 2) == 2) + if ((i & 3) == 3) printf("\n"); total_hess += GET_HESS(h, i); } @@ -1098,7 +1098,7 @@ void CUDATreeLearner::FindBestSplits() { if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->leaf_index() < 0) { continue; } printf("CUDATreeLearner::FindBestSplits() Feature %d bin_size=%zd larger leaf:\n", feature_index, bin_size); - PrintHistograms(larger_leaf_histogram_array_[feature_index].RawData() - kHistOffset1, bin_size); + PrintHistograms(larger_leaf_histogram_array_[feature_index].RawData() - kHistOffset, bin_size); } #endif } diff --git a/src/treelearner/gpu_tree_learner.cpp b/src/treelearner/gpu_tree_learner.cpp index 7f336b075d4..fad02e1c044 100644 --- a/src/treelearner/gpu_tree_learner.cpp +++ b/src/treelearner/gpu_tree_learner.cpp @@ -52,7 +52,7 @@ void PrintHistograms(hist_t* h, size_t size) { double total_hess = 0; for (size_t i = 0; i < size; ++i) { printf("%03lu=%9.3g,%9.3g\t", i, GET_GRAD(h, i), GET_HESS(h, i)); - if ((i & 2) == 2) + if ((i & 3) == 3) printf("\n"); total_hess += GET_HESS(h, i); } @@ -1069,10 +1069,10 @@ void GPUTreeLearner::FindBestSplits(const Tree* tree) { } size_t bin_size = train_data_->FeatureNumBin(feature_index) + 1; printf("Feature %d smaller leaf:\n", feature_index); - PrintHistograms(smaller_leaf_histogram_array_[feature_index].RawData() - 1, bin_size); + PrintHistograms(smaller_leaf_histogram_array_[feature_index].RawData() - kHistOffset, bin_size); if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->LeafIndex() < 0) { continue; } printf("Feature %d larger leaf:\n", feature_index); - PrintHistograms(larger_leaf_histogram_array_[feature_index].RawData() - 1, bin_size); + PrintHistograms(larger_leaf_histogram_array_[feature_index].RawData() - kHistOffset, bin_size); } #endif } From aad98f0ed8ee94e93e1525edbde811674bdd42b9 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Sun, 24 May 2020 12:23:10 +0000 Subject: [PATCH 045/119] Initial CUDA work --- build_LGBM.232.sh | 1 + include/LightGBM/bin.h | 6 + include/LightGBM/feature_group.h | 1 + src/boosting/gbdt.cpp | 24 +++- src/io/dataset.cpp | 32 ++++- src/io/dense_bin.hpp | 38 +++++ src/io/sparse_bin.hpp | 31 +++++ src/treelearner/cuda_tree_learner.cpp | 10 +- .../kernels/histogram_16_64_256.cu | 131 ++++++++++++++++-- src/treelearner/serial_tree_learner.cpp | 42 ++++++ 10 files changed, 297 insertions(+), 19 deletions(-) diff --git a/build_LGBM.232.sh b/build_LGBM.232.sh index 5e500327108..f785d6556e6 
100755 --- a/build_LGBM.232.sh +++ b/build_LGBM.232.sh @@ -3,4 +3,5 @@ rm -rf build mkdir build cd build cmake -DUSE_CUDA=1 .. +#cmake .. make -j40 diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index e541e7039e9..c09cde3c809 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -308,6 +308,12 @@ class Bin { * \param ordered_hessians Pointer to hessians, the data_indices[i]-th data's hessian is ordered_hessians[i] * \param out Output Result */ + + virtual void ConstructHistogramDebug( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, + hist_t* out) const = 0; + virtual void ConstructHistogram( const data_size_t* data_indices, data_size_t start, data_size_t end, const score_t* ordered_gradients, const score_t* ordered_hessians, diff --git a/include/LightGBM/feature_group.h b/include/LightGBM/feature_group.h index d949beec20e..d5eac42db48 100644 --- a/include/LightGBM/feature_group.h +++ b/include/LightGBM/feature_group.h @@ -176,6 +176,7 @@ class FeatureGroup { inline void CopySubrow(const FeatureGroup* full_feature, const data_size_t* used_indices, data_size_t num_used_indices) { if (!is_multi_val_) { +//fprintf(stderr, "CopySubrow CP1A\n"); fflush(stderr); bin_data_->CopySubrow(full_feature->bin_data_.get(), used_indices, num_used_indices); } else { for (int i = 0; i < num_feature_; ++i) { diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 55e11312235..f02c5d940f1 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -243,8 +243,10 @@ data_size_t GBDT::BalancedBaggingHelper(data_size_t start, data_size_t cnt, void GBDT::Bagging(int iter) { Common::FunctionTimer fun_timer("GBDT::Bagging", global_timer); // if need bagging +fprintf(stderr, "inside GBDT::Bagging!\n"); fflush(stderr); if ((bag_data_cnt_ < num_data_ && iter % config_->bagging_freq == 0) || need_re_bagging_) { +//fprintf(stderr, "inside GBDT::Bagging, past first hurdle\n"); fflush(stderr); need_re_bagging_ = false; auto left_cnt = bagging_runner_.Run( num_data_, @@ -263,7 +265,9 @@ void GBDT::Bagging(int iter) { bag_data_cnt_ = left_cnt; Log::Debug("Re-bagging, using %d data to train", bag_data_cnt_); // set bagging data to tree learner +//fprintf(stderr, "inside GBDT::Bagging, past second hurdle\n"); fflush(stderr); if (!is_use_subset_) { +//fprintf(stderr, "inside GBDT::Bagging, calling SetBaggingData\n"); fflush(stderr); tree_learner_->SetBaggingData(nullptr, bag_data_indices_.data(), bag_data_cnt_); } else { // LGBM_CUDA // NEW get subset @@ -275,11 +279,21 @@ void GBDT::Bagging(int iter) { tmp_hessians_.resize(total_size); } +//fprintf(stderr, "CopySubrow CP2, bag_data_cnt_ = %d\n", bag_data_cnt_); fflush(stderr); +//char *temp_bag = (char *) bag_data_indices_.data(); +//for (int i=0; iCopySubrow(train_data_, bag_data_indices_.data(), bag_data_cnt_, false); +//fprintf(stderr, "CopySubrow CP2, calling tree_learner_->ResetTrainingData\n"); fflush(stderr); tree_learner_->ResetTrainingData(tmp_subset_.get(), is_constant_hessian_); +//fprintf(stderr, "CopySubrow CP2, back from tree_learner_->ResetTrainingData\n"); fflush(stderr); } } +fprintf(stderr, "returning from GBDT::Bagging!\n"); fflush(stderr); } void GBDT::Train(int snapshot_freq, const std::string& model_output_path) { @@ -382,11 +396,14 @@ double GBDT::BoostFromAverage(int class_id, bool update_scorer) { // LGBM_CUDA bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { +//fprintf(stderr, "inside TrainOneIterCUDA CP103\n"); 
fflush(stderr); + // LGBM_CUDA invoke baggging during the first iteration if ((config_->device_type == std::string("cuda")) && (iter_ == 0)) { // auto start_time = std::chrono::steady_clock::now(); +//fprintf(stderr, "calling Bagging CP104\n"); fflush(stderr); Bagging(0); } @@ -407,8 +424,9 @@ bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { hessians = hessians_.data(); } +//fprintf(stderr, "inside TrainOneIterCUDA CP105, bagging commented out\n"); fflush(stderr); // LGBM_CUDA bagging logic - // Bagging(iter_); + // Bagging(iter_); // GCF trial and error bool should_continue = false; for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { @@ -447,8 +465,10 @@ bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { // LGBM_CUDA new_tree.reset(tree_learner_->Train(tmp_grad, tmp_hess, is_constant_hessian_, forced_splits_json_)); } +//fprintf(stderr, "inside TrainOneIterCUDA, num_leaves = %d\n", new_tree->num_leaves()); fflush(stderr); if (new_tree->num_leaves() > 1) { +//fprintf(stderr, "inside TrainOneIterCUDA CP106, this clause doesn't do bagging\n"); fflush(stderr); should_continue = true; auto score_ptr = train_score_updater_->score() + offset; auto residual_getter = [score_ptr](const label_t* label, int i) {return static_cast(label[i]) - score_ptr[i]; }; @@ -481,12 +501,14 @@ bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { } // LGBM_CUDA: moved for overlapping data copy w/ other operations +//fprintf(stderr, "inside TrainOneIterCUDA CP107\n"); fflush(stderr); int iter_next = iter_ + 1; if (iter_next < config_->num_iterations) { // auto start_time = std::chrono::steady_clock::now(); // bagging logic +//fprintf(stderr, "inside TrainOneIterCUDA CP108\n"); fflush(stderr); Bagging(iter_next); } diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index a020f425f3a..b796e1bc2cb 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -801,8 +801,10 @@ void Dataset::CopySubrow(const Dataset* fullset, const data_size_t* used_indices, data_size_t num_used_indices, bool need_meta_data) { CHECK_EQ(num_used_indices, num_data_); +fprintf(stderr, "CopySubrow CP3, used_indices[5503] = %4d\n", (int) used_indices[5503]); fflush(stderr); OMP_INIT_EX(); #pragma omp parallel for schedule(static) + for (int group = 0; group < num_groups_; ++group) { OMP_LOOP_EX_BEGIN(); feature_groups_[group]->CopySubrow(fullset->feature_groups_[group].get(), @@ -1310,10 +1312,18 @@ void Dataset::ConstructHistogramsInner( data_size_t num_data, const score_t* gradients, const score_t* hessians, score_t* ordered_gradients, score_t* ordered_hessians, TrainingShareStates* share_state, hist_t* hist_data) const { + +fprintf(stderr, "CPU "); +if (!USE_INDICES) fprintf(stderr, "IGNORE_INDICES "); +if (!USE_HESSIAN) fprintf(stderr, "CONST_HESSIAN "); +fprintf(stderr, "\n"); fflush(stderr); +//fprintf(stderr, "gradients[2161] = %lf\n", gradients[2161]); fflush(stderr); + if (!share_state->is_colwise) { return ConstructHistogramsMultiVal( data_indices, num_data, gradients, hessians, share_state, hist_data); } + std::vector used_dense_group; int multi_val_groud_id = -1; used_dense_group.reserve(num_groups_); @@ -1358,12 +1368,20 @@ void Dataset::ConstructHistogramsInner( } } OMP_INIT_EX(); +if (USE_INDICES) { + //fprintf(stderr, " data_indices = %3d %3d %3d %3d %3d %3d %3d %3d\n", data_indices[0], data_indices[1], data_indices[2], data_indices[3], data_indices[4], data_indices[5], data_indices[6], data_indices[7]); 
fflush(stderr); + //fprintf(stderr, " gradients = %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf\n", ptr_ordered_grad[0], ptr_ordered_grad[1], ptr_ordered_grad[2], ptr_ordered_grad[3], ptr_ordered_grad[4], ptr_ordered_grad[5], ptr_ordered_grad[6], ptr_ordered_grad[7]); fflush(stderr); + //fprintf(stderr, " hessians = %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf\n", ptr_ordered_hess[0], ptr_ordered_hess[1], ptr_ordered_hess[2], ptr_ordered_hess[3], ptr_ordered_hess[4], ptr_ordered_hess[5], ptr_ordered_hess[6], ptr_ordered_hess[7]); fflush(stderr); +//fprintf(stderr, " offset into return array for gi = 0: %d\n", (int) group_bin_boundaries_[used_dense_group[0]]); fflush(stderr); +} + #pragma omp parallel for schedule(static) num_threads(share_state->num_threads) for (int gi = 0; gi < num_used_dense_group; ++gi) { OMP_LOOP_EX_BEGIN(); int group = used_dense_group[gi]; auto data_ptr = hist_data + group_bin_boundaries_[group] * 2; const int num_bin = feature_groups_[group]->num_total_bin_; +//fprintf(stderr, "gi = %2d, group_bin_boundaries_[%2d] = %4d, num_bin = %d\n", gi, (int) group, (int) group_bin_boundaries_[group], (int) num_bin); std::memset(reinterpret_cast(data_ptr), 0, num_bin * kHistEntrySize); if (USE_HESSIAN) { @@ -1372,8 +1390,16 @@ void Dataset::ConstructHistogramsInner( data_indices, 0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr); } else { - feature_groups_[group]->bin_data_->ConstructHistogram( - 0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr); + if (gi == 0) { +//fprintf(stderr, " calling core ConstructHistogramDebug\n"); fflush(stderr); + feature_groups_[group]->bin_data_->ConstructHistogramDebug( + 0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr); +//fprintf(stderr, " back from ConstructHistogramDebug, hist_data = %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf\n", hist_data[0], hist_data[1], hist_data[2], hist_data[3], hist_data[4], hist_data[5]); fflush(stderr); + } + else { + feature_groups_[group]->bin_data_->ConstructHistogram( + 0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr); + } } } else { if (USE_INDICES) { @@ -1392,6 +1418,7 @@ void Dataset::ConstructHistogramsInner( } OMP_THROW_EX(); } +//fprintf(stderr, " leaving 'CPU kernel' hist_data = %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf\n", hist_data[0], hist_data[1], hist_data[2], hist_data[3], hist_data[4], hist_data[5]); fflush(stderr); global_timer.Stop("Dataset::dense_bin_histogram"); if (multi_val_groud_id >= 0) { if (num_used_dense_group > 0) { @@ -1439,6 +1466,7 @@ void Dataset::FixHistogram(int feature_idx, double sum_gradient, const BinMapper* bin_mapper = feature_groups_[group]->bin_mappers_[sub_feature].get(); const int most_freq_bin = bin_mapper->GetMostFreqBin(); +//fprintf(stderr, "in Dataset::FixHistogram, feature_idx = %2d, group = %2d, sub_feature = %d, most_freq_bin = %3d\n", feature_idx, group, sub_feature, most_freq_bin); fflush(stderr); if (most_freq_bin > 0) { const int num_bin = bin_mapper->num_bin(); GET_GRAD(data, most_freq_bin) = sum_gradient; diff --git a/src/io/dense_bin.hpp b/src/io/dense_bin.hpp index 99feadf9f7f..803e85a6dab 100644 --- a/src/io/dense_bin.hpp +++ b/src/io/dense_bin.hpp @@ -100,6 +100,32 @@ class DenseBin : public Bin { BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override; + template + void ConstructHistogramInnerDebug(data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* ordered_hessians, + hist_t* out) const { + data_size_t i = start; + 
hist_t* grad = out; + hist_t* hess = out + 1; + hist_cnt_t* cnt = reinterpret_cast(hess); +//fprintf(stderr, " inside ConstructHistogramInnerDebug, i = %d\n", i); fflush(stderr); +//fprintf(stderr, " DEBUG: data(5503) = %d\n", data(5503)); + for (; i < end; ++i) { + const auto idx = i; + const auto ti = static_cast(data(idx)) << 1; +//if (ti == 2) fprintf(stderr, " data(%4d) = %4d, adding %7.4lf\n", idx, data(idx), ordered_gradients[i]); fflush(stderr); + if (USE_HESSIAN) { + grad[ti] += ordered_gradients[i]; + hess[ti] += ordered_hessians[i]; + } else { + grad[ti] += ordered_gradients[i]; + ++cnt[ti]; + } + } +//fprintf(stderr, " leaving ConstructHistogramInnerDebug, out[2/3] = %7.4lf %7.4lf\n", out[2], out[3]); + } + template void ConstructHistogramInner(const data_size_t* data_indices, data_size_t start, data_size_t end, @@ -145,6 +171,16 @@ class DenseBin : public Bin { } } + void ConstructHistogramDebug(data_size_t start, + data_size_t end, const score_t* ordered_gradients, + const score_t* ordered_hessians, + hist_t* out) const { +//fprintf(stderr, " calling ConstructHistogramInnerDebug\n"); fflush(stderr); + ConstructHistogramInnerDebug( + start, end, ordered_gradients, ordered_hessians, out); +//fprintf(stderr, " back from ConstructHistogramInnerDebug\n"); fflush(stderr); + } + void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end, const score_t* ordered_gradients, const score_t* ordered_hessians, @@ -388,6 +424,7 @@ class DenseBin : public Bin { const void* memory, const std::vector& local_used_indices) override { const VAL_T* mem_data = reinterpret_cast(memory); +//fprintf(stderr, "inside LoadFromMemory (in src/io/dense_bin.hpp), updating the FEATURE DATA, num_data_ = %d\n", (int) num_data_); if (!local_used_indices.empty()) { if (IS_4BIT) { const data_size_t rest = num_data_ & 1; @@ -431,6 +468,7 @@ class DenseBin : public Bin { void CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override { auto other_bin = dynamic_cast*>(full_bin); +//fprintf(stderr, "inside CopySubrow (in src/io/dense_bin.hpp), updating the FEATURE DATA, num_used_indices = %d\n", (int) num_used_indices); if (IS_4BIT) { const data_size_t rest = num_used_indices & 1; for (int i = 0; i < num_used_indices - rest; i += 2) { diff --git a/src/io/sparse_bin.hpp b/src/io/sparse_bin.hpp index c56cd6da99d..74cdb08c82b 100644 --- a/src/io/sparse_bin.hpp +++ b/src/io/sparse_bin.hpp @@ -98,6 +98,37 @@ class SparseBin : public Bin { hist[ti] += g; \ hist[ti + 1] += h; + void ConstructHistogramDebug(data_size_t start, + data_size_t end, const score_t* ordered_gradients, + const score_t* ordered_hessians, + hist_t* out) const { + data_size_t i_delta, cur_pos; + InitIndex(start, &i_delta, &cur_pos); + data_size_t i = start; + for (;;) { + if (cur_pos < i) { + cur_pos += deltas_[++i_delta]; + if (i_delta >= num_vals_) { + break; + } + } else if (cur_pos > i) { + if (++i >= end) { + break; + } + } else { + const VAL_T bin = vals_[i_delta]; + ACC_GH(out, bin, ordered_gradients[i], ordered_hessians[i]); + if (++i >= end) { + break; + } + cur_pos += deltas_[++i_delta]; + if (i_delta >= num_vals_) { + break; + } + } + } + } + void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end, const score_t* ordered_gradients, const score_t* ordered_hessians, diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 3ea95268032..d59c60c3957 100644 --- 
a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -919,12 +919,14 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - kHistOffset; // Check workgroups per feature4 tuple.. - int exp_workgroups_per_feature = GetNumWorkgroupsPerFeature(smaller_leaf_splits_->num_data_in_leaf()); +// GCF Let's try this!!! +// int exp_workgroups_per_feature = GetNumWorkgroupsPerFeature(smaller_leaf_splits_->num_data_in_leaf()); // if the workgroup per feature is 1 (2^0), return as the work is too small for a GPU - if (exp_workgroups_per_feature == 0){ - return SerialTreeLearner::ConstructHistograms(is_feature_used, use_subtract); - } +// GCF Let's try this!!! +// if (exp_workgroups_per_feature == 0){ +// return SerialTreeLearner::ConstructHistograms(is_feature_used, use_subtract); +// } // ConstructGPUHistogramsAsync will return true if there are availabe feature gourps dispatched to GPU bool is_gpu_used = ConstructGPUHistogramsAsync(is_feature_used, diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index a85918cc3c6..42d7c0d4d01 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -208,9 +208,9 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, score_t grad, hess; score_t grad_next, hess_next; // LGBM_CUDA v5.1 - grad = ordered_gradients[subglobal_tid]; + grad = ordered_gradients[ind]; #if CONST_HESSIAN == 0 - hess = ordered_hessians[subglobal_tid]; + hess = ordered_hessians[ind]; #endif @@ -227,9 +227,9 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #endif // imbGBT v5.1 - grad_next = ordered_gradients[i_next]; + grad_next = ordered_gradients[ind_next]; #if CONST_HESSIAN == 0 - hess_next = ordered_hessians[i_next]; + hess_next = ordered_hessians[ind_next]; #endif // STAGE 2: accumulate gradient and hessian @@ -562,9 +562,9 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, score_t grad, hess; score_t grad_next, hess_next; // LGBM_CUDA v5.1 - grad = ordered_gradients[subglobal_tid]; + grad = ordered_gradients[ind]; #if CONST_HESSIAN == 0 - hess = ordered_hessians[subglobal_tid]; + hess = ordered_hessians[ind]; #endif @@ -581,9 +581,9 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #endif // imbGBT v5.1 - grad_next = ordered_gradients[i_next]; + grad_next = ordered_gradients[ind_next]; #if CONST_HESSIAN == 0 - hess_next = ordered_hessians[i_next]; + hess_next = ordered_hessians[ind_next]; #endif // STAGE 2: accumulate gradient and hessian @@ -759,6 +759,9 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ acc_type* __restrict__ local_hist, const size_t power_feature_workgroups) { const ushort ltid = threadIdx.x; +//#ifdef IGNORE_INDICES +// const uint gtid = blockIdx.x * blockDim.x + threadIdx.x; +//#endif // TODO: try to avoid bank conflict here acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; @@ -772,6 +775,10 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ } ushort i; +//#ifdef IGNORE_INDICES +//if (gtid == 1) printf(" skip_id = %d, grad_bin = %7.4lf\n", skip_id, grad_bin); +//#endif + if (power_feature_workgroups != 0) { // add all sub-histograms for feature const acc_type* __restrict__ p = feature_sub_hist + ltid; @@ -785,6 +792,9 @@ inline void __device__ 
within_kernel_reduction256x4(const acc_type* __restrict__ p += 3 * NUM_BINS; for (i = i + 1; i < num_sub_hist; ++i) { +//#ifdef IGNORE_INDICES +//if (gtid == 1) printf(" adding %7.4lf\n", *p); +//#endif grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; cont_bin += as_acc_int_type(*p); p += NUM_BINS; @@ -798,6 +808,13 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ #else output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); #endif + +//#ifdef IGNORE_INDICES +//__syncthreads(); +//if (gtid == 1) printf("KERNEL returning %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf\n", output_buf[0], output_buf[1], output_buf[2], output_buf[3], output_buf[4], output_buf[5]); +//__syncthreads(); +//#endif + } #if USE_CONSTANT_BUF == 1 @@ -842,6 +859,42 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, const ushort lsize = NUM_BINS; // get_local_size(0); const ushort group_id = blockIdx.x; +if (gtid == 5503) { +#if USE_CONSTANT_BUF == 1 +#ifdef IGNORE_INDICES +#if CONST_HESSIAN == 0 +printf("KERNEL USE_CONSTANT_BUF IGNORE_INDICES\n"); +#else +printf("KERNEL USE_CONSTANT_BUF IGNORE_INDICES CONST_HESSIAN\n"); +#endif +#else +#if CONST_HESSIAN == 0 +printf("KERNEL USE_CONSTANT_BUF \n"); +#else +printf("KERNEL USE_CONSTANT_BUF CONST_HESSIAN\n"); +#endif +#endif +#else +#ifdef IGNORE_INDICES +#if CONST_HESSIAN == 0 +printf("KERNEL IGNORE_INDICES (exp = %d)\n", (int) power_feature_workgroups); +#else +printf("KERNEL IGNORE_INDICES CONST_HESSIAN\n"); +#endif +#else +#if CONST_HESSIAN == 0 +printf("KERNEL (exp = %d)\n", (int) power_feature_workgroups); +//for (int i=0; i<5000; ++i) if (feature_data_base[i] == 1) printf("found '1' in feature_data_base array, at index %d\n", i); +//printf(" data_indices = %3d %3d %3d %3d %3d %3d %3d %3d\n", (int) data_indices[0], (int) data_indices[1], (int) data_indices[2], (int) data_indices[3], (int) data_indices[4], (int) data_indices[5], (int) data_indices[6], (int) data_indices[7]); +//printf(" gradients = %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf\n", ordered_gradients[data_indices[0]], ordered_gradients[data_indices[1]], ordered_gradients[data_indices[2]], ordered_gradients[data_indices[3]], ordered_gradients[data_indices[4]], ordered_gradients[data_indices[5]], ordered_gradients[data_indices[6]], ordered_gradients[data_indices[7]]); +//printf(" hessians = %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf\n", ordered_hessians[data_indices[0]], ordered_hessians[data_indices[1]], ordered_hessians[data_indices[2]], ordered_hessians[data_indices[3]], ordered_hessians[data_indices[4]], ordered_hessians[data_indices[5]], ordered_hessians[data_indices[6]], ordered_hessians[data_indices[7]]); +#else +printf("KERNEL CONST_HESSIAN\n"); +#endif +#endif +#endif +} + // local memory per workgroup is 3 KB // clear local memory uint *ptr = (uint *) shared_array; @@ -870,6 +923,9 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // each 2^POWER_FEATURE_WORKGROUPS workgroups process on one feature (compile-time constant) // feature_size is the number of examples per feature const uchar *feature_data = feature_data_base + feature_id * feature_size; +//#ifdef IGNORE_INDICES +//if (gtid == 5503) printf("feature_id = %d, feature_size = %d\n", feature_id, feature_size); +//#endif // size of threads that process this feature4 const uint subglobal_size = lsize * (1 << power_feature_workgroups); @@ -877,11 +933,16 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // equavalent thread ID in 
this subgroup for this feature4 const uint subglobal_tid = gtid - feature_id * subglobal_size; +//#ifdef IGNORE_INDICES +//if (gtid == 5503) for (int i=0; i<5600; ++i) if(feature_data_base[i] == 1) printf("found 1 at %d in feature_data_BASE\n", i); +//if (gtid == 5503) for (int i=0; i<5600; ++i) if(feature_data[i] == 1) printf("found 1 at %d in feature_data\n", i); +//#endif data_size_t ind; data_size_t ind_next; #ifdef IGNORE_INDICES ind = subglobal_tid; +//if (gtid == 5503) printf("gtid = %d (0x%08x), subglobal_tid = %d (0x%08x), ind = %d (0x%08x)\n", gtid, gtid, subglobal_tid, subglobal_tid, ind, ind); #else ind = data_indices[subglobal_tid]; #endif @@ -904,6 +965,10 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, ushort bin; feature = feature_data[ind >> feature_mask]; +//#ifdef IGNORE_INDICES +//if (gtid == 5503) printf("gtid = %d (0x%08x), ind = %d (0x%08x), feature_mask = %d, feature = %d\n", gtid, gtid, ind, ind, feature_mask, feature_data[ind >> feature_mask]); +//if (gtid == 5503) printf("gtid = %d (0x%08x), ind = %d (0x%08x), feature_mask = %d, BASE feature = %d\n", gtid, gtid, ind, ind, feature_mask, feature_data_base[ind >> feature_mask]); +//#endif if (feature_mask) { feature = (feature >> ((ind & 1) << 2)) & 0xf; } @@ -915,9 +980,12 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, score_t grad, hess; score_t grad_next, hess_next; // LGBM_CUDA v5.1 - grad = ordered_gradients[subglobal_tid]; +//#ifdef IGNORE_INDICES +//if (gtid == 5503) printf("gtid = %d (0x%08x), gradient by 'i': %lf, gradient by 'subglobal_tid': %lf\n", gtid, gtid, ordered_gradients[gtid], ordered_gradients[subglobal_tid]); +//#endif + grad = ordered_gradients[ind]; #if CONST_HESSIAN == 0 - hess = ordered_hessians[subglobal_tid]; + hess = ordered_hessians[ind]; #endif @@ -929,20 +997,27 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #ifdef IGNORE_INDICES // we need to check to bounds here ind_next = i_next < num_data ? i_next : i; +//if (gtid == 5503) printf("gtid = %d (0x%08x), i = %d (0x%08x), i_next = %d (0x%08x), ind_next = %d (0x%08x)\n", gtid, gtid, i, i, i_next, i_next, ind_next, ind_next); #else ind_next = data_indices[i_next]; #endif // imbGBT v5.1 - grad_next = ordered_gradients[i_next]; + grad_next = ordered_gradients[ind_next]; #if CONST_HESSIAN == 0 - hess_next = ordered_hessians[i_next]; + hess_next = ordered_hessians[ind_next]; #endif +//#ifdef IGNORE_INDICES +//if (gtid == 5503) printf("gtid = %d (0x%08x), i = %d (0x%08x), i_next = %d (0x%08x), grad_next = %lf\n", gtid, gtid, i, i, i_next, i_next, grad_next); +//#endif // STAGE 2: accumulate gradient and hessian if (bin != feature) { addr_bin = gh_hist + bin * 2 + is_hessian_first; #if CONST_HESSIAN == 0 +//#ifdef IGNORE_INDICES +//if (gtid == 5503 && bin == 1) printf("gtid = %d (0x%08x), prepping to accumulate: is_hessian_first = %d\n", gtid, gtid, is_hessian_first); +//#endif acc_type acc_bin = is_hessian_first? 
hess_bin : grad_bin; atomic_local_add_f(addr_bin, acc_bin); @@ -955,39 +1030,71 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #endif bin = feature; +//#ifdef IGNORE_INDICES +//if (gtid == 5503 && bin == 1) printf("gtid = %d (0x%08x), setting bin = feature 1, grad = %lf, grad_bin = %lf\n", gtid, gtid, grad, grad_bin); +//#endif grad_bin = grad; hess_bin = hess; } else { +//#ifdef IGNORE_INDICES +//if (gtid == 5503 && bin == 1) printf("gtid = %d (0x%08x), grad = %lf, grad_bin = %lf\n", gtid, gtid, grad, grad_bin); +//#endif grad_bin += grad; hess_bin += hess; } // prefetch the next iteration variables feature_next = feature_data[ind_next >> feature_mask]; +//#ifdef IGNORE_INDICES +//if (gtid == 5503) printf("gtid = %d (0x%08x), ind_next = %d (0x%08x), feature_mask = %d, feature = %d\n", gtid, gtid, ind_next, ind_next, feature_mask, feature_data[ind_next >> feature_mask]); +//if (gtid == 5503) printf("gtid = %d (0x%08x), ind_next = %d (0x%08x), feature_mask = %d, BASE feature = %d\n", gtid, gtid, ind_next, ind_next, feature_mask, feature_data_base[ind_next >> feature_mask]); +//#endif // STAGE 3: accumulate counter atomicAdd(cnt_hist + feature, 1); +//#ifdef IGNORE_INDICES +//if (gtid == 5503 && feature == 1) printf("gtid = %d (0x%08x), adding feature 1 to cnt_hist!\n", gtid, gtid); +//#endif // STAGE 4: update next stat grad = grad_next; +//#ifdef IGNORE_INDICES +//if (gtid == 5503 && feature == 1) printf("gtid = %d (0x%08x), moved grad_next to grad = %lf\n", gtid, gtid, grad); +//#endif hess = hess_next; // LGBM_CUDA: v4.2 if (!feature_mask) { +//#ifdef IGNORE_INDICES +//if (gtid == 5503 && feature_next == 1) printf("gtid = %d (0x%08x), moving feature_next 1 into feature 1!\n", gtid, gtid); +//#endif feature = feature_next; } else { feature = (feature_next >> ((ind_next & 1) << 2)) & 0xf; } +//#ifdef IGNORE_INDICES +//if (gtid == 5503) printf("gtid = %d (0x%08x), at end of loop, i = %d, num_data = %d, subglobal_size = %d, feature = %d\n", gtid, gtid, i, num_data, subglobal_size, feature); +//#endif } +//#ifdef IGNORE_INDICES +//if (gtid == 5503 && bin == 1) printf("gtid = %d (0x%08x), grad = %lf\n", gtid, gtid, grad); +//#endif addr_bin = gh_hist + bin * 2 + is_hessian_first; #if CONST_HESSIAN == 0 +//#ifdef IGNORE_INDICES +//if (gtid == 5503 && bin == 1) printf("gtid = %d (0x%08x), prepping to accumulate: is_hessian_first = %d\n", gtid, gtid, is_hessian_first); +//#endif acc_type acc_bin = is_hessian_first? hess_bin : grad_bin; atomic_local_add_f(addr_bin, acc_bin); addr_bin = addr_bin + 1 - 2 * is_hessian_first; acc_bin = is_hessian_first? 
grad_bin : hess_bin; + +//#ifdef IGNORE_INDICES +//if (gtid == 5503 && bin == 1) printf("gtid = %d (0x%08x), adding %lf to offset %d\n", gtid, gtid, acc_bin, (int) (addr_bin - gh_hist)); +//#endif atomic_local_add_f(addr_bin, acc_bin); #elif CONST_HESSIAN == 1 diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index e5d2a64ceaf..ae7bf52ce30 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -93,6 +93,7 @@ void SerialTreeLearner::GetShareStates(const Dataset* dataset, void SerialTreeLearner::ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) { +//fprintf(stderr, "inside SerialTreeLearner::ResetTrainingDataInner\n"); fflush(stderr); train_data_ = train_data; num_data_ = train_data_->num_data(); CHECK_EQ(num_features_, train_data_->num_features()); @@ -152,6 +153,9 @@ void SerialTreeLearner::ResetConfig(const Config* config) { Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians, bool is_constant_hessian, Json& forced_split_json) { Common::FunctionTimer fun_timer("SerialTreeLearner::Train", global_timer); + +fprintf(stderr, "in SerialTreeLearner::Train\n"); fflush(stderr); +fprintf(stderr, "first few gradients: %lf %lf %lf %lf\n", (double) gradients[0], (double) gradients[1], (double) gradients[2], (double) gradients[3]); gradients_ = gradients; hessians_ = hessians; is_constant_hessian_ = is_constant_hessian; @@ -181,10 +185,12 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians int init_splits = 0; bool aborted_last_force_split = false; if (!forced_split_json.is_null()) { +//fprintf(stderr, "we're calling ForceSplits\n"); fflush(stderr); init_splits = ForceSplits(tree.get(), forced_split_json, &left_leaf, &right_leaf, &cur_depth, &aborted_last_force_split); } +//fprintf(stderr, "loop start value = %d, loop end value = %d\n", init_splits, config_->num_leaves - 1); fflush(stderr); for (int split = init_splits; split < config_->num_leaves - 1; ++split) { // some initial works before finding best split if (BeforeFindBestSplit(tree_prt, left_leaf, right_leaf)) { @@ -201,10 +207,27 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians break; } // split tree with best leaf + +//fprintf(stderr, "%3d ", best_split_per_leaf_[0].feature); +//fprintf(stderr, "%3d ", best_split_per_leaf_[0].threshold); +//fprintf(stderr, "%3d ", best_split_per_leaf_[0].left_count); +//fprintf(stderr, "%3d ", best_split_per_leaf_[0].right_count); +//fprintf(stderr, "%3d ", best_split_per_leaf_[0].num_cat_threshold); +//fprintf(stderr, "%8.5lf ", best_split_per_leaf_[0].left_output); +//fprintf(stderr, "%8.5lf ", best_split_per_leaf_[0].right_output); +//fprintf(stderr, "%8.5lf ", best_split_per_leaf_[0].gain); +//fprintf(stderr, "%8.5lf ", best_split_per_leaf_[0].left_sum_gradient); +//fprintf(stderr, "%8.5lf ", best_split_per_leaf_[0].left_sum_hessian); +//fprintf(stderr, "%8.5lf ", best_split_per_leaf_[0].right_sum_gradient); +//fprintf(stderr, "%8.5lf ", best_split_per_leaf_[0].right_sum_hessian); +//fprintf(stderr, "\n"); + +//fprintf(stderr, "Calling Split, best_leaf = %d\n", best_leaf); Split(tree_prt, best_leaf, &left_leaf, &right_leaf); cur_depth = std::max(cur_depth, tree->leaf_depth(left_leaf)); } Log::Debug("Trained a tree with leaves = %d and max_depth = %d", tree->num_leaves(), cur_depth); +fprintf(stderr, "Leaving SerialTreeLearner::Train\n"); fflush(stderr); return 
tree.release(); } @@ -322,6 +345,7 @@ bool SerialTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int void SerialTreeLearner::FindBestSplits(const Tree* tree) { std::vector is_feature_used(num_features_, 0); +//fprintf(stderr, "in FindBestSplits, num_features_ = %d\n", num_features_); fflush(stderr); #pragma omp parallel for schedule(static, 256) if (num_features_ >= 512) for (int feature_index = 0; feature_index < num_features_; ++feature_index) { if (!col_sampler_.is_feature_used_bytree()[feature_index]) continue; @@ -333,16 +357,23 @@ void SerialTreeLearner::FindBestSplits(const Tree* tree) { is_feature_used[feature_index] = 1; } bool use_subtract = parent_leaf_histogram_array_ != nullptr; +//for (int i=0; iConstructHistograms(smaller)\n"); fflush(stderr); train_data_->ConstructHistograms( is_feature_used, smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(), gradients_, hessians_, ordered_gradients_.data(), ordered_hessians_.data(), share_state_.get(), ptr_smaller_leaf_hist_data); +//fprintf(stderr, "back from train_data_->ConstructHistograms(smaller)\n"); fflush(stderr); if (larger_leaf_histogram_array_ != nullptr && !use_subtract) { // construct larger leaf hist_t* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - kHistOffset; +//fprintf(stderr, "calling train_data_->ConstructHistograms(larger)\n"); fflush(stderr); train_data_->ConstructHistograms( is_feature_used, larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(), gradients_, hessians_, ordered_gradients_.data(), ordered_hessians_.data(), share_state_.get(), ptr_larger_leaf_hist_data); +//fprintf(stderr, "back from train_data_->ConstructHistograms(larger)\n"); fflush(stderr); } } @@ -375,6 +410,9 @@ void SerialTreeLearner::FindBestSplitsFromHistograms( const std::vector& is_feature_used, bool use_subtract, const Tree* tree) { Common::FunctionTimer fun_timer( "SerialTreeLearner::FindBestSplitsFromHistograms", global_timer); +fflush(stdout); +fflush(stderr); +//fprintf(stderr, "inside FindBestSplitsFromHistograms, num_threads = %d\n", (int) share_state_->num_threads); fflush(stderr); std::vector smaller_best(share_state_->num_threads); std::vector larger_best(share_state_->num_threads); std::vector smaller_node_used_features = col_sampler_.GetByNode(tree, smaller_leaf_splits_->leaf_index()); @@ -438,6 +476,7 @@ void SerialTreeLearner::FindBestSplitsFromHistograms( auto larger_best_idx = ArrayArgs::ArgMax(larger_best); best_split_per_leaf_[leaf] = larger_best[larger_best_idx]; } +//fprintf(stderr, "leaving FindBestSplitsFromHistograms\n"); fflush(stderr); } int32_t SerialTreeLearner::ForceSplits(Tree* tree, Json& forced_split_json, int* left_leaf, @@ -559,7 +598,9 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, Json& forced_split_json, int* void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf, int* right_leaf, bool update_cnt) { Common::FunctionTimer fun_timer("SerialTreeLearner::SplitInner", global_timer); + SplitInfo& best_split_info = best_split_per_leaf_[best_leaf]; + const int inner_feature_index = train_data_->InnerFeatureIndex(best_split_info.feature); if (cegb_ != nullptr) { @@ -641,6 +682,7 @@ void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf, CHECK(*right_leaf == next_leaf_id); #endif +fprintf(stderr, "arrived at the assert, leaves = %d %d, sum = %d\n", best_split_info.left_count, best_split_info.right_count, best_split_info.left_count + best_split_info.right_count); 
fflush(stderr); // init the leaves that used on next iteration if (best_split_info.left_count < best_split_info.right_count) { CHECK_GT(best_split_info.left_count, 0); From 1aabb5c989d03544cdd6f3dc3f93dcd449efa4eb Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Sun, 24 May 2020 14:30:53 +0000 Subject: [PATCH 046/119] Initial CUDA work --- src/boosting/gbdt.cpp | 24 ++++++++++++++---------- src/boosting/rf.hpp | 2 ++ src/io/dense_bin.hpp | 4 ++-- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index f02c5d940f1..821b1b46411 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -243,10 +243,10 @@ data_size_t GBDT::BalancedBaggingHelper(data_size_t start, data_size_t cnt, void GBDT::Bagging(int iter) { Common::FunctionTimer fun_timer("GBDT::Bagging", global_timer); // if need bagging -fprintf(stderr, "inside GBDT::Bagging!\n"); fflush(stderr); +fprintf(stderr, "inside GBDT::Bagging, iter = %d\n", iter); fflush(stderr); if ((bag_data_cnt_ < num_data_ && iter % config_->bagging_freq == 0) || need_re_bagging_) { -//fprintf(stderr, "inside GBDT::Bagging, past first hurdle\n"); fflush(stderr); +fprintf(stderr, "inside GBDT::Bagging, past first hurdle\n"); fflush(stderr); need_re_bagging_ = false; auto left_cnt = bagging_runner_.Run( num_data_, @@ -265,9 +265,9 @@ fprintf(stderr, "inside GBDT::Bagging!\n"); fflush(stderr); bag_data_cnt_ = left_cnt; Log::Debug("Re-bagging, using %d data to train", bag_data_cnt_); // set bagging data to tree learner -//fprintf(stderr, "inside GBDT::Bagging, past second hurdle\n"); fflush(stderr); +fprintf(stderr, "inside GBDT::Bagging, past second hurdle\n"); fflush(stderr); if (!is_use_subset_) { -//fprintf(stderr, "inside GBDT::Bagging, calling SetBaggingData\n"); fflush(stderr); +fprintf(stderr, "inside GBDT::Bagging, calling SetBaggingData\n"); fflush(stderr); tree_learner_->SetBaggingData(nullptr, bag_data_indices_.data(), bag_data_cnt_); } else { // LGBM_CUDA // NEW get subset @@ -279,7 +279,7 @@ fprintf(stderr, "inside GBDT::Bagging!\n"); fflush(stderr); tmp_hessians_.resize(total_size); } -//fprintf(stderr, "CopySubrow CP2, bag_data_cnt_ = %d\n", bag_data_cnt_); fflush(stderr); +fprintf(stderr, "CopySubrow CP2, bag_data_cnt_ = %d\n", bag_data_cnt_); fflush(stderr); //char *temp_bag = (char *) bag_data_indices_.data(); //for (int i=0; idevice_type == std::string("cuda")) && (iter_ == 0)) { + if (config_->device_type == std::string("cuda")) { // auto start_time = std::chrono::steady_clock::now(); -//fprintf(stderr, "calling Bagging CP104\n"); fflush(stderr); - Bagging(0); +fprintf(stderr, "calling Bagging CP104\n"); fflush(stderr); + Bagging(iter_); +fprintf(stderr, "back from Bagging CP104\n"); fflush(stderr); } std::vector init_scores(num_tree_per_iteration_, 0.0); @@ -508,8 +509,9 @@ bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { // auto start_time = std::chrono::steady_clock::now(); // bagging logic -//fprintf(stderr, "inside TrainOneIterCUDA CP108\n"); fflush(stderr); +fprintf(stderr, "calling Bagging CP105\n"); fflush(stderr); Bagging(iter_next); +fprintf(stderr, "back from Bagging CP105\n"); fflush(stderr); } } @@ -549,7 +551,9 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { hessians = hessians_.data(); } // bagging logic +fprintf(stderr, "calling Bagging CP106\n"); fflush(stderr); Bagging(iter_); +fprintf(stderr, "back from Bagging CP106\n"); fflush(stderr); bool should_continue = false; for (int cur_tree_id = 0; 
cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { diff --git a/src/boosting/rf.hpp b/src/boosting/rf.hpp index e64bf6cb4d8..6912e0757d6 100644 --- a/src/boosting/rf.hpp +++ b/src/boosting/rf.hpp @@ -102,7 +102,9 @@ class RF : public GBDT { bool TrainOneIter(const score_t* gradients, const score_t* hessians) override { // bagging logic +fprintf(stderr, "calling Bagging in TrainOneIter\n"); fflush(stderr); Bagging(iter_); +fprintf(stderr, "back from Bagging in TrainOneIter\n"); fflush(stderr); CHECK_EQ(gradients, nullptr); CHECK_EQ(hessians, nullptr); diff --git a/src/io/dense_bin.hpp b/src/io/dense_bin.hpp index 803e85a6dab..c1e30c44bc4 100644 --- a/src/io/dense_bin.hpp +++ b/src/io/dense_bin.hpp @@ -424,7 +424,7 @@ class DenseBin : public Bin { const void* memory, const std::vector& local_used_indices) override { const VAL_T* mem_data = reinterpret_cast(memory); -//fprintf(stderr, "inside LoadFromMemory (in src/io/dense_bin.hpp), updating the FEATURE DATA, num_data_ = %d\n", (int) num_data_); +fprintf(stderr, "inside LoadFromMemory (in src/io/dense_bin.hpp), updating the FEATURE DATA, num_data_ = %d\n", (int) num_data_); if (!local_used_indices.empty()) { if (IS_4BIT) { const data_size_t rest = num_data_ & 1; @@ -468,7 +468,7 @@ class DenseBin : public Bin { void CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override { auto other_bin = dynamic_cast*>(full_bin); -//fprintf(stderr, "inside CopySubrow (in src/io/dense_bin.hpp), updating the FEATURE DATA, num_used_indices = %d\n", (int) num_used_indices); +fprintf(stderr, "inside CopySubrow (in src/io/dense_bin.hpp), updating the FEATURE DATA, num_used_indices = %d\n", (int) num_used_indices); if (IS_4BIT) { const data_size_t rest = num_used_indices & 1; for (int i = 0; i < num_used_indices - rest; i += 2) { From f75696ee3205accc41974caad547f66d64448d38 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Thu, 28 May 2020 02:04:58 +0000 Subject: [PATCH 047/119] Initial CUDA work --- src/treelearner/kernels/histogram_16_64_256.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index 42d7c0d4d01..4007a26ba47 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -21,7 +21,7 @@ if (b == gtid && t == ltid) { \ } // atomic add for float number in local memory -inline __device__ void atomic_local_add_f(acc_type *addr, const float val) +inline __device__ void atomic_local_add_f(acc_type *addr, const acc_type val) { atomicAdd(addr, static_cast(val)); } From 0f6df0b7b992d825ccaad5e0d2bd4f6a2cd6b2ba Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Wed, 3 Jun 2020 03:41:05 +0000 Subject: [PATCH 048/119] Initial CUDA work --- include/LightGBM/feature_group.h | 1 - src/boosting/gbdt.cpp | 28 +----- src/io/dataset.cpp | 24 +---- src/io/dense_bin.hpp | 8 -- src/treelearner/cuda_tree_learner.cpp | 45 +++------ src/treelearner/cuda_tree_learner.h | 2 +- .../kernels/histogram_16_64_256.cu | 93 ++----------------- src/treelearner/serial_tree_learner.cpp | 47 ++-------- tests/python_package_test/test_consistency.py | 3 +- 9 files changed, 38 insertions(+), 213 deletions(-) diff --git a/include/LightGBM/feature_group.h b/include/LightGBM/feature_group.h index d5eac42db48..d949beec20e 100644 --- a/include/LightGBM/feature_group.h +++ b/include/LightGBM/feature_group.h @@ -176,7 +176,6 @@ class FeatureGroup { inline void CopySubrow(const 
FeatureGroup* full_feature, const data_size_t* used_indices, data_size_t num_used_indices) { if (!is_multi_val_) { -//fprintf(stderr, "CopySubrow CP1A\n"); fflush(stderr); bin_data_->CopySubrow(full_feature->bin_data_.get(), used_indices, num_used_indices); } else { for (int i = 0; i < num_feature_; ++i) { diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 821b1b46411..3f521ebf333 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -243,10 +243,8 @@ data_size_t GBDT::BalancedBaggingHelper(data_size_t start, data_size_t cnt, void GBDT::Bagging(int iter) { Common::FunctionTimer fun_timer("GBDT::Bagging", global_timer); // if need bagging -fprintf(stderr, "inside GBDT::Bagging, iter = %d\n", iter); fflush(stderr); if ((bag_data_cnt_ < num_data_ && iter % config_->bagging_freq == 0) || need_re_bagging_) { -fprintf(stderr, "inside GBDT::Bagging, past first hurdle\n"); fflush(stderr); need_re_bagging_ = false; auto left_cnt = bagging_runner_.Run( num_data_, @@ -265,9 +263,7 @@ fprintf(stderr, "inside GBDT::Bagging, past first hurdle\n"); fflush(stderr); bag_data_cnt_ = left_cnt; Log::Debug("Re-bagging, using %d data to train", bag_data_cnt_); // set bagging data to tree learner -fprintf(stderr, "inside GBDT::Bagging, past second hurdle\n"); fflush(stderr); if (!is_use_subset_) { -fprintf(stderr, "inside GBDT::Bagging, calling SetBaggingData\n"); fflush(stderr); tree_learner_->SetBaggingData(nullptr, bag_data_indices_.data(), bag_data_cnt_); } else { // LGBM_CUDA // NEW get subset @@ -279,21 +275,11 @@ fprintf(stderr, "inside GBDT::Bagging, calling SetBaggingData\n"); fflush(stderr tmp_hessians_.resize(total_size); } -fprintf(stderr, "CopySubrow CP2, bag_data_cnt_ = %d\n", bag_data_cnt_); fflush(stderr); -//char *temp_bag = (char *) bag_data_indices_.data(); -//for (int i=0; iCopySubrow(train_data_, bag_data_indices_.data(), bag_data_cnt_, false); -//fprintf(stderr, "CopySubrow CP2, calling tree_learner_->ResetTrainingData\n"); fflush(stderr); tree_learner_->ResetTrainingData(tmp_subset_.get(), is_constant_hessian_); -//fprintf(stderr, "CopySubrow CP2, back from tree_learner_->ResetTrainingData\n"); fflush(stderr); } } -fprintf(stderr, "returning from GBDT::Bagging!\n"); fflush(stderr); } void GBDT::Train(int snapshot_freq, const std::string& model_output_path) { @@ -396,16 +382,12 @@ double GBDT::BoostFromAverage(int class_id, bool update_scorer) { // LGBM_CUDA bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { -fprintf(stderr, "inside TrainOneIterCUDA, iter_ = %d\n", iter_); fflush(stderr); - // LGBM_CUDA invoke baggging during the first iteration if (config_->device_type == std::string("cuda")) { // auto start_time = std::chrono::steady_clock::now(); -fprintf(stderr, "calling Bagging CP104\n"); fflush(stderr); Bagging(iter_); -fprintf(stderr, "back from Bagging CP104\n"); fflush(stderr); } std::vector init_scores(num_tree_per_iteration_, 0.0); @@ -425,9 +407,8 @@ fprintf(stderr, "back from Bagging CP104\n"); fflush(stderr); hessians = hessians_.data(); } -//fprintf(stderr, "inside TrainOneIterCUDA CP105, bagging commented out\n"); fflush(stderr); // LGBM_CUDA bagging logic - // Bagging(iter_); // GCF trial and error + // Bagging(iter_); bool should_continue = false; for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { @@ -466,10 +447,8 @@ fprintf(stderr, "back from Bagging CP104\n"); fflush(stderr); // LGBM_CUDA new_tree.reset(tree_learner_->Train(tmp_grad, tmp_hess, is_constant_hessian_, 
forced_splits_json_)); } -//fprintf(stderr, "inside TrainOneIterCUDA, num_leaves = %d\n", new_tree->num_leaves()); fflush(stderr); if (new_tree->num_leaves() > 1) { -//fprintf(stderr, "inside TrainOneIterCUDA CP106, this clause doesn't do bagging\n"); fflush(stderr); should_continue = true; auto score_ptr = train_score_updater_->score() + offset; auto residual_getter = [score_ptr](const label_t* label, int i) {return static_cast(label[i]) - score_ptr[i]; }; @@ -502,16 +481,13 @@ fprintf(stderr, "back from Bagging CP104\n"); fflush(stderr); } // LGBM_CUDA: moved for overlapping data copy w/ other operations -//fprintf(stderr, "inside TrainOneIterCUDA CP107\n"); fflush(stderr); int iter_next = iter_ + 1; if (iter_next < config_->num_iterations) { // auto start_time = std::chrono::steady_clock::now(); // bagging logic -fprintf(stderr, "calling Bagging CP105\n"); fflush(stderr); Bagging(iter_next); -fprintf(stderr, "back from Bagging CP105\n"); fflush(stderr); } } @@ -551,9 +527,7 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { hessians = hessians_.data(); } // bagging logic -fprintf(stderr, "calling Bagging CP106\n"); fflush(stderr); Bagging(iter_); -fprintf(stderr, "back from Bagging CP106\n"); fflush(stderr); bool should_continue = false; for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index b796e1bc2cb..fc71aeb43cd 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -801,7 +801,6 @@ void Dataset::CopySubrow(const Dataset* fullset, const data_size_t* used_indices, data_size_t num_used_indices, bool need_meta_data) { CHECK_EQ(num_used_indices, num_data_); -fprintf(stderr, "CopySubrow CP3, used_indices[5503] = %4d\n", (int) used_indices[5503]); fflush(stderr); OMP_INIT_EX(); #pragma omp parallel for schedule(static) @@ -1282,7 +1281,6 @@ void Dataset::ConstructHistogramsMultiVal(const data_size_t* data_indices, } OMP_THROW_EX(); global_timer.Stop("Dataset::sparse_bin_histogram"); - global_timer.Start("Dataset::sparse_bin_histogram_merge"); int n_bin_block = 1; int bin_block_size = num_bin; @@ -1313,16 +1311,12 @@ void Dataset::ConstructHistogramsInner( score_t* ordered_gradients, score_t* ordered_hessians, TrainingShareStates* share_state, hist_t* hist_data) const { -fprintf(stderr, "CPU "); -if (!USE_INDICES) fprintf(stderr, "IGNORE_INDICES "); -if (!USE_HESSIAN) fprintf(stderr, "CONST_HESSIAN "); -fprintf(stderr, "\n"); fflush(stderr); -//fprintf(stderr, "gradients[2161] = %lf\n", gradients[2161]); fflush(stderr); - if (!share_state->is_colwise) { +fprintf(stderr, "CPU ('multival') hist_data = %p\n", hist_data); fflush(stderr); return ConstructHistogramsMultiVal( data_indices, num_data, gradients, hessians, share_state, hist_data); } +fprintf(stderr, "CPU (not 'multival')\n"); fflush(stderr); std::vector used_dense_group; int multi_val_groud_id = -1; @@ -1345,6 +1339,7 @@ fprintf(stderr, "\n"); fflush(stderr); } } } + int num_used_dense_group = static_cast(used_dense_group.size()); global_timer.Start("Dataset::dense_bin_histogram"); auto ptr_ordered_grad = gradients; @@ -1367,21 +1362,14 @@ fprintf(stderr, "\n"); fflush(stderr); ptr_ordered_grad = ordered_gradients; } } - OMP_INIT_EX(); -if (USE_INDICES) { - //fprintf(stderr, " data_indices = %3d %3d %3d %3d %3d %3d %3d %3d\n", data_indices[0], data_indices[1], data_indices[2], data_indices[3], data_indices[4], data_indices[5], data_indices[6], data_indices[7]); fflush(stderr); - //fprintf(stderr, " gradients = 
%7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf\n", ptr_ordered_grad[0], ptr_ordered_grad[1], ptr_ordered_grad[2], ptr_ordered_grad[3], ptr_ordered_grad[4], ptr_ordered_grad[5], ptr_ordered_grad[6], ptr_ordered_grad[7]); fflush(stderr); - //fprintf(stderr, " hessians = %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf\n", ptr_ordered_hess[0], ptr_ordered_hess[1], ptr_ordered_hess[2], ptr_ordered_hess[3], ptr_ordered_hess[4], ptr_ordered_hess[5], ptr_ordered_hess[6], ptr_ordered_hess[7]); fflush(stderr); -//fprintf(stderr, " offset into return array for gi = 0: %d\n", (int) group_bin_boundaries_[used_dense_group[0]]); fflush(stderr); -} + OMP_INIT_EX(); #pragma omp parallel for schedule(static) num_threads(share_state->num_threads) for (int gi = 0; gi < num_used_dense_group; ++gi) { OMP_LOOP_EX_BEGIN(); int group = used_dense_group[gi]; auto data_ptr = hist_data + group_bin_boundaries_[group] * 2; const int num_bin = feature_groups_[group]->num_total_bin_; -//fprintf(stderr, "gi = %2d, group_bin_boundaries_[%2d] = %4d, num_bin = %d\n", gi, (int) group, (int) group_bin_boundaries_[group], (int) num_bin); std::memset(reinterpret_cast(data_ptr), 0, num_bin * kHistEntrySize); if (USE_HESSIAN) { @@ -1391,10 +1379,8 @@ if (USE_INDICES) { data_ptr); } else { if (gi == 0) { -//fprintf(stderr, " calling core ConstructHistogramDebug\n"); fflush(stderr); feature_groups_[group]->bin_data_->ConstructHistogramDebug( 0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr); -//fprintf(stderr, " back from ConstructHistogramDebug, hist_data = %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf\n", hist_data[0], hist_data[1], hist_data[2], hist_data[3], hist_data[4], hist_data[5]); fflush(stderr); } else { feature_groups_[group]->bin_data_->ConstructHistogram( @@ -1418,7 +1404,6 @@ if (USE_INDICES) { } OMP_THROW_EX(); } -//fprintf(stderr, " leaving 'CPU kernel' hist_data = %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf\n", hist_data[0], hist_data[1], hist_data[2], hist_data[3], hist_data[4], hist_data[5]); fflush(stderr); global_timer.Stop("Dataset::dense_bin_histogram"); if (multi_val_groud_id >= 0) { if (num_used_dense_group > 0) { @@ -1466,7 +1451,6 @@ void Dataset::FixHistogram(int feature_idx, double sum_gradient, const BinMapper* bin_mapper = feature_groups_[group]->bin_mappers_[sub_feature].get(); const int most_freq_bin = bin_mapper->GetMostFreqBin(); -//fprintf(stderr, "in Dataset::FixHistogram, feature_idx = %2d, group = %2d, sub_feature = %d, most_freq_bin = %3d\n", feature_idx, group, sub_feature, most_freq_bin); fflush(stderr); if (most_freq_bin > 0) { const int num_bin = bin_mapper->num_bin(); GET_GRAD(data, most_freq_bin) = sum_gradient; diff --git a/src/io/dense_bin.hpp b/src/io/dense_bin.hpp index c1e30c44bc4..fc0fe8fbd57 100644 --- a/src/io/dense_bin.hpp +++ b/src/io/dense_bin.hpp @@ -109,12 +109,9 @@ class DenseBin : public Bin { hist_t* grad = out; hist_t* hess = out + 1; hist_cnt_t* cnt = reinterpret_cast(hess); -//fprintf(stderr, " inside ConstructHistogramInnerDebug, i = %d\n", i); fflush(stderr); -//fprintf(stderr, " DEBUG: data(5503) = %d\n", data(5503)); for (; i < end; ++i) { const auto idx = i; const auto ti = static_cast(data(idx)) << 1; -//if (ti == 2) fprintf(stderr, " data(%4d) = %4d, adding %7.4lf\n", idx, data(idx), ordered_gradients[i]); fflush(stderr); if (USE_HESSIAN) { grad[ti] += ordered_gradients[i]; hess[ti] += ordered_hessians[i]; @@ -123,7 +120,6 @@ class DenseBin : public Bin { ++cnt[ti]; } } -//fprintf(stderr, " leaving ConstructHistogramInnerDebug, out[2/3] = 
%7.4lf %7.4lf\n", out[2], out[3]); } template @@ -175,10 +171,8 @@ class DenseBin : public Bin { data_size_t end, const score_t* ordered_gradients, const score_t* ordered_hessians, hist_t* out) const { -//fprintf(stderr, " calling ConstructHistogramInnerDebug\n"); fflush(stderr); ConstructHistogramInnerDebug( start, end, ordered_gradients, ordered_hessians, out); -//fprintf(stderr, " back from ConstructHistogramInnerDebug\n"); fflush(stderr); } void ConstructHistogram(const data_size_t* data_indices, data_size_t start, @@ -424,7 +418,6 @@ class DenseBin : public Bin { const void* memory, const std::vector& local_used_indices) override { const VAL_T* mem_data = reinterpret_cast(memory); -fprintf(stderr, "inside LoadFromMemory (in src/io/dense_bin.hpp), updating the FEATURE DATA, num_data_ = %d\n", (int) num_data_); if (!local_used_indices.empty()) { if (IS_4BIT) { const data_size_t rest = num_data_ & 1; @@ -468,7 +461,6 @@ fprintf(stderr, "inside LoadFromMemory (in src/io/dense_bin.hpp), updating the F void CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override { auto other_bin = dynamic_cast*>(full_bin); -fprintf(stderr, "inside CopySubrow (in src/io/dense_bin.hpp), updating the FEATURE DATA, num_used_indices = %d\n", (int) num_used_indices); if (IS_4BIT) { const data_size_t rest = num_used_indices & 1; for (int i = 0; i < num_used_indices - rest; i += 2) { diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index d59c60c3957..b8eca14f9b3 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -76,8 +76,6 @@ void CUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, // Initialize GPU buffers and kernels & LGBM_CUDA: get device info InitGPU(config_->num_gpu); // LGBM_CUDA - - } // some functions used for debugging the GPU histogram construction @@ -238,18 +236,17 @@ void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featu size_t output_size = num_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_; size_t host_output_offset = offset_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_; - //CUDASUCCESS_OR_FATAL(cudaMemcpyAsync((char*)host_histogram_outputs_ + host_output_offset, device_histogram_outputs_[device_id], output_size, cudaMemcpyDeviceToHost, stream_[device_id])); - CUDASUCCESS_OR_FATAL(cudaMemcpy((char*)host_histogram_outputs_ + host_output_offset, device_histogram_outputs_[device_id], output_size, cudaMemcpyDeviceToHost)); + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync((char*)host_histogram_outputs_ + host_output_offset, device_histogram_outputs_[device_id], output_size, cudaMemcpyDeviceToHost, stream_[device_id])); CUDASUCCESS_OR_FATAL(cudaEventRecord(histograms_wait_obj_[device_id], stream_[device_id])); } } template -void CUDATreeLearner::WaitAndGetHistograms(hist_t* histograms) { +void CUDATreeLearner::WaitAndGetHistograms(FeatureHistogram* leaf_histogram_array) { HistType* hist_outputs = (HistType*) host_histogram_outputs_; - //#pragma omp parallel for schedule(static, num_gpu_) + #pragma omp parallel for schedule(static, num_gpu_) for(int device_id = 0; device_id < num_gpu_; ++device_id) { // auto start_time = std::chrono::steady_clock::now(); @@ -265,7 +262,8 @@ void CUDATreeLearner::WaitAndGetHistograms(hist_t* histograms) { continue; } int dense_group_index = dense_feature_group_map_[i]; - auto old_histogram_array = histograms + 
train_data_->GroupBinBoundary(dense_group_index) * 2; + //auto old_histogram_array = histograms + train_data_->GroupBinBoundary(dense_group_index) * 2; + auto old_histogram_array = leaf_histogram_array[dense_group_index].RawData() - kHistOffset; int bin_size = train_data_->FeatureGroupNumBin(dense_group_index); for (int j = 0; j < bin_size; ++j) { @@ -644,7 +642,7 @@ void CUDATreeLearner::ResetTrainingDataInner(const Dataset* train_data, bool is_ int old_num_feature_groups = num_dense_feature_groups_; CountDenseFeatureGroups(); - if ((old_num_data < num_data_) && (old_num_feature_groups < num_dense_feature_groups_)) { + if ((old_num_data < num_data_) || (old_num_feature_groups < num_dense_feature_groups_)) { prevAllocateGPUMemory(); AllocateGPUMemory(); } else { @@ -863,22 +861,13 @@ bool CUDATreeLearner::ConstructGPUHistogramsAsync( return false; } -#if GPU_DEBUG >= 1 - printf("CudaTreeLearner::ConstructGPUHistogramsAsync() Feature masks: "); - for (unsigned int i = 0; i < feature_masks_.size(); ++i) { - printf("%d ", feature_masks_[i]); - } - printf("\n"); - printf("CudaTreeLearner::ConstructGPUHistogramsAsync() %d feature groups, %d used, %d use_all_features\n", num_dense_feature_groups_, used_dense_feature_groups, use_all_features); -#endif - // if not all feature groups are used, we need to transfer the feature mask to GPU // otherwise, we will use a specialized GPU kernel with all feature groups enabled // LGBM_CUDA FIXME: No waiting mark for feature mask // LGBM_CUDA We now copy even if all features are used. - //#pragma omp parallel for schedule(static, num_gpu_) + #pragma omp parallel for schedule(static, num_gpu_) for(int device_id = 0; device_id < num_gpu_; ++device_id) { int offset = offset_gpu_feature_groups_[device_id]; CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_feature_masks_[device_id], ptr_pinned_feature_masks_ + offset, num_gpu_feature_groups_[device_id] , cudaMemcpyHostToDevice, stream_[device_id])); @@ -919,14 +908,12 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - kHistOffset; // Check workgroups per feature4 tuple.. -// GCF Let's try this!!! -// int exp_workgroups_per_feature = GetNumWorkgroupsPerFeature(smaller_leaf_splits_->num_data_in_leaf()); + int exp_workgroups_per_feature = GetNumWorkgroupsPerFeature(smaller_leaf_splits_->num_data_in_leaf()); // if the workgroup per feature is 1 (2^0), return as the work is too small for a GPU -// GCF Let's try this!!! 
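The removed and re-added lines around this point restore the small-leaf guard that an earlier patch had commented out ("GCF Let's try this!!!"): when GetNumWorkgroupsPerFeature() returns 0, only 2^0 = 1 workgroup per feature group would be launched, so the leaf is too small to amortize a kernel launch and the histogram is built on the host through SerialTreeLearner::ConstructHistograms() instead. The host path fills the same interleaved per-bin layout that the ConstructHistogramDebug() helpers added earlier in this series walk with ti = bin << 1. Below is a stand-alone sketch of that layout (plain C++, compilable on its own, not LightGBM code; the bin values, type aliases and sample data are invented, only the 2*bin / 2*bin+1 addressing mirrors the dense-bin code):

    #include <cstdio>
    #include <cstdint>
    #include <vector>

    using hist_t = double;   // stand-in for the accumulator type
    using score_t = float;   // stand-in for the gradient/hessian type

    // Accumulate one (gradient, hessian) pair per bin, interleaved in `out`.
    void ConstructHistogramSketch(const std::vector<uint8_t>& bins,
                                  const std::vector<score_t>& gradients,
                                  const std::vector<score_t>& hessians,
                                  std::vector<hist_t>* out) {  // size = 2 * num_bins
      for (size_t i = 0; i < bins.size(); ++i) {
        const size_t ti = static_cast<size_t>(bins[i]) << 1;   // grad at ti, hess at ti + 1
        (*out)[ti] += gradients[i];
        (*out)[ti + 1] += hessians[i];
      }
    }

    int main() {
      const int num_bins = 4;
      std::vector<uint8_t> bins = {0, 1, 1, 3, 2, 1};
      std::vector<score_t> grad = {0.5f, -0.25f, 0.75f, 1.0f, -0.5f, 0.25f};
      std::vector<score_t> hess(bins.size(), 1.0f);
      std::vector<hist_t> hist(2 * num_bins, 0.0);
      ConstructHistogramSketch(bins, grad, hess, &hist);
      for (int b = 0; b < num_bins; ++b)
        printf("bin %d: grad=%6.2f hess=%4.1f\n", b, hist[2 * b], hist[2 * b + 1]);
      return 0;
    }

In the real code this is the buffer that Dataset::FixHistogram() later adjusts for the most frequent bin.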
-// if (exp_workgroups_per_feature == 0){ -// return SerialTreeLearner::ConstructHistograms(is_feature_used, use_subtract); -// } + if (exp_workgroups_per_feature == 0){ + return SerialTreeLearner::ConstructHistograms(is_feature_used, use_subtract); + } // ConstructGPUHistogramsAsync will return true if there are availabe feature gourps dispatched to GPU bool is_gpu_used = ConstructGPUHistogramsAsync(is_feature_used, @@ -954,11 +941,11 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ if (is_gpu_used) { if (config_->gpu_use_dp) { // use double precision - WaitAndGetHistograms(ptr_smaller_leaf_hist_data); + WaitAndGetHistograms(smaller_leaf_histogram_array_); } else { // use single precision - WaitAndGetHistograms(ptr_smaller_leaf_hist_data); + WaitAndGetHistograms(smaller_leaf_histogram_array_); } } @@ -1072,11 +1059,11 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ if (is_gpu_used) { if (config_->gpu_use_dp) { // use double precision - WaitAndGetHistograms(ptr_larger_leaf_hist_data); + WaitAndGetHistograms(larger_leaf_histogram_array_); } else { // use single precision - WaitAndGetHistograms(ptr_larger_leaf_hist_data); + WaitAndGetHistograms(larger_leaf_histogram_array_); } } } @@ -1119,8 +1106,6 @@ void CUDATreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* righ Log::Fatal("1 Bug in GPU histogram! split %d: %d, smaller_leaf: %d, larger_leaf: %d\n", best_split_info.left_count, best_split_info.right_count, smaller_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf()); } } else { - smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), best_split_info.right_sum_gradient, best_split_info.right_sum_hessian, best_split_info.right_output); - larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, best_split_info.left_sum_hessian, best_split_info.left_output); if ((best_split_info.left_count != larger_leaf_splits_->num_data_in_leaf()) || (best_split_info.right_count!= smaller_leaf_splits_->num_data_in_leaf())) { Log::Fatal("2 Bug in GPU histogram! split %d: %d, smaller_leaf: %d, larger_leaf: %d\n", best_split_info.left_count, best_split_info.right_count, smaller_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf()); diff --git a/src/treelearner/cuda_tree_learner.h b/src/treelearner/cuda_tree_learner.h index a84d6b6662f..7b256345c82 100644 --- a/src/treelearner/cuda_tree_learner.h +++ b/src/treelearner/cuda_tree_learner.h @@ -153,7 +153,7 @@ class CUDATreeLearner: public SerialTreeLearner { * \param histograms Destination of histogram results from GPU. */ template - void WaitAndGetHistograms(hist_t* histograms); + void WaitAndGetHistograms(FeatureHistogram* leaf_histogram_array); /*! * \brief Construct GPU histogram asynchronously. 
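Next comes the histogram kernel cleanup for this patch. Besides dropping most of the gtid-gated tracing, it keeps the indexing rule settled a few patches back: the feature value, the gradient and the hessian are all read through the same row index `ind`, which is the thread's own row when IGNORE_INDICES is defined and data_indices[...] when training on a bagged subset, never a mix of `ind` with the raw thread id or loop counter. The toy kernel below is a stand-alone illustration of that rule in CUDA C++; it is not the patched histogram256 kernel (no shared-memory sub-histograms, no prefetching, no 4-bit packing, no hessian/count handling), and every size and launch parameter in it is made up.

    #include <cstdio>
    #include <vector>
    #include <cuda_runtime.h>

    // Toy grid-stride histogram: both the bin and the gradient are read through `ind`.
    __global__ void toy_histogram(const unsigned char* bins,   // per-row feature bin
                                  const float* gradients,      // per-row gradient
                                  const int* data_indices,     // nullptr => use rows directly
                                  int num_rows,
                                  float* hist) {               // one accumulator per bin
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      const int stride = gridDim.x * blockDim.x;
      for (; i < num_rows; i += stride) {
        const int ind = (data_indices == nullptr) ? i : data_indices[i];
        atomicAdd(&hist[bins[ind]], gradients[ind]);   // both loads go through `ind`
      }
    }

    int main() {
      const int n = 1 << 16, num_bins = 256;
      std::vector<unsigned char> h_bins(n);
      std::vector<float> h_grad(n, 1.0f);
      for (int i = 0; i < n; ++i) h_bins[i] = static_cast<unsigned char>(i % num_bins);

      unsigned char* d_bins; float* d_grad; float* d_hist; int* d_idx;
      cudaMalloc(reinterpret_cast<void**>(&d_bins), n);
      cudaMalloc(reinterpret_cast<void**>(&d_grad), n * sizeof(float));
      cudaMalloc(reinterpret_cast<void**>(&d_hist), num_bins * sizeof(float));
      cudaMalloc(reinterpret_cast<void**>(&d_idx), n * sizeof(int));
      cudaMemcpy(d_bins, h_bins.data(), n, cudaMemcpyHostToDevice);
      cudaMemcpy(d_grad, h_grad.data(), n * sizeof(float), cudaMemcpyHostToDevice);

      // pass 1: rows used in order (the IGNORE_INDICES case)
      cudaMemset(d_hist, 0, num_bins * sizeof(float));
      toy_histogram<<<64, 256>>>(d_bins, d_grad, nullptr, n, d_hist);
      std::vector<float> h_hist(num_bins);
      cudaMemcpy(h_hist.data(), d_hist, num_bins * sizeof(float), cudaMemcpyDeviceToHost);
      printf("bin 0, direct rows:      %.0f (expected %d)\n", h_hist[0], n / num_bins);

      // pass 2: rows gathered through an index array (the bagged-subset case);
      // any permutation gives the same totals when gradients follow the same index
      std::vector<int> h_idx(n);
      for (int i = 0; i < n; ++i) h_idx[i] = n - 1 - i;
      cudaMemcpy(d_idx, h_idx.data(), n * sizeof(int), cudaMemcpyHostToDevice);
      cudaMemset(d_hist, 0, num_bins * sizeof(float));
      toy_histogram<<<64, 256>>>(d_bins, d_grad, d_idx, n, d_hist);
      cudaMemcpy(h_hist.data(), d_hist, num_bins * sizeof(float), cudaMemcpyDeviceToHost);
      printf("bin 0, via data_indices: %.0f (same total)\n", h_hist[0]);

      cudaFree(d_bins); cudaFree(d_grad); cudaFree(d_hist); cudaFree(d_idx);
      return 0;
    }

The same rule holds whether the accumulators are single or double precision; the real kernels additionally stage partial histograms in shared memory and merge them in within_kernel_reduction256x4().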
diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index 4007a26ba47..991444fbf62 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -759,9 +759,7 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ acc_type* __restrict__ local_hist, const size_t power_feature_workgroups) { const ushort ltid = threadIdx.x; -//#ifdef IGNORE_INDICES -// const uint gtid = blockIdx.x * blockDim.x + threadIdx.x; -//#endif + // TODO: try to avoid bank conflict here acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; @@ -775,10 +773,6 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ } ushort i; -//#ifdef IGNORE_INDICES -//if (gtid == 1) printf(" skip_id = %d, grad_bin = %7.4lf\n", skip_id, grad_bin); -//#endif - if (power_feature_workgroups != 0) { // add all sub-histograms for feature const acc_type* __restrict__ p = feature_sub_hist + ltid; @@ -792,14 +786,12 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ p += 3 * NUM_BINS; for (i = i + 1; i < num_sub_hist; ++i) { -//#ifdef IGNORE_INDICES -//if (gtid == 1) printf(" adding %7.4lf\n", *p); -//#endif grad_bin += *p; p += NUM_BINS; hess_bin += *p; p += NUM_BINS; cont_bin += as_acc_int_type(*p); p += NUM_BINS; } } + __syncthreads(); output_buf[ltid * 2 + 0] = grad_bin; @@ -808,13 +800,6 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ #else output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); #endif - -//#ifdef IGNORE_INDICES -//__syncthreads(); -//if (gtid == 1) printf("KERNEL returning %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf\n", output_buf[0], output_buf[1], output_buf[2], output_buf[3], output_buf[4], output_buf[5]); -//__syncthreads(); -//#endif - } #if USE_CONSTANT_BUF == 1 @@ -859,7 +844,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, const ushort lsize = NUM_BINS; // get_local_size(0); const ushort group_id = blockIdx.x; -if (gtid == 5503) { +if (gtid == 2048) { #if USE_CONSTANT_BUF == 1 #ifdef IGNORE_INDICES #if CONST_HESSIAN == 0 @@ -877,17 +862,13 @@ printf("KERNEL USE_CONSTANT_BUF CONST_HESSIAN\n"); #else #ifdef IGNORE_INDICES #if CONST_HESSIAN == 0 -printf("KERNEL IGNORE_INDICES (exp = %d)\n", (int) power_feature_workgroups); +printf("KERNEL IGNORE_INDICES (exp = %d) (feature_size = %d)\n", (int) power_feature_workgroups, (int) feature_size); #else printf("KERNEL IGNORE_INDICES CONST_HESSIAN\n"); #endif #else #if CONST_HESSIAN == 0 -printf("KERNEL (exp = %d)\n", (int) power_feature_workgroups); -//for (int i=0; i<5000; ++i) if (feature_data_base[i] == 1) printf("found '1' in feature_data_base array, at index %d\n", i); -//printf(" data_indices = %3d %3d %3d %3d %3d %3d %3d %3d\n", (int) data_indices[0], (int) data_indices[1], (int) data_indices[2], (int) data_indices[3], (int) data_indices[4], (int) data_indices[5], (int) data_indices[6], (int) data_indices[7]); -//printf(" gradients = %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf\n", ordered_gradients[data_indices[0]], ordered_gradients[data_indices[1]], ordered_gradients[data_indices[2]], ordered_gradients[data_indices[3]], ordered_gradients[data_indices[4]], ordered_gradients[data_indices[5]], ordered_gradients[data_indices[6]], ordered_gradients[data_indices[7]]); -//printf(" hessians = %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf %7.4lf\n", 
ordered_hessians[data_indices[0]], ordered_hessians[data_indices[1]], ordered_hessians[data_indices[2]], ordered_hessians[data_indices[3]], ordered_hessians[data_indices[4]], ordered_hessians[data_indices[5]], ordered_hessians[data_indices[6]], ordered_hessians[data_indices[7]]); +printf("KERNEL (exp = %d) (feature_size = %d)\n", (int) power_feature_workgroups, (int) feature_size); #else printf("KERNEL CONST_HESSIAN\n"); #endif @@ -923,9 +904,6 @@ printf("KERNEL CONST_HESSIAN\n"); // each 2^POWER_FEATURE_WORKGROUPS workgroups process on one feature (compile-time constant) // feature_size is the number of examples per feature const uchar *feature_data = feature_data_base + feature_id * feature_size; -//#ifdef IGNORE_INDICES -//if (gtid == 5503) printf("feature_id = %d, feature_size = %d\n", feature_id, feature_size); -//#endif // size of threads that process this feature4 const uint subglobal_size = lsize * (1 << power_feature_workgroups); @@ -933,16 +911,10 @@ printf("KERNEL CONST_HESSIAN\n"); // equavalent thread ID in this subgroup for this feature4 const uint subglobal_tid = gtid - feature_id * subglobal_size; -//#ifdef IGNORE_INDICES -//if (gtid == 5503) for (int i=0; i<5600; ++i) if(feature_data_base[i] == 1) printf("found 1 at %d in feature_data_BASE\n", i); -//if (gtid == 5503) for (int i=0; i<5600; ++i) if(feature_data[i] == 1) printf("found 1 at %d in feature_data\n", i); -//#endif - data_size_t ind; data_size_t ind_next; #ifdef IGNORE_INDICES ind = subglobal_tid; -//if (gtid == 5503) printf("gtid = %d (0x%08x), subglobal_tid = %d (0x%08x), ind = %d (0x%08x)\n", gtid, gtid, subglobal_tid, subglobal_tid, ind, ind); #else ind = data_indices[subglobal_tid]; #endif @@ -965,11 +937,7 @@ printf("KERNEL CONST_HESSIAN\n"); ushort bin; feature = feature_data[ind >> feature_mask]; -//#ifdef IGNORE_INDICES -//if (gtid == 5503) printf("gtid = %d (0x%08x), ind = %d (0x%08x), feature_mask = %d, feature = %d\n", gtid, gtid, ind, ind, feature_mask, feature_data[ind >> feature_mask]); -//if (gtid == 5503) printf("gtid = %d (0x%08x), ind = %d (0x%08x), feature_mask = %d, BASE feature = %d\n", gtid, gtid, ind, ind, feature_mask, feature_data_base[ind >> feature_mask]); -//#endif - if (feature_mask) { + if (feature_mask) { feature = (feature >> ((ind & 1) << 2)) & 0xf; } bin = feature; @@ -980,15 +948,11 @@ printf("KERNEL CONST_HESSIAN\n"); score_t grad, hess; score_t grad_next, hess_next; // LGBM_CUDA v5.1 -//#ifdef IGNORE_INDICES -//if (gtid == 5503) printf("gtid = %d (0x%08x), gradient by 'i': %lf, gradient by 'subglobal_tid': %lf\n", gtid, gtid, ordered_gradients[gtid], ordered_gradients[subglobal_tid]); -//#endif grad = ordered_gradients[ind]; #if CONST_HESSIAN == 0 hess = ordered_hessians[ind]; #endif - // there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4 for (uint i = subglobal_tid; i < num_data; i += subglobal_size) { // prefetch the next iteration variables @@ -997,7 +961,6 @@ printf("KERNEL CONST_HESSIAN\n"); #ifdef IGNORE_INDICES // we need to check to bounds here ind_next = i_next < num_data ? 
i_next : i; -//if (gtid == 5503) printf("gtid = %d (0x%08x), i = %d (0x%08x), i_next = %d (0x%08x), ind_next = %d (0x%08x)\n", gtid, gtid, i, i, i_next, i_next, ind_next, ind_next); #else ind_next = data_indices[i_next]; #endif @@ -1007,17 +970,10 @@ printf("KERNEL CONST_HESSIAN\n"); #if CONST_HESSIAN == 0 hess_next = ordered_hessians[ind_next]; #endif -//#ifdef IGNORE_INDICES -//if (gtid == 5503) printf("gtid = %d (0x%08x), i = %d (0x%08x), i_next = %d (0x%08x), grad_next = %lf\n", gtid, gtid, i, i, i_next, i_next, grad_next); -//#endif - // STAGE 2: accumulate gradient and hessian if (bin != feature) { addr_bin = gh_hist + bin * 2 + is_hessian_first; #if CONST_HESSIAN == 0 -//#ifdef IGNORE_INDICES -//if (gtid == 5503 && bin == 1) printf("gtid = %d (0x%08x), prepping to accumulate: is_hessian_first = %d\n", gtid, gtid, is_hessian_first); -//#endif acc_type acc_bin = is_hessian_first? hess_bin : grad_bin; atomic_local_add_f(addr_bin, acc_bin); @@ -1030,71 +986,42 @@ printf("KERNEL CONST_HESSIAN\n"); #endif bin = feature; -//#ifdef IGNORE_INDICES -//if (gtid == 5503 && bin == 1) printf("gtid = %d (0x%08x), setting bin = feature 1, grad = %lf, grad_bin = %lf\n", gtid, gtid, grad, grad_bin); -//#endif + grad_bin = grad; hess_bin = hess; } else { -//#ifdef IGNORE_INDICES -//if (gtid == 5503 && bin == 1) printf("gtid = %d (0x%08x), grad = %lf, grad_bin = %lf\n", gtid, gtid, grad, grad_bin); -//#endif + + grad_bin += grad; hess_bin += hess; } // prefetch the next iteration variables feature_next = feature_data[ind_next >> feature_mask]; -//#ifdef IGNORE_INDICES -//if (gtid == 5503) printf("gtid = %d (0x%08x), ind_next = %d (0x%08x), feature_mask = %d, feature = %d\n", gtid, gtid, ind_next, ind_next, feature_mask, feature_data[ind_next >> feature_mask]); -//if (gtid == 5503) printf("gtid = %d (0x%08x), ind_next = %d (0x%08x), feature_mask = %d, BASE feature = %d\n", gtid, gtid, ind_next, ind_next, feature_mask, feature_data_base[ind_next >> feature_mask]); -//#endif // STAGE 3: accumulate counter atomicAdd(cnt_hist + feature, 1); -//#ifdef IGNORE_INDICES -//if (gtid == 5503 && feature == 1) printf("gtid = %d (0x%08x), adding feature 1 to cnt_hist!\n", gtid, gtid); -//#endif // STAGE 4: update next stat grad = grad_next; -//#ifdef IGNORE_INDICES -//if (gtid == 5503 && feature == 1) printf("gtid = %d (0x%08x), moved grad_next to grad = %lf\n", gtid, gtid, grad); -//#endif hess = hess_next; // LGBM_CUDA: v4.2 if (!feature_mask) { -//#ifdef IGNORE_INDICES -//if (gtid == 5503 && feature_next == 1) printf("gtid = %d (0x%08x), moving feature_next 1 into feature 1!\n", gtid, gtid); -//#endif feature = feature_next; } else { feature = (feature_next >> ((ind_next & 1) << 2)) & 0xf; } -//#ifdef IGNORE_INDICES -//if (gtid == 5503) printf("gtid = %d (0x%08x), at end of loop, i = %d, num_data = %d, subglobal_size = %d, feature = %d\n", gtid, gtid, i, num_data, subglobal_size, feature); -//#endif } -//#ifdef IGNORE_INDICES -//if (gtid == 5503 && bin == 1) printf("gtid = %d (0x%08x), grad = %lf\n", gtid, gtid, grad); -//#endif - addr_bin = gh_hist + bin * 2 + is_hessian_first; #if CONST_HESSIAN == 0 -//#ifdef IGNORE_INDICES -//if (gtid == 5503 && bin == 1) printf("gtid = %d (0x%08x), prepping to accumulate: is_hessian_first = %d\n", gtid, gtid, is_hessian_first); -//#endif acc_type acc_bin = is_hessian_first? hess_bin : grad_bin; atomic_local_add_f(addr_bin, acc_bin); addr_bin = addr_bin + 1 - 2 * is_hessian_first; acc_bin = is_hessian_first? 
grad_bin : hess_bin; -//#ifdef IGNORE_INDICES -//if (gtid == 5503 && bin == 1) printf("gtid = %d (0x%08x), adding %lf to offset %d\n", gtid, gtid, acc_bin, (int) (addr_bin - gh_hist)); -//#endif atomic_local_add_f(addr_bin, acc_bin); #elif CONST_HESSIAN == 1 @@ -1183,8 +1110,6 @@ printf("KERNEL CONST_HESSIAN\n"); uint skip_id = group_id - output_offset; // locate output histogram location for this feature4 acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS; - - within_kernel_reduction256x4(feature_subhists, skip_id, old_val, 1 << power_feature_workgroups, hist_buf, (acc_type *)shared_array, power_feature_workgroups); } } diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index ae7bf52ce30..230452d7c78 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -93,7 +93,6 @@ void SerialTreeLearner::GetShareStates(const Dataset* dataset, void SerialTreeLearner::ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) { -//fprintf(stderr, "inside SerialTreeLearner::ResetTrainingDataInner\n"); fflush(stderr); train_data_ = train_data; num_data_ = train_data_->num_data(); CHECK_EQ(num_features_, train_data_->num_features()); @@ -153,9 +152,6 @@ void SerialTreeLearner::ResetConfig(const Config* config) { Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians, bool is_constant_hessian, Json& forced_split_json) { Common::FunctionTimer fun_timer("SerialTreeLearner::Train", global_timer); - -fprintf(stderr, "in SerialTreeLearner::Train\n"); fflush(stderr); -fprintf(stderr, "first few gradients: %lf %lf %lf %lf\n", (double) gradients[0], (double) gradients[1], (double) gradients[2], (double) gradients[3]); gradients_ = gradients; hessians_ = hessians; is_constant_hessian_ = is_constant_hessian; @@ -185,14 +181,13 @@ fprintf(stderr, "first few gradients: %lf %lf %lf %lf\n", (double) gradients[0], int init_splits = 0; bool aborted_last_force_split = false; if (!forced_split_json.is_null()) { -//fprintf(stderr, "we're calling ForceSplits\n"); fflush(stderr); init_splits = ForceSplits(tree.get(), forced_split_json, &left_leaf, &right_leaf, &cur_depth, &aborted_last_force_split); } -//fprintf(stderr, "loop start value = %d, loop end value = %d\n", init_splits, config_->num_leaves - 1); fflush(stderr); for (int split = init_splits; split < config_->num_leaves - 1; ++split) { // some initial works before finding best split + if (BeforeFindBestSplit(tree_prt, left_leaf, right_leaf)) { // find best threshold for every feature FindBestSplits(tree_prt); @@ -207,27 +202,10 @@ fprintf(stderr, "first few gradients: %lf %lf %lf %lf\n", (double) gradients[0], break; } // split tree with best leaf - -//fprintf(stderr, "%3d ", best_split_per_leaf_[0].feature); -//fprintf(stderr, "%3d ", best_split_per_leaf_[0].threshold); -//fprintf(stderr, "%3d ", best_split_per_leaf_[0].left_count); -//fprintf(stderr, "%3d ", best_split_per_leaf_[0].right_count); -//fprintf(stderr, "%3d ", best_split_per_leaf_[0].num_cat_threshold); -//fprintf(stderr, "%8.5lf ", best_split_per_leaf_[0].left_output); -//fprintf(stderr, "%8.5lf ", best_split_per_leaf_[0].right_output); -//fprintf(stderr, "%8.5lf ", best_split_per_leaf_[0].gain); -//fprintf(stderr, "%8.5lf ", best_split_per_leaf_[0].left_sum_gradient); -//fprintf(stderr, "%8.5lf ", best_split_per_leaf_[0].left_sum_hessian); -//fprintf(stderr, "%8.5lf ", best_split_per_leaf_[0].right_sum_gradient); 
-//fprintf(stderr, "%8.5lf ", best_split_per_leaf_[0].right_sum_hessian); -//fprintf(stderr, "\n"); - -//fprintf(stderr, "Calling Split, best_leaf = %d\n", best_leaf); Split(tree_prt, best_leaf, &left_leaf, &right_leaf); cur_depth = std::max(cur_depth, tree->leaf_depth(left_leaf)); } Log::Debug("Trained a tree with leaves = %d and max_depth = %d", tree->num_leaves(), cur_depth); -fprintf(stderr, "Leaving SerialTreeLearner::Train\n"); fflush(stderr); return tree.release(); } @@ -345,7 +323,6 @@ bool SerialTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int void SerialTreeLearner::FindBestSplits(const Tree* tree) { std::vector is_feature_used(num_features_, 0); -//fprintf(stderr, "in FindBestSplits, num_features_ = %d\n", num_features_); fflush(stderr); #pragma omp parallel for schedule(static, 256) if (num_features_ >= 512) for (int feature_index = 0; feature_index < num_features_; ++feature_index) { if (!col_sampler_.is_feature_used_bytree()[feature_index]) continue; @@ -357,23 +334,16 @@ void SerialTreeLearner::FindBestSplits(const Tree* tree) { is_feature_used[feature_index] = 1; } bool use_subtract = parent_leaf_histogram_array_ != nullptr; -//for (int i=0; iConstructHistograms(smaller)\n"); fflush(stderr); train_data_->ConstructHistograms( is_feature_used, smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(), gradients_, hessians_, ordered_gradients_.data(), ordered_hessians_.data(), share_state_.get(), ptr_smaller_leaf_hist_data); -//fprintf(stderr, "back from train_data_->ConstructHistograms(smaller)\n"); fflush(stderr); if (larger_leaf_histogram_array_ != nullptr && !use_subtract) { // construct larger leaf hist_t* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - kHistOffset; -//fprintf(stderr, "calling train_data_->ConstructHistograms(larger)\n"); fflush(stderr); train_data_->ConstructHistograms( is_feature_used, larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(), gradients_, hessians_, ordered_gradients_.data(), ordered_hessians_.data(), share_state_.get(), ptr_larger_leaf_hist_data); -//fprintf(stderr, "back from train_data_->ConstructHistograms(larger)\n"); fflush(stderr); } } @@ -410,9 +376,6 @@ void SerialTreeLearner::FindBestSplitsFromHistograms( const std::vector& is_feature_used, bool use_subtract, const Tree* tree) { Common::FunctionTimer fun_timer( "SerialTreeLearner::FindBestSplitsFromHistograms", global_timer); -fflush(stdout); -fflush(stderr); -//fprintf(stderr, "inside FindBestSplitsFromHistograms, num_threads = %d\n", (int) share_state_->num_threads); fflush(stderr); std::vector smaller_best(share_state_->num_threads); std::vector larger_best(share_state_->num_threads); std::vector smaller_node_used_features = col_sampler_.GetByNode(tree, smaller_leaf_splits_->leaf_index()); @@ -429,6 +392,7 @@ fflush(stderr); continue; } const int tid = omp_get_thread_num(); + train_data_->FixHistogram( feature_index, smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians(), @@ -462,12 +426,12 @@ fflush(stderr); larger_node_used_features[feature_index], larger_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_.get(), &larger_best[tid]); - OMP_LOOP_EX_END(); } OMP_THROW_EX(); auto smaller_best_idx = ArrayArgs::ArgMax(smaller_best); int leaf = smaller_leaf_splits_->leaf_index(); + best_split_per_leaf_[leaf] = smaller_best[smaller_best_idx]; if (larger_leaf_splits_ != nullptr && @@ -476,7 +440,6 @@ fflush(stderr); auto larger_best_idx = ArrayArgs::ArgMax(larger_best); 
best_split_per_leaf_[leaf] = larger_best[larger_best_idx]; } -//fprintf(stderr, "leaving FindBestSplitsFromHistograms\n"); fflush(stderr); } int32_t SerialTreeLearner::ForceSplits(Tree* tree, Json& forced_split_json, int* left_leaf, @@ -682,7 +645,6 @@ void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf, CHECK(*right_leaf == next_leaf_id); #endif -fprintf(stderr, "arrived at the assert, leaves = %d %d, sum = %d\n", best_split_info.left_count, best_split_info.right_count, best_split_info.left_count + best_split_info.right_count); fflush(stderr); // init the leaves that used on next iteration if (best_split_info.left_count < best_split_info.right_count) { CHECK_GT(best_split_info.left_count, 0); @@ -760,6 +722,7 @@ void SerialTreeLearner::ComputeBestSplitForFeature( FeatureHistogram* histogram_array_, int feature_index, int real_fidx, bool is_feature_used, int num_data, const LeafSplits* leaf_splits, SplitInfo* best_split) { + if (!is_feature_used) { return; } @@ -774,9 +737,11 @@ void SerialTreeLearner::ComputeBestSplitForFeature( } else { parent_output = leaf_splits->weight(); } + histogram_array_[feature_index].FindBestThreshold( leaf_splits->sum_gradients(), leaf_splits->sum_hessians(), num_data, constraints_->Get(leaf_splits->leaf_index()), parent_output, &new_split); + new_split.feature = real_fidx; if (cegb_ != nullptr) { new_split.gain -= diff --git a/tests/python_package_test/test_consistency.py b/tests/python_package_test/test_consistency.py index f6e955ee48d..0e3fc509144 100644 --- a/tests/python_package_test/test_consistency.py +++ b/tests/python_package_test/test_consistency.py @@ -39,7 +39,8 @@ def load_cpp_result(self, result_file='LightGBM_predict_result.txt'): def train_predict_check(self, lgb_train, X_test, X_test_fn, sk_pred): params = dict(self.params) - params['force_row_wise'] = True + # KNOWN BUG (the CUDA kernel cannot handle "row wise", so we disable it in this test) + # params['force_row_wise'] = True gbm = lgb.train(params, lgb_train) y_pred = gbm.predict(X_test) cpp_pred = gbm.predict(X_test_fn) From 8fa83181d2021431821b584d494af26b4f79ec2d Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Wed, 3 Jun 2020 15:26:27 +0000 Subject: [PATCH 049/119] Initial CUDA work --- src/boosting/rf.hpp | 2 -- src/io/dataset.cpp | 4 ++-- src/treelearner/feature_histogram.hpp | 4 ++-- src/treelearner/kernels/histogram_16_64_256.cu | 3 --- 4 files changed, 4 insertions(+), 9 deletions(-) diff --git a/src/boosting/rf.hpp b/src/boosting/rf.hpp index 6912e0757d6..e64bf6cb4d8 100644 --- a/src/boosting/rf.hpp +++ b/src/boosting/rf.hpp @@ -102,9 +102,7 @@ class RF : public GBDT { bool TrainOneIter(const score_t* gradients, const score_t* hessians) override { // bagging logic -fprintf(stderr, "calling Bagging in TrainOneIter\n"); fflush(stderr); Bagging(iter_); -fprintf(stderr, "back from Bagging in TrainOneIter\n"); fflush(stderr); CHECK_EQ(gradients, nullptr); CHECK_EQ(hessians, nullptr); diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index fc71aeb43cd..ccdf0b21576 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -1312,11 +1312,11 @@ void Dataset::ConstructHistogramsInner( TrainingShareStates* share_state, hist_t* hist_data) const { if (!share_state->is_colwise) { -fprintf(stderr, "CPU ('multival') hist_data = %p\n", hist_data); fflush(stderr); +//fprintf(stderr, "CPU ('multival') hist_data = %p\n", hist_data); fflush(stderr); return ConstructHistogramsMultiVal( data_indices, num_data, gradients, hessians, share_state, hist_data); } 
-fprintf(stderr, "CPU (not 'multival')\n"); fflush(stderr); +//fprintf(stderr, "CPU (not 'multival')\n"); fflush(stderr); std::vector used_dense_group; int multi_val_groud_id = -1; diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp index bf3d81c53d8..c7371d6a31a 100644 --- a/src/treelearner/feature_histogram.hpp +++ b/src/treelearner/feature_histogram.hpp @@ -1201,8 +1201,8 @@ class HistogramPool { for (int i = old_cache_size; i < cache_size; ++i) { OMP_LOOP_EX_BEGIN(); pool_[i].reset(new FeatureHistogram[train_data->num_features()]); - //data_[i].resize(num_total_bin * 2); - data_[i].resize(num_total_bin * 3); // GCF HACK to avoid mysterious core dumps + data_[i].resize(num_total_bin * 2); + //data_[i].resize(num_total_bin * 3); // GCF HACK to avoid mysterious core dumps for (int j = 0; j < train_data->num_features(); ++j) { pool_[i][j].Init(data_[i].data() + offsets[j] * 2, &feature_metas_[j]); } diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index 991444fbf62..8828e55ac11 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -986,13 +986,10 @@ printf("KERNEL CONST_HESSIAN\n"); #endif bin = feature; - grad_bin = grad; hess_bin = hess; } else { - - grad_bin += grad; hess_bin += hess; } From f70beb7470869ff4fb5498f0b9c99cb236e555b8 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Wed, 3 Jun 2020 17:37:05 +0000 Subject: [PATCH 050/119] Initial CUDA work --- src/io/dataset.cpp | 4 ++++ tests/python_package_test/test_consistency.py | 3 +-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index ccdf0b21576..820e34e5856 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -627,6 +627,10 @@ TrainingShareStates* Dataset::GetShareStates( share_state->is_constant_hessian = is_constant_hessian; return share_state; } +#ifdef USE_CUDA + force_colwise = 1; + force_rowwise = 0; +#endif if (force_colwise) { TrainingShareStates* share_state = new TrainingShareStates(); share_state->SetMultiValBin(GetMultiBinFromSparseFeatures()); diff --git a/tests/python_package_test/test_consistency.py b/tests/python_package_test/test_consistency.py index 0e3fc509144..f6e955ee48d 100644 --- a/tests/python_package_test/test_consistency.py +++ b/tests/python_package_test/test_consistency.py @@ -39,8 +39,7 @@ def load_cpp_result(self, result_file='LightGBM_predict_result.txt'): def train_predict_check(self, lgb_train, X_test, X_test_fn, sk_pred): params = dict(self.params) - # KNOWN BUG (the CUDA kernel cannot handle "row wise", so we disable it in this test) - # params['force_row_wise'] = True + params['force_row_wise'] = True gbm = lgb.train(params, lgb_train) y_pred = gbm.predict(X_test) cpp_pred = gbm.predict(X_test_fn) From af49c3202e8e89e12b61fd34b8113898e045a571 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Thu, 4 Jun 2020 14:44:15 +0000 Subject: [PATCH 051/119] Initial CUDA work --- src/treelearner/cuda_tree_learner.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index b8eca14f9b3..78182ac2a90 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -206,7 +206,7 @@ void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featu if (num_workgroups > preallocd_max_num_wg_[device_id]) { preallocd_max_num_wg_.at(device_id) = 
num_workgroups; CUDASUCCESS_OR_FATAL(cudaFree(device_subhistograms_[device_id])); - cudaMalloc(&(device_subhistograms_[device_id]), (size_t) num_workgroups * dword_features_ * device_bin_size_ * (3 * hist_bin_entry_sz_ / 2)); + CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_subhistograms_[device_id]), (size_t) num_workgroups * dword_features_ * device_bin_size_ * (3 * hist_bin_entry_sz_ / 2))); } //set thread_data SetThreadData(thread_data, device_id, histogram_size_, leaf_num_data, use_all_features, @@ -396,7 +396,7 @@ void CUDATreeLearner::AllocateGPUMemory() { // copy indices to the device - if (device_feature_masks_[device_id] != NULL){ + if (device_data_indices_[device_id] != NULL){ CUDASUCCESS_OR_FATAL(cudaFree(device_data_indices_[device_id])); } @@ -453,6 +453,7 @@ void CUDATreeLearner::copyDenseFeature() { // set device info int device_id = 0; uint8_t* device_features = device_features_[device_id]; + CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); Log::Debug("Started copying dense features from CPU to GPU - 1"); for (int i = 0; i < num_feature_groups_; ++i) { @@ -499,6 +500,9 @@ void CUDATreeLearner::InitGPU(int num_gpu) { printf("bin_size: "); #endif for (int i = 0; i < num_feature_groups_; ++i) { + if (train_data_->IsMultiGroup(i)) { + continue; + } #if GPU_DEBUG >= 1 printf("%d, ", train_data_->FeatureGroupNumBin(i)); #endif From 038128ddde974ca1ff8eee1f2c62f82db5b25363 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Thu, 4 Jun 2020 17:39:30 +0000 Subject: [PATCH 052/119] Initial CUDA work --- include/LightGBM/bin.h | 5 --- src/io/dataset.cpp | 10 ++---- src/io/dense_bin.hpp | 30 ------------------ src/io/sparse_bin.hpp | 31 ------------------- .../kernels/histogram_16_64_256.cu | 4 ++- src/treelearner/serial_tree_learner.cpp | 1 - 6 files changed, 5 insertions(+), 76 deletions(-) diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index c09cde3c809..96ae6a8d641 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -309,11 +309,6 @@ class Bin { * \param out Output Result */ - virtual void ConstructHistogramDebug( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, const score_t* ordered_hessians, - hist_t* out) const = 0; - virtual void ConstructHistogram( const data_size_t* data_indices, data_size_t start, data_size_t end, const score_t* ordered_gradients, const score_t* ordered_hessians, diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 820e34e5856..0f7e0401285 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -1382,14 +1382,8 @@ void Dataset::ConstructHistogramsInner( data_indices, 0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr); } else { - if (gi == 0) { - feature_groups_[group]->bin_data_->ConstructHistogramDebug( - 0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr); - } - else { - feature_groups_[group]->bin_data_->ConstructHistogram( - 0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr); - } + feature_groups_[group]->bin_data_->ConstructHistogram( + 0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr); } } else { if (USE_INDICES) { diff --git a/src/io/dense_bin.hpp b/src/io/dense_bin.hpp index fc0fe8fbd57..99feadf9f7f 100644 --- a/src/io/dense_bin.hpp +++ b/src/io/dense_bin.hpp @@ -100,28 +100,6 @@ class DenseBin : public Bin { BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override; - template - void ConstructHistogramInnerDebug(data_size_t start, data_size_t end, - const score_t* ordered_gradients, - const score_t* ordered_hessians, - 
hist_t* out) const { - data_size_t i = start; - hist_t* grad = out; - hist_t* hess = out + 1; - hist_cnt_t* cnt = reinterpret_cast(hess); - for (; i < end; ++i) { - const auto idx = i; - const auto ti = static_cast(data(idx)) << 1; - if (USE_HESSIAN) { - grad[ti] += ordered_gradients[i]; - hess[ti] += ordered_hessians[i]; - } else { - grad[ti] += ordered_gradients[i]; - ++cnt[ti]; - } - } - } - template void ConstructHistogramInner(const data_size_t* data_indices, data_size_t start, data_size_t end, @@ -167,14 +145,6 @@ class DenseBin : public Bin { } } - void ConstructHistogramDebug(data_size_t start, - data_size_t end, const score_t* ordered_gradients, - const score_t* ordered_hessians, - hist_t* out) const { - ConstructHistogramInnerDebug( - start, end, ordered_gradients, ordered_hessians, out); - } - void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end, const score_t* ordered_gradients, const score_t* ordered_hessians, diff --git a/src/io/sparse_bin.hpp b/src/io/sparse_bin.hpp index 74cdb08c82b..c56cd6da99d 100644 --- a/src/io/sparse_bin.hpp +++ b/src/io/sparse_bin.hpp @@ -98,37 +98,6 @@ class SparseBin : public Bin { hist[ti] += g; \ hist[ti + 1] += h; - void ConstructHistogramDebug(data_size_t start, - data_size_t end, const score_t* ordered_gradients, - const score_t* ordered_hessians, - hist_t* out) const { - data_size_t i_delta, cur_pos; - InitIndex(start, &i_delta, &cur_pos); - data_size_t i = start; - for (;;) { - if (cur_pos < i) { - cur_pos += deltas_[++i_delta]; - if (i_delta >= num_vals_) { - break; - } - } else if (cur_pos > i) { - if (++i >= end) { - break; - } - } else { - const VAL_T bin = vals_[i_delta]; - ACC_GH(out, bin, ordered_gradients[i], ordered_hessians[i]); - if (++i >= end) { - break; - } - cur_pos += deltas_[++i_delta]; - if (i_delta >= num_vals_) { - break; - } - } - } - } - void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end, const score_t* ordered_gradients, const score_t* ordered_hessians, diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index 8828e55ac11..7002ac71ded 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -844,7 +844,8 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, const ushort lsize = NUM_BINS; // get_local_size(0); const ushort group_id = blockIdx.x; -if (gtid == 2048) { +#if 0 +if (gtid == 0) { #if USE_CONSTANT_BUF == 1 #ifdef IGNORE_INDICES #if CONST_HESSIAN == 0 @@ -875,6 +876,7 @@ printf("KERNEL CONST_HESSIAN\n"); #endif #endif } +#endif // local memory per workgroup is 3 KB // clear local memory diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 230452d7c78..e5b6626a6bd 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -337,7 +337,6 @@ void SerialTreeLearner::FindBestSplits(const Tree* tree) { #ifdef USE_CUDA if (LGBM_config_::current_learner == use_cpu_learner){ - Log::Info("LightGBM-CUDA using CPU ConstructHistograms()"); SerialTreeLearner::ConstructHistograms(is_feature_used, use_subtract); } else{ From 3fd7618d7aca3381582fe9b95e03b989e308e835 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Thu, 4 Jun 2020 20:43:02 +0000 Subject: [PATCH 053/119] Initial CUDA work --- include/LightGBM/config.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h 
index 9622814832b..562ae79d388 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -955,8 +955,8 @@ struct Config { bool gpu_use_dp = false; // desc = number of gpus (CUDA implementation only) LGBM_CUDA - // desc = default value is 1 - int num_gpu = 1; + // desc = default value is 4 + int num_gpu = 4; #pragma endregion From 7e692c24b8cb50f1030e7d450aff16a336202eaf Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Thu, 4 Jun 2020 21:07:26 +0000 Subject: [PATCH 054/119] Initial CUDA work --- include/LightGBM/config.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 562ae79d388..51e4ea44c32 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -956,7 +956,11 @@ struct Config { // desc = number of gpus (CUDA implementation only) LGBM_CUDA // desc = default value is 4 +#ifdef USE_CUDA int num_gpu = 4; +#else + int num_gpu = 1; +#endif #pragma endregion From b27b7e1fa4f6f2583f3620604455bd354fe306f7 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Thu, 4 Jun 2020 22:47:54 +0000 Subject: [PATCH 055/119] Initial CUDA work --- include/LightGBM/config.h | 4 +--- src/io/config_auto.cpp | 4 ++++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 51e4ea44c32..c7be0e89884 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -954,12 +954,10 @@ struct Config { // desc = set this to ``true`` to use double precision math on GPU (by default single precision is used) bool gpu_use_dp = false; +#ifdef USE_CUDA // desc = number of gpus (CUDA implementation only) LGBM_CUDA // desc = default value is 4 -#ifdef USE_CUDA int num_gpu = 4; -#else - int num_gpu = 1; #endif #pragma endregion diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index 46d95b0df8f..9408a97c70f 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -294,7 +294,9 @@ const std::unordered_set& Config::parameter_set() { "gpu_platform_id", "gpu_device_id", "gpu_use_dp", +#ifdef USE_CUDA "num_gpu", /* LGBM_CUDA */ +#endif }); return params; } @@ -613,9 +615,11 @@ void Config::GetMembersFromString(const std::unordered_map 0); +#endif } From 80a8f43e18ee0bb1643f40141f11d610aa61f3de Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Mon, 8 Jun 2020 14:30:32 +0000 Subject: [PATCH 056/119] Initial CUDA work --- include/LightGBM/config.h | 4 ++-- src/io/dataset.cpp | 2 -- src/treelearner/cuda_tree_learner.cpp | 11 ++--------- src/treelearner/feature_histogram.hpp | 1 - 4 files changed, 4 insertions(+), 14 deletions(-) diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index c7be0e89884..162c7583dc7 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -956,8 +956,8 @@ struct Config { #ifdef USE_CUDA // desc = number of gpus (CUDA implementation only) LGBM_CUDA - // desc = default value is 4 - int num_gpu = 4; + // desc = default value is 1 + int num_gpu = 1; #endif #pragma endregion diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 0f7e0401285..c96a83516bf 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -1316,11 +1316,9 @@ void Dataset::ConstructHistogramsInner( TrainingShareStates* share_state, hist_t* hist_data) const { if (!share_state->is_colwise) { -//fprintf(stderr, "CPU ('multival') hist_data = %p\n", hist_data); fflush(stderr); return ConstructHistogramsMultiVal( data_indices, num_data, gradients, hessians, share_state, hist_data); } -//fprintf(stderr, "CPU (not 
'multival')\n"); fflush(stderr); std::vector used_dense_group; int multi_val_groud_id = -1; diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 78182ac2a90..f488e5c4a75 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -919,26 +919,19 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ return SerialTreeLearner::ConstructHistograms(is_feature_used, use_subtract); } - // ConstructGPUHistogramsAsync will return true if there are availabe feature gourps dispatched to GPU + // ConstructGPUHistogramsAsync will return true if there are availabe feature groups dispatched to GPU bool is_gpu_used = ConstructGPUHistogramsAsync(is_feature_used, nullptr, smaller_leaf_splits_->num_data_in_leaf()); // then construct sparse features on CPU // We set data_indices to null to avoid rebuilding ordered gradients/hessians if (num_sparse_features > 0){ -// train_data_->ConstructHistograms(is_sparse_feature_used, -// nullptr, smaller_leaf_splits_->num_data_in_leaf(), -// smaller_leaf_splits_->leaf_index(), -// ordered_bins_, gradients_, hessians_, -// ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_, -// ptr_smaller_leaf_hist_data); - train_data_->ConstructHistograms(is_sparse_feature_used, + train_data_->ConstructHistograms(is_sparse_feature_used, smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(), gradients_, hessians_, ordered_gradients_.data(), ordered_hessians_.data(), share_state_.get(), ptr_smaller_leaf_hist_data); - } // wait for GPU to finish, only if GPU is actually used diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp index c7371d6a31a..8916ee48fd4 100644 --- a/src/treelearner/feature_histogram.hpp +++ b/src/treelearner/feature_histogram.hpp @@ -1202,7 +1202,6 @@ class HistogramPool { OMP_LOOP_EX_BEGIN(); pool_[i].reset(new FeatureHistogram[train_data->num_features()]); data_[i].resize(num_total_bin * 2); - //data_[i].resize(num_total_bin * 3); // GCF HACK to avoid mysterious core dumps for (int j = 0; j < train_data->num_features(); ++j) { pool_[i][j].Init(data_[i].data() + offsets[j] * 2, &feature_metas_[j]); } From baf6f792495e68cdb87945392f7a3dd4cbcfaf8a Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Mon, 8 Jun 2020 15:32:49 +0000 Subject: [PATCH 057/119] Initial CUDA work --- src/boosting/gbdt.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 3f521ebf333..24264c3c175 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -382,8 +382,8 @@ double GBDT::BoostFromAverage(int class_id, bool update_scorer) { // LGBM_CUDA bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { - // LGBM_CUDA invoke baggging during the first iteration - if (config_->device_type == std::string("cuda")) { + // LGBM_CUDA invoke bagging during the first iteration + if (config_->device_type == std::string("cuda") && (iter_ == 0)) { // auto start_time = std::chrono::steady_clock::now(); From 944a3e57a93c56f9e3c5c243e5e392f4092059b4 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 8 Jun 2020 16:33:30 +0000 Subject: [PATCH 058/119] Initial CUDA work --- src/treelearner/cuda_tree_learner.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index f488e5c4a75..f45319ae818 100644 --- 
a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -297,7 +297,7 @@ void CUDATreeLearner::prevAllocateGPUMemory() { // leave some safe margin for prefetching // 256 work-items per workgroup. Each work-item prefetches one tuple for that feature - allocated_num_data_ = num_data_ + 256 * (1 << kMaxLogWorkgroupsPerFeature); + allocated_num_data_ = std::max(num_data_ + 256 * (1 << kMaxLogWorkgroupsPerFeature), allocated_num_data_); // clear sparse/dense maps @@ -594,6 +594,7 @@ void CUDATreeLearner::InitGPU(int num_gpu) { CUDASUCCESS_OR_FATAL(cudaEventCreate(&(histograms_wait_obj_[i]))); } + allocated_num_data_ = 0; prevAllocateGPUMemory(); AllocateGPUMemory(); @@ -626,7 +627,7 @@ Tree* CUDATreeLearner::Train(const score_t* gradients, const score_t *hessians, void CUDATreeLearner::ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) { // LGBM_CUDA: check data size - data_size_t old_num_data = num_data_; + data_size_t old_allocated_num_data = allocated_num_data_; SerialTreeLearner::ResetTrainingDataInner(train_data, is_constant_hessian, reset_multi_val_bin); @@ -646,7 +647,7 @@ void CUDATreeLearner::ResetTrainingDataInner(const Dataset* train_data, bool is_ int old_num_feature_groups = num_dense_feature_groups_; CountDenseFeatureGroups(); - if ((old_num_data < num_data_) || (old_num_feature_groups < num_dense_feature_groups_)) { + if ((old_allocated_num_data < (num_data_ + 256 * (1 << kMaxLogWorkgroupsPerFeature))) || (old_num_feature_groups < num_dense_feature_groups_)) { prevAllocateGPUMemory(); AllocateGPUMemory(); } else { From f34ec350b2d7a39fc56ea0dc23e7707e50be37e4 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Mon, 8 Jun 2020 17:13:25 +0000 Subject: [PATCH 059/119] Initial CUDA work --- src/io/config.cpp | 10 ++++++++++ src/io/dataset.cpp | 4 ---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/io/config.cpp b/src/io/config.cpp index 963ef084578..18c0562a676 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -321,11 +321,21 @@ void Config::CheckParamConflict() { num_leaves = static_cast(full_num_leaves); } } + // force col-wise for gpu if (device_type == std::string("gpu")) { force_col_wise = true; force_row_wise = false; } + +#ifdef USE_CUDA + // force col-wise for CUDA + if (device_type == std::string("cuda")) { + force_col_wise = true; + force_row_wise = false; + } +#endif + // min_data_in_leaf must be at least 2 if path smoothing is active. This is because when the split is calculated // the count is calculated using the proportion of hessian in the leaf which is rounded up to nearest int, so it can // be 1 when there is actually no data in the leaf. 
In rare cases this can cause a bug because with path smoothing the diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index c96a83516bf..edae575f345 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -627,10 +627,6 @@ TrainingShareStates* Dataset::GetShareStates( share_state->is_constant_hessian = is_constant_hessian; return share_state; } -#ifdef USE_CUDA - force_colwise = 1; - force_rowwise = 0; -#endif if (force_colwise) { TrainingShareStates* share_state = new TrainingShareStates(); share_state->SetMultiValBin(GetMultiBinFromSparseFeatures()); From 44ce402d27b99ce828d92498399af7383339b1db Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Mon, 8 Jun 2020 17:17:17 +0000 Subject: [PATCH 060/119] Initial CUDA work --- .../kernels/histogram_16_64_256.cu | 34 ------------------- 1 file changed, 34 deletions(-) diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index 7002ac71ded..a0780f913c9 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -844,40 +844,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, const ushort lsize = NUM_BINS; // get_local_size(0); const ushort group_id = blockIdx.x; -#if 0 -if (gtid == 0) { -#if USE_CONSTANT_BUF == 1 -#ifdef IGNORE_INDICES -#if CONST_HESSIAN == 0 -printf("KERNEL USE_CONSTANT_BUF IGNORE_INDICES\n"); -#else -printf("KERNEL USE_CONSTANT_BUF IGNORE_INDICES CONST_HESSIAN\n"); -#endif -#else -#if CONST_HESSIAN == 0 -printf("KERNEL USE_CONSTANT_BUF \n"); -#else -printf("KERNEL USE_CONSTANT_BUF CONST_HESSIAN\n"); -#endif -#endif -#else -#ifdef IGNORE_INDICES -#if CONST_HESSIAN == 0 -printf("KERNEL IGNORE_INDICES (exp = %d) (feature_size = %d)\n", (int) power_feature_workgroups, (int) feature_size); -#else -printf("KERNEL IGNORE_INDICES CONST_HESSIAN\n"); -#endif -#else -#if CONST_HESSIAN == 0 -printf("KERNEL (exp = %d) (feature_size = %d)\n", (int) power_feature_workgroups, (int) feature_size); -#else -printf("KERNEL CONST_HESSIAN\n"); -#endif -#endif -#endif -} -#endif - // local memory per workgroup is 3 KB // clear local memory uint *ptr = (uint *) shared_array; From d7e34de5ef668e5c1da8a2bba3add18ac5fcd69d Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Tue, 9 Jun 2020 13:12:19 +0000 Subject: [PATCH 061/119] Initial CUDA work --- src/c_api.cpp | 168 +++++++++----------------------------------------- 1 file changed, 28 insertions(+), 140 deletions(-) diff --git a/src/c_api.cpp b/src/c_api.cpp index 03a3db597bb..6a54b5f1788 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -657,6 +657,20 @@ int LGBM_GetDeviceType() { #endif } +//LGBM_CUDA +void AdditionalConfig(Config *config) +{ +#ifdef USE_CUDA + if (config->device_type == std::string("cuda")){ + LightGBM::LGBM_config_::current_device=lgbm_device_cuda; + + config->is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ + if (config->bagging_fraction == 1.0){config->bagging_fraction = 0.8;} + if (config->bagging_freq == 0) {config->bagging_freq = 1;} + } +#endif +} + int LGBM_DatasetCreateFromFile(const char* filename, const char* parameters, const DatasetHandle reference, @@ -669,16 +683,7 @@ int LGBM_DatasetCreateFromFile(const char* filename, omp_set_num_threads(config.num_threads); } -//LGBM_CUDA -#ifdef USE_CUDA - if (config.device_type == std::string("cuda")){ - LightGBM::LGBM_config_::current_device=lgbm_device_cuda; - - config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is 
true) */ - if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} - if (config.bagging_freq == 0) {config.bagging_freq = 1;} - } -#endif + AdditionalConfig(&config); DatasetLoader loader(config, nullptr, 1, filename); if (reference == nullptr) { @@ -711,16 +716,7 @@ int LGBM_DatasetCreateFromSampledColumn(double** sample_data, omp_set_num_threads(config.num_threads); } -//LGBM_CUDA -#ifdef USE_CUDA - if (config.device_type == std::string("cuda")){ - LightGBM::LGBM_config_::current_device=lgbm_device_cuda; - - config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ - if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} - if (config.bagging_freq == 0) {config.bagging_freq = 1;} - } -#endif + AdditionalConfig(&config); DatasetLoader loader(config, nullptr, 1, nullptr); *out = loader.CostructFromSampleData(sample_data, sample_indices, ncol, num_per_col, @@ -834,16 +830,7 @@ int LGBM_DatasetCreateFromMats(int32_t nmat, omp_set_num_threads(config.num_threads); } -//LGBM_CUDA -#ifdef USE_CUDA - if (config.device_type == std::string("cuda")){ - LightGBM::LGBM_config_::current_device=lgbm_device_cuda; - - config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ - if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} - if (config.bagging_freq == 0) {config.bagging_freq = 1;} - } -#endif + AdditionalConfig(&config); std::unique_ptr ret; int32_t total_nrow = 0; @@ -937,16 +924,7 @@ int LGBM_DatasetCreateFromCSR(const void* indptr, omp_set_num_threads(config.num_threads); } -//LGBM_CUDA -#ifdef USE_CUDA - if (config.device_type == std::string("cuda")){ - LightGBM::LGBM_config_::current_device=lgbm_device_cuda; - - config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ - if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} - if (config.bagging_freq == 0) {config.bagging_freq = 1;} - } -#endif + AdditionalConfig(&config); std::unique_ptr ret; auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, nelem); @@ -1016,16 +994,7 @@ int LGBM_DatasetCreateFromCSRFunc(void* get_row_funptr, omp_set_num_threads(config.num_threads); } -//LGBM_CUDA -#ifdef USE_CUDA - if (config.device_type == std::string("cuda")){ - LightGBM::LGBM_config_::current_device=lgbm_device_cuda; - - config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ - if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} - if (config.bagging_freq == 0) {config.bagging_freq = 1;} - } -#endif + AdditionalConfig(&config); std::unique_ptr ret; int32_t nrow = num_rows; @@ -1099,16 +1068,7 @@ int LGBM_DatasetCreateFromCSC(const void* col_ptr, omp_set_num_threads(config.num_threads); } -//LGBM_CUDA -#ifdef USE_CUDA - if (config.device_type == std::string("cuda")){ - LightGBM::LGBM_config_::current_device=lgbm_device_cuda; - - config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ - if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} - if (config.bagging_freq == 0) {config.bagging_freq = 1;} - } -#endif + AdditionalConfig(&config); std::unique_ptr ret; int32_t nrow = static_cast(num_row); @@ -1194,16 +1154,7 @@ int LGBM_DatasetGetSubset( omp_set_num_threads(config.num_threads); } -//LGBM_CUDA -#ifdef USE_CUDA - if (config.device_type == std::string("cuda")){ - 
LightGBM::LGBM_config_::current_device=lgbm_device_cuda; - - config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ - if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} - if (config.bagging_freq == 0) {config.bagging_freq = 1;} - } -#endif + AdditionalConfig(&config); auto full_dataset = reinterpret_cast(handle); CHECK_GT(num_used_row_indices, 0); @@ -1601,16 +1552,7 @@ int LGBM_BoosterPredictForFile(BoosterHandle handle, omp_set_num_threads(config.num_threads); } -//LGBM_CUDA -#ifdef USE_CUDA - if (config.device_type == std::string("cuda")){ - LightGBM::LGBM_config_::current_device=lgbm_device_cuda; - - config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ - if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} - if (config.bagging_freq == 0) {config.bagging_freq = 1;} - } -#endif + AdditionalConfig(&config); Booster* ref_booster = reinterpret_cast(handle); ref_booster->Predict(num_iteration, predict_type, data_filename, data_has_header, @@ -1657,16 +1599,7 @@ int LGBM_BoosterPredictForCSR(BoosterHandle handle, omp_set_num_threads(config.num_threads); } -//LGBM_CUDA -#ifdef USE_CUDA - if (config.device_type == std::string("cuda")){ - LightGBM::LGBM_config_::current_device=lgbm_device_cuda; - - config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ - if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} - if (config.bagging_freq == 0) {config.bagging_freq = 1;} - } -#endif + AdditionalConfig(&config); Booster* ref_booster = reinterpret_cast(handle); auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, nelem); @@ -1703,16 +1636,7 @@ int LGBM_BoosterPredictForCSRSingleRow(BoosterHandle handle, omp_set_num_threads(config.num_threads); } -//LGBM_CUDA -#ifdef USE_CUDA - if (config.device_type == std::string("cuda")){ - LightGBM::LGBM_config_::current_device=lgbm_device_cuda; - - config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ - if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} - if (config.bagging_freq == 0) {config.bagging_freq = 1;} - } -#endif + AdditionalConfig(&config); Booster* ref_booster = reinterpret_cast(handle); auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, nelem); @@ -1744,16 +1668,7 @@ int LGBM_BoosterPredictForCSC(BoosterHandle handle, omp_set_num_threads(config.num_threads); } -//LGBM_CUDA -#ifdef USE_CUDA - if (config.device_type == std::string("cuda")){ - LightGBM::LGBM_config_::current_device=lgbm_device_cuda; - - config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ - if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} - if (config.bagging_freq == 0) {config.bagging_freq = 1;} - } -#endif + AdditionalConfig(&config); int num_threads = OMP_NUM_THREADS(); int ncol = static_cast(ncol_ptr - 1); @@ -1800,16 +1715,7 @@ int LGBM_BoosterPredictForMat(BoosterHandle handle, omp_set_num_threads(config.num_threads); } -//LGBM_CUDA -#ifdef USE_CUDA - if (config.device_type == std::string("cuda")){ - LightGBM::LGBM_config_::current_device=lgbm_device_cuda; - - config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ - if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} - if (config.bagging_freq == 0) {config.bagging_freq = 1;} - } 
-#endif + AdditionalConfig(&config); Booster* ref_booster = reinterpret_cast(handle); auto get_row_fun = RowPairFunctionFromDenseMatric(data, nrow, ncol, data_type, is_row_major); @@ -1836,16 +1742,7 @@ int LGBM_BoosterPredictForMatSingleRow(BoosterHandle handle, omp_set_num_threads(config.num_threads); } -//LGBM_CUDA -#ifdef USE_CUDA - if (config.device_type == std::string("cuda")){ - LightGBM::LGBM_config_::current_device=lgbm_device_cuda; - - config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ - if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} - if (config.bagging_freq == 0) {config.bagging_freq = 1;} - } -#endif + AdditionalConfig(&config); Booster* ref_booster = reinterpret_cast(handle); auto get_row_fun = RowPairFunctionFromDenseMatric(data, 1, ncol, data_type, is_row_major); @@ -1872,16 +1769,7 @@ int LGBM_BoosterPredictForMats(BoosterHandle handle, omp_set_num_threads(config.num_threads); } -//LGBM_CUDA -#ifdef USE_CUDA - if (config.device_type == std::string("cuda")){ - LightGBM::LGBM_config_::current_device=lgbm_device_cuda; - - config.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ - if (config.bagging_fraction == 1.0){config.bagging_fraction = 0.8;} - if (config.bagging_freq == 0) {config.bagging_freq = 1;} - } -#endif + AdditionalConfig(&config); Booster* ref_booster = reinterpret_cast(handle); auto get_row_fun = RowPairFunctionFromDenseRows(data, ncol, data_type); From 8af3738907029d4e78d78ba0f450fb2d280df7e1 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Tue, 9 Jun 2020 16:55:03 +0000 Subject: [PATCH 062/119] Initial CUDA work --- src/c_api.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/c_api.cpp b/src/c_api.cpp index 6a54b5f1788..ba6ef705573 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -668,6 +668,8 @@ void AdditionalConfig(Config *config) if (config->bagging_fraction == 1.0){config->bagging_fraction = 0.8;} if (config->bagging_freq == 0) {config->bagging_freq = 1;} } +#else + (void)(config); // UNUSED #endif } From 903e52b752c2f0ed54e9a61edb6c0a8871cd6f77 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Tue, 9 Jun 2020 18:30:42 +0000 Subject: [PATCH 063/119] Initial CUDA work --- src/c_api.cpp | 53 ++++++++++++++++++--------------------------------- 1 file changed, 19 insertions(+), 34 deletions(-) diff --git a/src/c_api.cpp b/src/c_api.cpp index ba6ef705573..9ccac3893e5 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -43,6 +43,22 @@ inline int LGBM_APIHandleException(const std::string& ex) { return -1; } +//LGBM_CUDA +inline void AdditionalConfig(Config *config) +{ +#ifdef USE_CUDA + if (config->device_type == std::string("cuda")){ + LightGBM::LGBM_config_::current_device=lgbm_device_cuda; + + config->is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ + if (config->bagging_fraction == 1.0){config->bagging_fraction = 0.8;} + if (config->bagging_freq == 0) {config->bagging_freq = 1;} + } +#else + (void)(config); // UNUSED +#endif +} + #define API_BEGIN() try { #define API_END() } \ catch(std::exception& ex) { return LGBM_APIHandleException(ex); } \ @@ -120,16 +136,10 @@ class Booster { if (train_data->num_data() < 2048){ config_.device_type = std::string("cpu"); } - - if (config_.device_type == std::string("cuda")){ - LightGBM::LGBM_config_::current_device=lgbm_device_cuda; - - config_.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ - if 
(config_.bagging_fraction == 1.0){config_.bagging_fraction = 0.8;} - if (config_.bagging_freq == 0) {config_.bagging_freq = 1;} - } #endif + AdditionalConfig(&config_); + // create boosting if (config_.input_model.size() > 0) { Log::Warning("Continued train from model is not supported for c_api,\n" @@ -323,16 +333,7 @@ class Booster { omp_set_num_threads(config_.num_threads); } -//LGBM_CUDA -#ifdef USE_CUDA - if (config_.device_type == std::string("cuda")){ - LightGBM::LGBM_config_::current_device=lgbm_device_cuda; - - config_.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ - if (config_.bagging_fraction == 1.0){config_.bagging_fraction = 0.8;} - if (config_.bagging_freq == 0) {config_.bagging_freq = 1;} - } -#endif + AdditionalConfig(&config_); if (param.count("objective")) { // create objective function @@ -657,22 +658,6 @@ int LGBM_GetDeviceType() { #endif } -//LGBM_CUDA -void AdditionalConfig(Config *config) -{ -#ifdef USE_CUDA - if (config->device_type == std::string("cuda")){ - LightGBM::LGBM_config_::current_device=lgbm_device_cuda; - - config->is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ - if (config->bagging_fraction == 1.0){config->bagging_fraction = 0.8;} - if (config->bagging_freq == 0) {config->bagging_freq = 1;} - } -#else - (void)(config); // UNUSED -#endif -} - int LGBM_DatasetCreateFromFile(const char* filename, const char* parameters, const DatasetHandle reference, From 1efcad08a3fae62fd611ca7d601745a81c965287 Mon Sep 17 00:00:00 2001 From: Guolin Ke Date: Thu, 21 May 2020 02:50:58 +0800 Subject: [PATCH 064/119] redirect log to python console (#3090) * redir log to python console * fix pylint * Apply suggestions from code review * Update basic.py * Apply suggestions from code review Co-authored-by: Nikita Titov * Update c_api.h * Apply suggestions from code review * Apply suggestions from code review * super-minor: better wording Co-authored-by: Nikita Titov Co-authored-by: StrikerRUS --- src/c_api.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/c_api.cpp b/src/c_api.cpp index 9ccac3893e5..54fabba1665 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -658,6 +658,12 @@ int LGBM_GetDeviceType() { #endif } +int LGBM_RegisterLogCallback(void (*callback)(const char*)) { + API_BEGIN(); + Log::ResetCallBack(callback); + API_END(); +} + int LGBM_DatasetCreateFromFile(const char* filename, const char* parameters, const DatasetHandle reference, From 13c6450a84c484de0177c089d77e93cd3c0ddf9a Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 1 Jun 2020 20:13:32 +0100 Subject: [PATCH 065/119] re-order includes (fixes #3132) (#3133) --- R-package/src/lightgbm_R.cpp | 14 +++++++------- include/LightGBM/application.h | 6 +++--- include/LightGBM/bin.h | 8 ++++---- include/LightGBM/boosting.h | 6 +++--- include/LightGBM/c_api.h | 4 ++-- include/LightGBM/config.h | 10 +++++----- include/LightGBM/dataset.h | 14 +++++++------- include/LightGBM/dataset_loader.h | 4 ++-- include/LightGBM/feature_group.h | 8 ++++---- include/LightGBM/metric.h | 6 +++--- include/LightGBM/network.h | 8 ++++---- include/LightGBM/objective_function.h | 6 +++--- include/LightGBM/prediction_early_stop.h | 4 ++-- include/LightGBM/tree.h | 6 +++--- include/LightGBM/tree_learner.h | 6 +++--- include/LightGBM/utils/array_args.h | 6 +++--- include/LightGBM/utils/common.h | 5 ++--- include/LightGBM/utils/openmp_wrapper.h | 8 ++++---- include/LightGBM/utils/pipeline_reader.h | 6 +++--- 
include/LightGBM/utils/text_reader.h | 8 ++++---- include/LightGBM/utils/threading.h | 8 ++++---- src/application/application.cpp | 17 +++++++++-------- src/application/predictor.hpp | 12 ++++++------ src/boosting/dart.hpp | 4 ++-- src/boosting/gbdt.cpp | 9 +++++---- src/boosting/gbdt.h | 16 ++++++++-------- src/boosting/gbdt_model_text.cpp | 9 +++++---- src/boosting/gbdt_prediction.cpp | 4 ++-- src/boosting/goss.hpp | 8 ++++---- src/boosting/prediction_early_stop.cpp | 7 ++++--- src/boosting/rf.hpp | 6 +++--- src/boosting/score_updater.hpp | 6 +++--- src/c_api.cpp | 17 +++++++++++++---- src/io/bin.cpp | 11 ++++++----- src/io/config.cpp | 5 +++-- src/io/dataset.cpp | 13 +++++++------ src/io/dataset_loader.cpp | 5 +++-- src/io/dense_bin.hpp | 4 ++-- src/io/file_io.cpp | 7 ++++--- src/io/json11.cpp | 7 ++++--- src/io/metadata.cpp | 5 +++-- src/io/multi_val_dense_bin.hpp | 6 +++--- src/io/multi_val_sparse_bin.hpp | 6 +++--- src/io/parser.cpp | 3 ++- src/io/parser.hpp | 8 ++++---- src/io/sparse_bin.hpp | 8 ++++---- src/io/tree.cpp | 9 +++++---- src/main.cpp | 3 ++- src/metric/binary_metric.hpp | 8 ++++---- src/metric/dcg_calculator.cpp | 5 +++-- src/metric/map_metric.hpp | 10 +++++----- src/metric/multiclass_metric.hpp | 6 +++--- src/metric/rank_metric.hpp | 8 ++++---- src/metric/regression_metric.hpp | 6 +++--- src/metric/xentropy_metric.hpp | 10 +++++----- src/network/linker_topo.cpp | 7 ++++--- src/network/linkers.h | 10 +++++----- src/network/linkers_socket.cpp | 8 ++++---- src/network/network.cpp | 7 ++++--- src/network/socket_wrapper.hpp | 4 ++-- src/objective/binary_objective.hpp | 6 +++--- src/objective/multiclass_objective.hpp | 6 +++--- src/objective/rank_objective.hpp | 6 +++--- src/objective/regression_objective.hpp | 8 ++++---- src/objective/xentropy_objective.hpp | 8 ++++---- src/treelearner/col_sampler.hpp | 3 +++ .../cost_effective_gradient_boosting.hpp | 4 ++-- src/treelearner/data_partition.hpp | 8 ++++---- src/treelearner/feature_histogram.hpp | 8 ++++---- src/treelearner/gpu_tree_learner.cpp | 4 ++-- src/treelearner/gpu_tree_learner.h | 12 ++++++------ src/treelearner/leaf_splits.hpp | 4 ++-- src/treelearner/parallel_tree_learner.h | 6 +++--- src/treelearner/serial_tree_learner.cpp | 10 +++++----- src/treelearner/serial_tree_learner.h | 14 +++++++------- src/treelearner/split_info.hpp | 4 ++-- .../voting_parallel_tree_learner.cpp | 3 ++- 77 files changed, 299 insertions(+), 270 deletions(-) diff --git a/R-package/src/lightgbm_R.cpp b/R-package/src/lightgbm_R.cpp index f3165e1fa1a..14609272fa3 100644 --- a/R-package/src/lightgbm_R.cpp +++ b/R-package/src/lightgbm_R.cpp @@ -5,13 +5,6 @@ #include "lightgbm_R.h" -#include -#include -#include -#include - -#include - #include #include #include @@ -19,6 +12,13 @@ #include #include +#include +#include +#include +#include + +#include + #define COL_MAJOR (0) #define R_API_BEGIN() \ diff --git a/include/LightGBM/application.h b/include/LightGBM/application.h index 911dedd7d94..53f9732edea 100644 --- a/include/LightGBM/application.h +++ b/include/LightGBM/application.h @@ -5,12 +5,12 @@ #ifndef LIGHTGBM_APPLICATION_H_ #define LIGHTGBM_APPLICATION_H_ -#include -#include - #include #include +#include +#include + namespace LightGBM { class DatasetLoader; diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index 96ae6a8d641..fab69d9ba89 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -5,10 +5,6 @@ #ifndef LIGHTGBM_BIN_H_ #define LIGHTGBM_BIN_H_ -#include -#include -#include - #include #include #include @@ -16,6 
+12,10 @@ #include #include +#include +#include +#include + namespace LightGBM { enum BinType { diff --git a/include/LightGBM/boosting.h b/include/LightGBM/boosting.h index 31bb430f0ae..f456d798977 100644 --- a/include/LightGBM/boosting.h +++ b/include/LightGBM/boosting.h @@ -5,14 +5,14 @@ #ifndef LIGHTGBM_BOOSTING_H_ #define LIGHTGBM_BOOSTING_H_ -#include -#include - #include #include #include #include +#include +#include + namespace LightGBM { /*! \brief forward declaration */ diff --git a/include/LightGBM/c_api.h b/include/LightGBM/c_api.h index 3fbccdac075..553982eefed 100644 --- a/include/LightGBM/c_api.h +++ b/include/LightGBM/c_api.h @@ -13,12 +13,12 @@ #ifndef LIGHTGBM_C_API_H_ #define LIGHTGBM_C_API_H_ -#include - #include #include #include +#include + typedef void* DatasetHandle; /*!< \brief Handle of dataset. */ typedef void* BoosterHandle; /*!< \brief Handle of booster. */ diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 162c7583dc7..bbb62727623 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -11,11 +11,6 @@ #ifndef LIGHTGBM_CONFIG_H_ #define LIGHTGBM_CONFIG_H_ -#include -#include -#include -#include - #include #include #include @@ -23,6 +18,11 @@ #include #include +#include +#include +#include +#include + namespace LightGBM { /*! \brief Types of tasks */ diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index e4c5dc56511..bd0143daffd 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -5,13 +5,6 @@ #ifndef LIGHTGBM_DATASET_H_ #define LIGHTGBM_DATASET_H_ -#include -#include -#include -#include -#include -#include - #include #include #include @@ -20,6 +13,13 @@ #include #include +#include +#include +#include +#include +#include +#include + namespace LightGBM { /*! \brief forward declaration */ diff --git a/include/LightGBM/dataset_loader.h b/include/LightGBM/dataset_loader.h index 8d5b20b481f..88443d62472 100644 --- a/include/LightGBM/dataset_loader.h +++ b/include/LightGBM/dataset_loader.h @@ -5,12 +5,12 @@ #ifndef LIGHTGBM_DATASET_LOADER_H_ #define LIGHTGBM_DATASET_LOADER_H_ -#include - #include #include #include +#include + namespace LightGBM { class DatasetLoader { diff --git a/include/LightGBM/feature_group.h b/include/LightGBM/feature_group.h index d949beec20e..c21ad33b6a4 100644 --- a/include/LightGBM/feature_group.h +++ b/include/LightGBM/feature_group.h @@ -5,14 +5,14 @@ #ifndef LIGHTGBM_FEATURE_GROUP_H_ #define LIGHTGBM_FEATURE_GROUP_H_ -#include -#include -#include - #include #include #include +#include +#include +#include + namespace LightGBM { class Dataset; diff --git a/include/LightGBM/metric.h b/include/LightGBM/metric.h index 61d9fc99ea8..56fec3aad77 100644 --- a/include/LightGBM/metric.h +++ b/include/LightGBM/metric.h @@ -5,6 +5,9 @@ #ifndef LIGHTGBM_METRIC_H_ #define LIGHTGBM_METRIC_H_ +#include +#include + #include #include #include @@ -12,9 +15,6 @@ #include #include -#include -#include - namespace LightGBM { /*! diff --git a/include/LightGBM/network.h b/include/LightGBM/network.h index 32c24fe6984..40373508eb5 100644 --- a/include/LightGBM/network.h +++ b/include/LightGBM/network.h @@ -5,14 +5,14 @@ #ifndef LIGHTGBM_NETWORK_H_ #define LIGHTGBM_NETWORK_H_ -#include -#include -#include - #include #include #include +#include +#include +#include + namespace LightGBM { /*! 
\brief forward declaration */ diff --git a/include/LightGBM/objective_function.h b/include/LightGBM/objective_function.h index 5ea838dece2..76b3f7145ef 100644 --- a/include/LightGBM/objective_function.h +++ b/include/LightGBM/objective_function.h @@ -5,13 +5,13 @@ #ifndef LIGHTGBM_OBJECTIVE_FUNCTION_H_ #define LIGHTGBM_OBJECTIVE_FUNCTION_H_ +#include +#include + #include #include #include -#include -#include - namespace LightGBM { /*! * \brief The interface of Objective Function. diff --git a/include/LightGBM/prediction_early_stop.h b/include/LightGBM/prediction_early_stop.h index 1d3e510981f..40db533325f 100644 --- a/include/LightGBM/prediction_early_stop.h +++ b/include/LightGBM/prediction_early_stop.h @@ -5,11 +5,11 @@ #ifndef LIGHTGBM_PREDICTION_EARLY_STOP_H_ #define LIGHTGBM_PREDICTION_EARLY_STOP_H_ -#include - #include #include +#include + namespace LightGBM { struct PredictionEarlyStopInstance { diff --git a/include/LightGBM/tree.h b/include/LightGBM/tree.h index 5ce3ff9b3eb..f370bc74213 100644 --- a/include/LightGBM/tree.h +++ b/include/LightGBM/tree.h @@ -5,15 +5,15 @@ #ifndef LIGHTGBM_TREE_H_ #define LIGHTGBM_TREE_H_ -#include -#include - #include #include #include #include #include +#include +#include + namespace LightGBM { #define kCategoricalMask (1) diff --git a/include/LightGBM/tree_learner.h b/include/LightGBM/tree_learner.h index 6c549a5ed71..cdb3d2823b8 100644 --- a/include/LightGBM/tree_learner.h +++ b/include/LightGBM/tree_learner.h @@ -5,13 +5,13 @@ #ifndef LIGHTGBM_TREE_LEARNER_H_ #define LIGHTGBM_TREE_LEARNER_H_ +#include +#include + #include #include #include -#include -#include - namespace LightGBM { using json11::Json; diff --git a/include/LightGBM/utils/array_args.h b/include/LightGBM/utils/array_args.h index 0183ecc22dd..a071247fb28 100644 --- a/include/LightGBM/utils/array_args.h +++ b/include/LightGBM/utils/array_args.h @@ -5,13 +5,13 @@ #ifndef LIGHTGBM_UTILS_ARRAY_AGRS_H_ #define LIGHTGBM_UTILS_ARRAY_AGRS_H_ -#include -#include - #include #include #include +#include +#include + namespace LightGBM { /*! diff --git a/include/LightGBM/utils/common.h b/include/LightGBM/utils/common.h index bdc769e5222..663ea1730d3 100644 --- a/include/LightGBM/utils/common.h +++ b/include/LightGBM/utils/common.h @@ -5,9 +5,6 @@ #ifndef LIGHTGBM_UTILS_COMMON_FUN_H_ #define LIGHTGBM_UTILS_COMMON_FUN_H_ -#include -#include - #include #include #include @@ -30,6 +27,8 @@ #include #pragma intrinsic(_BitScanReverse) #endif +#include +#include #if defined(_MSC_VER) #include diff --git a/include/LightGBM/utils/openmp_wrapper.h b/include/LightGBM/utils/openmp_wrapper.h index 075c991371c..fdd4b3850fb 100644 --- a/include/LightGBM/utils/openmp_wrapper.h +++ b/include/LightGBM/utils/openmp_wrapper.h @@ -6,16 +6,16 @@ #define LIGHTGBM_OPENMP_WRAPPER_H_ #ifdef _OPENMP -#include - -#include - #include #include #include #include #include +#include + +#include + inline int OMP_NUM_THREADS() { int ret = 1; #pragma omp parallel diff --git a/include/LightGBM/utils/pipeline_reader.h b/include/LightGBM/utils/pipeline_reader.h index f02500c9751..4e07b8b3674 100644 --- a/include/LightGBM/utils/pipeline_reader.h +++ b/include/LightGBM/utils/pipeline_reader.h @@ -5,9 +5,6 @@ #ifndef LIGHTGBM_UTILS_PIPELINE_READER_H_ #define LIGHTGBM_UTILS_PIPELINE_READER_H_ -#include -#include - #include #include #include @@ -16,6 +13,9 @@ #include #include +#include +#include + namespace LightGBM { /*! 
diff --git a/include/LightGBM/utils/text_reader.h b/include/LightGBM/utils/text_reader.h index 638bb268362..7aaf7f8153a 100644 --- a/include/LightGBM/utils/text_reader.h +++ b/include/LightGBM/utils/text_reader.h @@ -5,16 +5,16 @@ #ifndef LIGHTGBM_UTILS_TEXT_READER_H_ #define LIGHTGBM_UTILS_TEXT_READER_H_ -#include -#include -#include - #include #include #include #include #include +#include +#include +#include + namespace LightGBM { const size_t kGbs = size_t(1024) * 1024 * 1024; diff --git a/include/LightGBM/utils/threading.h b/include/LightGBM/utils/threading.h index d293fc811eb..dcf4f7608af 100644 --- a/include/LightGBM/utils/threading.h +++ b/include/LightGBM/utils/threading.h @@ -6,14 +6,14 @@ #ifndef LIGHTGBM_UTILS_THREADING_H_ #define LIGHTGBM_UTILS_THREADING_H_ -#include -#include -#include - #include #include #include +#include +#include +#include + namespace LightGBM { class Threading { diff --git a/src/application/application.cpp b/src/application/application.cpp index 1b9eabf8a12..a46cf419c53 100644 --- a/src/application/application.cpp +++ b/src/application/application.cpp @@ -2,6 +2,15 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ + +#include +#include +#include +#include +#include +#include +#include + #include #include @@ -15,14 +24,6 @@ #include #include -#include -#include -#include -#include -#include -#include -#include - #include "predictor.hpp" #ifdef USE_CUDA diff --git a/src/application/predictor.hpp b/src/application/predictor.hpp index 1c56cfa5eb2..ab775d42913 100644 --- a/src/application/predictor.hpp +++ b/src/application/predictor.hpp @@ -5,12 +5,6 @@ #ifndef LIGHTGBM_PREDICTOR_HPP_ #define LIGHTGBM_PREDICTOR_HPP_ -#include -#include -#include -#include -#include - #include #include #include @@ -21,6 +15,12 @@ #include #include +#include +#include +#include +#include +#include + namespace LightGBM { /*! diff --git a/src/boosting/dart.hpp b/src/boosting/dart.hpp index e2481e79772..b9dca6a78f2 100644 --- a/src/boosting/dart.hpp +++ b/src/boosting/dart.hpp @@ -5,14 +5,14 @@ #ifndef LIGHTGBM_BOOSTING_DART_H_ #define LIGHTGBM_BOOSTING_DART_H_ -#include - #include #include #include #include #include +#include + #include "gbdt.h" #include "score_updater.hpp" diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 24264c3c175..6199d726df9 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -2,8 +2,13 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. 
*/ + #include "gbdt.h" +#include +#include +#include + #include #include #include @@ -11,10 +16,6 @@ #include #include -#include -#include -#include - namespace LightGBM { #ifdef USE_CUDA diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index d460894d44b..420d5479e04 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -5,12 +5,6 @@ #ifndef LIGHTGBM_BOOSTING_GBDT_H_ #define LIGHTGBM_BOOSTING_GBDT_H_ -#include -#include -#include -#include -#include - #include #include #include @@ -22,12 +16,18 @@ #include #include +#include +#include +#include +#include +#include + +#include "score_updater.hpp" + #ifdef USE_CUDA #include //LGBM_CUDA #endif -#include "score_updater.hpp" - namespace LightGBM { using json11::Json; diff --git a/src/boosting/gbdt_model_text.cpp b/src/boosting/gbdt_model_text.cpp index 5ce26bca95c..9ac4b269ac1 100644 --- a/src/boosting/gbdt_model_text.cpp +++ b/src/boosting/gbdt_model_text.cpp @@ -2,16 +2,17 @@ * Copyright (c) 2017 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ + +#include +#include +#include + #include #include #include #include #include -#include -#include -#include - #include "gbdt.h" namespace LightGBM { diff --git a/src/boosting/gbdt_prediction.cpp b/src/boosting/gbdt_prediction.cpp index b4711f7c01a..e906bc0aaca 100644 --- a/src/boosting/gbdt_prediction.cpp +++ b/src/boosting/gbdt_prediction.cpp @@ -2,12 +2,12 @@ * Copyright (c) 2017 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ +#include "gbdt.h" + #include #include #include -#include "gbdt.h" - namespace LightGBM { void GBDT::PredictRaw(const double* features, double* output, const PredictionEarlyStopInstance* early_stop) const { diff --git a/src/boosting/goss.hpp b/src/boosting/goss.hpp index 2af6dee14f6..d3a3c6a344c 100644 --- a/src/boosting/goss.hpp +++ b/src/boosting/goss.hpp @@ -5,10 +5,6 @@ #ifndef LIGHTGBM_BOOSTING_GOSS_H_ #define LIGHTGBM_BOOSTING_GOSS_H_ -#include -#include -#include - #include #include #include @@ -16,6 +12,10 @@ #include #include +#include +#include +#include + #include "gbdt.h" #include "score_updater.hpp" diff --git a/src/boosting/prediction_early_stop.cpp b/src/boosting/prediction_early_stop.cpp index 7e21141f685..7eda08f00d6 100644 --- a/src/boosting/prediction_early_stop.cpp +++ b/src/boosting/prediction_early_stop.cpp @@ -2,15 +2,16 @@ * Copyright (c) 2017 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. 
*/ -#include - -#include #include #include #include #include +#include + +#include + namespace LightGBM { PredictionEarlyStopInstance CreateNone(const PredictionEarlyStopConfig&) { diff --git a/src/boosting/rf.hpp b/src/boosting/rf.hpp index e64bf6cb4d8..8bbc1a3ebee 100644 --- a/src/boosting/rf.hpp +++ b/src/boosting/rf.hpp @@ -5,9 +5,6 @@ #ifndef LIGHTGBM_BOOSTING_RF_H_ #define LIGHTGBM_BOOSTING_RF_H_ -#include -#include - #include #include #include @@ -15,6 +12,9 @@ #include #include +#include +#include + #include "gbdt.h" #include "score_updater.hpp" diff --git a/src/boosting/score_updater.hpp b/src/boosting/score_updater.hpp index 7446691a470..231de245068 100644 --- a/src/boosting/score_updater.hpp +++ b/src/boosting/score_updater.hpp @@ -5,15 +5,15 @@ #ifndef LIGHTGBM_BOOSTING_SCORE_UPDATER_HPP_ #define LIGHTGBM_BOOSTING_SCORE_UPDATER_HPP_ +#include +#include + #include #include #include #include #include -#include -#include - namespace LightGBM { /*! * \brief Used to store and update score for data diff --git a/src/c_api.cpp b/src/c_api.cpp index 54fabba1665..979ab104b74 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -2,6 +2,15 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ + +#include +#include +#include +#include +#include +#include +#include + #include #include @@ -18,10 +27,6 @@ #include #include -#ifdef USE_CUDA -#include -#endif - #include #include #include @@ -32,6 +37,10 @@ #include "application/predictor.hpp" +#ifdef USE_CUDA +#include +#endif + namespace LightGBM { inline int LGBM_APIHandleException(const std::exception& ex) { diff --git a/src/io/bin.cpp b/src/io/bin.cpp index 367edaa3f7b..9ead232fda0 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -2,17 +2,18 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ -#include - -#include -#include -#include #include #include #include #include +#include + +#include +#include +#include + #include "dense_bin.hpp" #include "multi_val_dense_bin.hpp" #include "multi_val_sparse_bin.hpp" diff --git a/src/io/config.cpp b/src/io/config.cpp index 18c0562a676..4e4d8dbc794 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -2,14 +2,15 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ + +#include + #include #include #include #include -#include - namespace LightGBM { void Config::KV2Map(std::unordered_map* params, const char* kv) { diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index edae575f345..1001a9432ce 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -3,12 +3,6 @@ * Licensed under the MIT License. See LICENSE file in the project root for * license information. */ -#include - -#include -#include -#include -#include #ifdef USE_CUDA #include @@ -20,6 +14,13 @@ #include #include +#include + +#include +#include +#include +#include + namespace LightGBM { const char* Dataset::binary_file_token = diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index c0b2edf1a8c..33ce1df7eb0 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -2,6 +2,9 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. 
*/ + +#include + #include #include @@ -10,8 +13,6 @@ #include #include -#include - namespace LightGBM { using json11::Json; diff --git a/src/io/dense_bin.hpp b/src/io/dense_bin.hpp index 99feadf9f7f..10d988b68e0 100644 --- a/src/io/dense_bin.hpp +++ b/src/io/dense_bin.hpp @@ -6,8 +6,6 @@ #ifndef LIGHTGBM_IO_DENSE_BIN_HPP_ #define LIGHTGBM_IO_DENSE_BIN_HPP_ -#include - #include #include #include @@ -18,6 +16,8 @@ #include // LGBM_CUDA +#include + namespace LightGBM { template diff --git a/src/io/file_io.cpp b/src/io/file_io.cpp index a205964287e..67a719de0a8 100644 --- a/src/io/file_io.cpp +++ b/src/io/file_io.cpp @@ -3,14 +3,15 @@ * Licensed under the MIT License. See LICENSE file in the project root for * license information. */ -#include - -#include #include #include #include +#include + +#include + #ifdef USE_HDFS #include #endif diff --git a/src/io/json11.cpp b/src/io/json11.cpp index db21c6aab54..a3fec7724b5 100644 --- a/src/io/json11.cpp +++ b/src/io/json11.cpp @@ -18,15 +18,16 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ -#include - -#include #include #include #include #include +#include + +#include + namespace json11 { static const int max_depth = 200; diff --git a/src/io/metadata.cpp b/src/io/metadata.cpp index ea0d5b08def..9b540045650 100644 --- a/src/io/metadata.cpp +++ b/src/io/metadata.cpp @@ -2,12 +2,13 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ -#include -#include #include #include +#include +#include + namespace LightGBM { Metadata::Metadata() { diff --git a/src/io/multi_val_dense_bin.hpp b/src/io/multi_val_dense_bin.hpp index 7645530d774..d7c6599c381 100644 --- a/src/io/multi_val_dense_bin.hpp +++ b/src/io/multi_val_dense_bin.hpp @@ -5,14 +5,14 @@ #ifndef LIGHTGBM_IO_MULTI_VAL_DENSE_BIN_HPP_ #define LIGHTGBM_IO_MULTI_VAL_DENSE_BIN_HPP_ -#include -#include - #include #include #include #include +#include +#include + namespace LightGBM { template diff --git a/src/io/multi_val_sparse_bin.hpp b/src/io/multi_val_sparse_bin.hpp index ec3f64a11a0..09c13420c66 100644 --- a/src/io/multi_val_sparse_bin.hpp +++ b/src/io/multi_val_sparse_bin.hpp @@ -5,14 +5,14 @@ #ifndef LIGHTGBM_IO_MULTI_VAL_SPARSE_BIN_HPP_ #define LIGHTGBM_IO_MULTI_VAL_SPARSE_BIN_HPP_ -#include -#include - #include #include #include #include +#include +#include + namespace LightGBM { template diff --git a/src/io/parser.cpp b/src/io/parser.cpp index df14ea87a99..c30da4305f9 100644 --- a/src/io/parser.cpp +++ b/src/io/parser.cpp @@ -2,7 +2,6 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. 
*/ -#include "parser.hpp" #include #include @@ -11,6 +10,8 @@ #include #include +#include "parser.hpp" + namespace LightGBM { void GetStatistic(const char* str, int* comma_cnt, int* tab_cnt, int* colon_cnt) { diff --git a/src/io/parser.hpp b/src/io/parser.hpp index 1cfde0635a5..43764a68e75 100644 --- a/src/io/parser.hpp +++ b/src/io/parser.hpp @@ -5,14 +5,14 @@ #ifndef LIGHTGBM_IO_PARSER_HPP_ #define LIGHTGBM_IO_PARSER_HPP_ -#include -#include -#include - #include #include #include +#include +#include +#include + namespace LightGBM { class CSVParser: public Parser { diff --git a/src/io/sparse_bin.hpp b/src/io/sparse_bin.hpp index c56cd6da99d..730ea161a2a 100644 --- a/src/io/sparse_bin.hpp +++ b/src/io/sparse_bin.hpp @@ -6,10 +6,6 @@ #ifndef LIGHTGBM_IO_SPARSE_BIN_HPP_ #define LIGHTGBM_IO_SPARSE_BIN_HPP_ -#include -#include -#include - #include #include #include @@ -17,6 +13,10 @@ #include #include +#include +#include +#include + namespace LightGBM { template diff --git a/src/io/tree.cpp b/src/io/tree.cpp index 63641311787..759d334ef98 100644 --- a/src/io/tree.cpp +++ b/src/io/tree.cpp @@ -2,16 +2,17 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ + +#include +#include +#include + #include #include #include #include -#include -#include -#include - namespace LightGBM { Tree::Tree(int max_leaves, bool track_branch_features) diff --git a/src/main.cpp b/src/main.cpp index ef277ac0c1f..0a8931ae4ff 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -2,10 +2,11 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ -#include #include +#include + #include "network/linkers.h" int main(int argc, char** argv) { diff --git a/src/metric/binary_metric.hpp b/src/metric/binary_metric.hpp index 00a51d6cd24..5bde453cdba 100644 --- a/src/metric/binary_metric.hpp +++ b/src/metric/binary_metric.hpp @@ -5,15 +5,15 @@ #ifndef LIGHTGBM_METRIC_BINARY_METRIC_HPP_ #define LIGHTGBM_METRIC_BINARY_METRIC_HPP_ -#include -#include -#include - #include #include #include #include +#include +#include +#include + namespace LightGBM { /*! diff --git a/src/metric/dcg_calculator.cpp b/src/metric/dcg_calculator.cpp index 58843d89f9e..cd477612bdc 100644 --- a/src/metric/dcg_calculator.cpp +++ b/src/metric/dcg_calculator.cpp @@ -2,13 +2,14 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ -#include -#include #include #include #include +#include +#include + namespace LightGBM { /*! 
\brief Declaration for some static members */ diff --git a/src/metric/map_metric.hpp b/src/metric/map_metric.hpp index 18539ee44ee..b373a02b56f 100644 --- a/src/metric/map_metric.hpp +++ b/src/metric/map_metric.hpp @@ -5,16 +5,16 @@ #ifndef LIGHTGBM_METRIC_MAP_METRIC_HPP_ #define LIGHTGBM_METRIC_MAP_METRIC_HPP_ -#include -#include -#include -#include - #include #include #include #include +#include +#include +#include +#include + namespace LightGBM { class MapMetric:public Metric { diff --git a/src/metric/multiclass_metric.hpp b/src/metric/multiclass_metric.hpp index 59548cd3a79..f60588cac3f 100644 --- a/src/metric/multiclass_metric.hpp +++ b/src/metric/multiclass_metric.hpp @@ -5,14 +5,14 @@ #ifndef LIGHTGBM_METRIC_MULTICLASS_METRIC_HPP_ #define LIGHTGBM_METRIC_MULTICLASS_METRIC_HPP_ -#include -#include - #include #include #include #include +#include +#include + namespace LightGBM { /*! * \brief Metric for multiclass task. diff --git a/src/metric/rank_metric.hpp b/src/metric/rank_metric.hpp index 3b3afb547eb..d9227502009 100644 --- a/src/metric/rank_metric.hpp +++ b/src/metric/rank_metric.hpp @@ -5,15 +5,15 @@ #ifndef LIGHTGBM_METRIC_RANK_METRIC_HPP_ #define LIGHTGBM_METRIC_RANK_METRIC_HPP_ +#include +#include +#include + #include #include #include #include -#include -#include -#include - namespace LightGBM { class NDCGMetric:public Metric { diff --git a/src/metric/regression_metric.hpp b/src/metric/regression_metric.hpp index 4d1a3662142..3085bc941b3 100644 --- a/src/metric/regression_metric.hpp +++ b/src/metric/regression_metric.hpp @@ -5,14 +5,14 @@ #ifndef LIGHTGBM_METRIC_REGRESSION_METRIC_HPP_ #define LIGHTGBM_METRIC_REGRESSION_METRIC_HPP_ -#include -#include - #include #include #include #include +#include +#include + namespace LightGBM { /*! * \brief Metric for regression task. diff --git a/src/metric/xentropy_metric.hpp b/src/metric/xentropy_metric.hpp index bec611d28e5..1b86e60e640 100644 --- a/src/metric/xentropy_metric.hpp +++ b/src/metric/xentropy_metric.hpp @@ -5,16 +5,16 @@ #ifndef LIGHTGBM_METRIC_XENTROPY_METRIC_HPP_ #define LIGHTGBM_METRIC_XENTROPY_METRIC_HPP_ -#include -#include -#include -#include - #include #include #include #include +#include +#include +#include +#include + /* * Implements three related metrics: * diff --git a/src/network/linker_topo.cpp b/src/network/linker_topo.cpp index 1d7b2990f0e..102fdc993cd 100644 --- a/src/network/linker_topo.cpp +++ b/src/network/linker_topo.cpp @@ -2,14 +2,15 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. 
*/ -#include -#include -#include #include #include #include +#include +#include +#include + namespace LightGBM { diff --git a/src/network/linkers.h b/src/network/linkers.h index 8ddbb902a15..5a91d40f73d 100644 --- a/src/network/linkers.h +++ b/src/network/linkers.h @@ -5,11 +5,6 @@ #ifndef LIGHTGBM_NETWORK_LINKERS_H_ #define LIGHTGBM_NETWORK_LINKERS_H_ -#include -#include -#include -#include - #include #include #include @@ -18,6 +13,11 @@ #include #include +#include +#include +#include +#include + #ifdef USE_SOCKET #include "socket_wrapper.hpp" #endif diff --git a/src/network/linkers_socket.cpp b/src/network/linkers_socket.cpp index 40c6de7aab6..708e3ae9bad 100644 --- a/src/network/linkers_socket.cpp +++ b/src/network/linkers_socket.cpp @@ -4,10 +4,6 @@ */ #ifdef USE_SOCKET -#include -#include -#include - #include #include #include @@ -16,6 +12,10 @@ #include #include +#include +#include +#include + #include "linkers.h" namespace LightGBM { diff --git a/src/network/network.cpp b/src/network/network.cpp index 3976d72a692..cbc241d88c6 100644 --- a/src/network/network.cpp +++ b/src/network/network.cpp @@ -2,13 +2,14 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ -#include - -#include #include #include +#include + +#include + #include "linkers.h" namespace LightGBM { diff --git a/src/network/socket_wrapper.hpp b/src/network/socket_wrapper.hpp index 70f9586b99c..87bd88f934a 100644 --- a/src/network/socket_wrapper.hpp +++ b/src/network/socket_wrapper.hpp @@ -6,13 +6,13 @@ #define LIGHTGBM_NETWORK_SOCKET_WRAPPER_HPP_ #ifdef USE_SOCKET -#include - #include #include #include #include +#include + #if defined(_WIN32) #ifdef _MSC_VER diff --git a/src/objective/binary_objective.hpp b/src/objective/binary_objective.hpp index 4861bd1b83f..dff3a9c1097 100644 --- a/src/objective/binary_objective.hpp +++ b/src/objective/binary_objective.hpp @@ -5,15 +5,15 @@ #ifndef LIGHTGBM_OBJECTIVE_BINARY_OBJECTIVE_HPP_ #define LIGHTGBM_OBJECTIVE_BINARY_OBJECTIVE_HPP_ -#include -#include - #include #include #include #include #include +#include +#include + namespace LightGBM { /*! * \brief Objective function for binary classification diff --git a/src/objective/multiclass_objective.hpp b/src/objective/multiclass_objective.hpp index c133e1f75fd..da604baeb49 100644 --- a/src/objective/multiclass_objective.hpp +++ b/src/objective/multiclass_objective.hpp @@ -5,9 +5,6 @@ #ifndef LIGHTGBM_OBJECTIVE_MULTICLASS_OBJECTIVE_HPP_ #define LIGHTGBM_OBJECTIVE_MULTICLASS_OBJECTIVE_HPP_ -#include -#include - #include #include #include @@ -15,6 +12,9 @@ #include #include +#include +#include + #include "binary_objective.hpp" namespace LightGBM { diff --git a/src/objective/rank_objective.hpp b/src/objective/rank_objective.hpp index 1b20721e238..730d358f030 100644 --- a/src/objective/rank_objective.hpp +++ b/src/objective/rank_objective.hpp @@ -6,9 +6,6 @@ #ifndef LIGHTGBM_OBJECTIVE_RANK_OBJECTIVE_HPP_ #define LIGHTGBM_OBJECTIVE_RANK_OBJECTIVE_HPP_ -#include -#include - #include #include #include @@ -17,6 +14,9 @@ #include #include +#include +#include + namespace LightGBM { /*! 
diff --git a/src/objective/regression_objective.hpp b/src/objective/regression_objective.hpp index 53fa7020be5..fe391f2c15f 100644 --- a/src/objective/regression_objective.hpp +++ b/src/objective/regression_objective.hpp @@ -5,14 +5,14 @@ #ifndef LIGHTGBM_OBJECTIVE_REGRESSION_OBJECTIVE_HPP_ #define LIGHTGBM_OBJECTIVE_REGRESSION_OBJECTIVE_HPP_ -#include -#include -#include - #include #include #include +#include +#include +#include + namespace LightGBM { #define PercentileFun(T, data_reader, cnt_data, alpha) \ diff --git a/src/objective/xentropy_objective.hpp b/src/objective/xentropy_objective.hpp index cad2b8faafd..3a145bd2cbb 100644 --- a/src/objective/xentropy_objective.hpp +++ b/src/objective/xentropy_objective.hpp @@ -5,16 +5,16 @@ #ifndef LIGHTGBM_OBJECTIVE_XENTROPY_OBJECTIVE_HPP_ #define LIGHTGBM_OBJECTIVE_XENTROPY_OBJECTIVE_HPP_ -#include -#include -#include - #include #include #include #include #include +#include +#include +#include + /* * Implements gradients and hessians for the following point losses. * Target y is anything in interval [0, 1]. diff --git a/src/treelearner/col_sampler.hpp b/src/treelearner/col_sampler.hpp index cd288481255..4876827eb39 100644 --- a/src/treelearner/col_sampler.hpp +++ b/src/treelearner/col_sampler.hpp @@ -6,6 +6,9 @@ #ifndef LIGHTGBM_TREELEARNER_COL_SAMPLER_HPP_ #define LIGHTGBM_TREELEARNER_COL_SAMPLER_HPP_ +#include +#include + #include #include #include diff --git a/src/treelearner/cost_effective_gradient_boosting.hpp b/src/treelearner/cost_effective_gradient_boosting.hpp index fad966e6487..fda2bffc820 100644 --- a/src/treelearner/cost_effective_gradient_boosting.hpp +++ b/src/treelearner/cost_effective_gradient_boosting.hpp @@ -5,13 +5,13 @@ #ifndef LIGHTGBM_TREELEARNER_COST_EFFECTIVE_GRADIENT_BOOSTING_HPP_ #define LIGHTGBM_TREELEARNER_COST_EFFECTIVE_GRADIENT_BOOSTING_HPP_ +#include + #include #include #include #include -#include - #include "data_partition.hpp" #include "serial_tree_learner.h" #include "split_info.hpp" diff --git a/src/treelearner/data_partition.hpp b/src/treelearner/data_partition.hpp index 01c5d2606e7..bdae960c1d9 100644 --- a/src/treelearner/data_partition.hpp +++ b/src/treelearner/data_partition.hpp @@ -5,15 +5,15 @@ #ifndef LIGHTGBM_TREELEARNER_DATA_PARTITION_HPP_ #define LIGHTGBM_TREELEARNER_DATA_PARTITION_HPP_ +#include +#include +#include + #include #include #include #include -#include -#include -#include - namespace LightGBM { /*! * \brief DataPartition is used to store the the partition of data on tree. 
diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp index 8916ee48fd4..4dd5f29af0a 100644 --- a/src/treelearner/feature_histogram.hpp +++ b/src/treelearner/feature_histogram.hpp @@ -6,10 +6,6 @@ #ifndef LIGHTGBM_TREELEARNER_FEATURE_HISTOGRAM_HPP_ #define LIGHTGBM_TREELEARNER_FEATURE_HISTOGRAM_HPP_ -#include -#include -#include - #include #include #include @@ -17,6 +13,10 @@ #include #include +#include +#include +#include + #include "monotone_constraints.hpp" #include "split_info.hpp" diff --git a/src/treelearner/gpu_tree_learner.cpp b/src/treelearner/gpu_tree_learner.cpp index fad02e1c044..42967e138c7 100644 --- a/src/treelearner/gpu_tree_learner.cpp +++ b/src/treelearner/gpu_tree_learner.cpp @@ -6,12 +6,12 @@ #include "gpu_tree_learner.h" +#include + #include #include #include -#include - #include "../io/dense_bin.hpp" #define GPU_DEBUG 0 diff --git a/src/treelearner/gpu_tree_learner.h b/src/treelearner/gpu_tree_learner.h index 598e8d40ac9..c0607cbe4a2 100644 --- a/src/treelearner/gpu_tree_learner.h +++ b/src/treelearner/gpu_tree_learner.h @@ -5,12 +5,6 @@ #ifndef LIGHTGBM_TREELEARNER_GPU_TREE_LEARNER_H_ #define LIGHTGBM_TREELEARNER_GPU_TREE_LEARNER_H_ -#include -#include -#include -#include -#include - #include #include #include @@ -18,6 +12,12 @@ #include #include +#include +#include +#include +#include +#include + #include "data_partition.hpp" #include "feature_histogram.hpp" #include "leaf_splits.hpp" diff --git a/src/treelearner/leaf_splits.hpp b/src/treelearner/leaf_splits.hpp index b0a753eafe1..5c94846eb7b 100644 --- a/src/treelearner/leaf_splits.hpp +++ b/src/treelearner/leaf_splits.hpp @@ -5,11 +5,11 @@ #ifndef LIGHTGBM_TREELEARNER_LEAF_SPLITS_HPP_ #define LIGHTGBM_TREELEARNER_LEAF_SPLITS_HPP_ -#include - #include #include +#include + #include "data_partition.hpp" namespace LightGBM { diff --git a/src/treelearner/parallel_tree_learner.h b/src/treelearner/parallel_tree_learner.h index 2fdf542d421..5fbad7352e0 100644 --- a/src/treelearner/parallel_tree_learner.h +++ b/src/treelearner/parallel_tree_learner.h @@ -5,13 +5,13 @@ #ifndef LIGHTGBM_TREELEARNER_PARALLEL_TREE_LEARNER_H_ #define LIGHTGBM_TREELEARNER_PARALLEL_TREE_LEARNER_H_ -#include -#include - #include #include #include +#include +#include + #include "gpu_tree_learner.h" #include "serial_tree_learner.h" #include "cuda_tree_learner.h" diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index e5b6626a6bd..9aac1bdf1fd 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -4,16 +4,16 @@ */ #include "serial_tree_learner.h" -#include -#include -#include -#include - #include #include #include #include +#include +#include +#include +#include + #include "cost_effective_gradient_boosting.hpp" namespace LightGBM { diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 367c262192c..6fc0c2cb968 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -5,13 +5,6 @@ #ifndef LIGHTGBM_TREELEARNER_SERIAL_TREE_LEARNER_H_ #define LIGHTGBM_TREELEARNER_SERIAL_TREE_LEARNER_H_ -#include -#include -#include -#include -#include -#include - #include #include #include @@ -19,6 +12,13 @@ #include #include +#include +#include +#include +#include +#include +#include + #include "col_sampler.hpp" #include "data_partition.hpp" #include "feature_histogram.hpp" diff --git a/src/treelearner/split_info.hpp b/src/treelearner/split_info.hpp index 
492434d5160..72dd3fa324c 100644 --- a/src/treelearner/split_info.hpp +++ b/src/treelearner/split_info.hpp @@ -5,8 +5,6 @@ #ifndef LIGHTGBM_TREELEARNER_SPLIT_INFO_HPP_ #define LIGHTGBM_TREELEARNER_SPLIT_INFO_HPP_ -#include - #include #include #include @@ -14,6 +12,8 @@ #include #include +#include + namespace LightGBM { /*! diff --git a/src/treelearner/voting_parallel_tree_learner.cpp b/src/treelearner/voting_parallel_tree_learner.cpp index 58f5b88d6b0..043bf0e11b3 100644 --- a/src/treelearner/voting_parallel_tree_learner.cpp +++ b/src/treelearner/voting_parallel_tree_learner.cpp @@ -2,12 +2,13 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ -#include #include #include #include +#include + #include "parallel_tree_learner.h" namespace LightGBM { From 7a6bbb5bd37f7baaf0d929f45f2e5989f786dcfa Mon Sep 17 00:00:00 2001 From: Nikita Titov Date: Fri, 5 Jun 2020 18:53:07 +0300 Subject: [PATCH 066/119] Revert "re-order includes (fixes #3132) (#3133)" (#3153) This reverts commit 656d2676c2174781c91747ba158cb6d27f4cacbd. --- R-package/src/lightgbm_R.cpp | 14 +++++++------- include/LightGBM/application.h | 6 +++--- include/LightGBM/bin.h | 8 ++++---- include/LightGBM/boosting.h | 6 +++--- include/LightGBM/c_api.h | 4 ++-- include/LightGBM/config.h | 10 +++++----- include/LightGBM/dataset.h | 14 +++++++------- include/LightGBM/dataset_loader.h | 4 ++-- include/LightGBM/feature_group.h | 8 ++++---- include/LightGBM/metric.h | 6 +++--- include/LightGBM/network.h | 8 ++++---- include/LightGBM/objective_function.h | 6 +++--- include/LightGBM/prediction_early_stop.h | 4 ++-- include/LightGBM/tree.h | 6 +++--- include/LightGBM/tree_learner.h | 6 +++--- include/LightGBM/utils/array_args.h | 6 +++--- include/LightGBM/utils/common.h | 3 +++ include/LightGBM/utils/openmp_wrapper.h | 8 ++++---- include/LightGBM/utils/pipeline_reader.h | 6 +++--- include/LightGBM/utils/text_reader.h | 8 ++++---- include/LightGBM/utils/threading.h | 8 ++++---- src/application/application.cpp | 17 ++++++++--------- src/application/predictor.hpp | 12 ++++++------ src/boosting/dart.hpp | 4 ++-- src/boosting/gbdt.cpp | 9 ++++----- src/boosting/gbdt.h | 12 ++++++------ src/boosting/gbdt_model_text.cpp | 9 ++++----- src/boosting/gbdt_prediction.cpp | 4 ++-- src/boosting/goss.hpp | 8 ++++---- src/boosting/prediction_early_stop.cpp | 7 +++---- src/boosting/rf.hpp | 6 +++--- src/boosting/score_updater.hpp | 6 +++--- src/c_api.cpp | 9 --------- src/io/bin.cpp | 11 +++++------ src/io/config.cpp | 5 ++--- src/io/dataset.cpp | 18 +++++++++--------- src/io/dataset_loader.cpp | 5 ++--- src/io/dense_bin.hpp | 4 ++-- src/io/file_io.cpp | 7 +++---- src/io/json11.cpp | 7 +++---- src/io/metadata.cpp | 5 ++--- src/io/multi_val_dense_bin.hpp | 6 +++--- src/io/multi_val_sparse_bin.hpp | 6 +++--- src/io/parser.cpp | 3 +-- src/io/parser.hpp | 8 ++++---- src/io/sparse_bin.hpp | 8 ++++---- src/io/tree.cpp | 9 ++++----- src/main.cpp | 3 +-- src/metric/binary_metric.hpp | 8 ++++---- src/metric/dcg_calculator.cpp | 5 ++--- src/metric/map_metric.hpp | 10 +++++----- src/metric/multiclass_metric.hpp | 6 +++--- src/metric/rank_metric.hpp | 8 ++++---- src/metric/regression_metric.hpp | 6 +++--- src/metric/xentropy_metric.hpp | 10 +++++----- src/network/linker_topo.cpp | 7 +++---- src/network/linkers.h | 10 +++++----- src/network/linkers_socket.cpp | 8 ++++---- src/network/network.cpp | 7 +++---- src/network/socket_wrapper.hpp | 4 ++-- 
src/objective/binary_objective.hpp | 6 +++--- src/objective/multiclass_objective.hpp | 6 +++--- src/objective/rank_objective.hpp | 6 +++--- src/objective/regression_objective.hpp | 8 ++++---- src/objective/xentropy_objective.hpp | 8 ++++---- src/treelearner/col_sampler.hpp | 3 --- .../cost_effective_gradient_boosting.hpp | 4 ++-- src/treelearner/data_partition.hpp | 8 ++++---- src/treelearner/feature_histogram.hpp | 8 ++++---- src/treelearner/gpu_tree_learner.cpp | 4 ++-- src/treelearner/gpu_tree_learner.h | 12 ++++++------ src/treelearner/leaf_splits.hpp | 4 ++-- src/treelearner/parallel_tree_learner.h | 6 +++--- src/treelearner/serial_tree_learner.cpp | 10 +++++----- src/treelearner/serial_tree_learner.h | 14 +++++++------- src/treelearner/split_info.hpp | 4 ++-- .../voting_parallel_tree_learner.cpp | 3 +-- 77 files changed, 267 insertions(+), 293 deletions(-) diff --git a/R-package/src/lightgbm_R.cpp b/R-package/src/lightgbm_R.cpp index 14609272fa3..f3165e1fa1a 100644 --- a/R-package/src/lightgbm_R.cpp +++ b/R-package/src/lightgbm_R.cpp @@ -5,13 +5,6 @@ #include "lightgbm_R.h" -#include -#include -#include -#include -#include -#include - #include #include #include @@ -19,6 +12,13 @@ #include +#include +#include +#include +#include +#include +#include + #define COL_MAJOR (0) #define R_API_BEGIN() \ diff --git a/include/LightGBM/application.h b/include/LightGBM/application.h index 53f9732edea..911dedd7d94 100644 --- a/include/LightGBM/application.h +++ b/include/LightGBM/application.h @@ -5,12 +5,12 @@ #ifndef LIGHTGBM_APPLICATION_H_ #define LIGHTGBM_APPLICATION_H_ -#include -#include - #include #include +#include +#include + namespace LightGBM { class DatasetLoader; diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index fab69d9ba89..96ae6a8d641 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -5,6 +5,10 @@ #ifndef LIGHTGBM_BIN_H_ #define LIGHTGBM_BIN_H_ +#include +#include +#include + #include #include #include @@ -12,10 +16,6 @@ #include #include -#include -#include -#include - namespace LightGBM { enum BinType { diff --git a/include/LightGBM/boosting.h b/include/LightGBM/boosting.h index f456d798977..31bb430f0ae 100644 --- a/include/LightGBM/boosting.h +++ b/include/LightGBM/boosting.h @@ -5,14 +5,14 @@ #ifndef LIGHTGBM_BOOSTING_H_ #define LIGHTGBM_BOOSTING_H_ +#include +#include + #include #include #include #include -#include -#include - namespace LightGBM { /*! \brief forward declaration */ diff --git a/include/LightGBM/c_api.h b/include/LightGBM/c_api.h index 553982eefed..3fbccdac075 100644 --- a/include/LightGBM/c_api.h +++ b/include/LightGBM/c_api.h @@ -13,12 +13,12 @@ #ifndef LIGHTGBM_C_API_H_ #define LIGHTGBM_C_API_H_ +#include + #include #include #include -#include - typedef void* DatasetHandle; /*!< \brief Handle of dataset. */ typedef void* BoosterHandle; /*!< \brief Handle of booster. */ diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index bbb62727623..162c7583dc7 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -11,6 +11,11 @@ #ifndef LIGHTGBM_CONFIG_H_ #define LIGHTGBM_CONFIG_H_ +#include +#include +#include +#include + #include #include #include @@ -18,11 +23,6 @@ #include #include -#include -#include -#include -#include - namespace LightGBM { /*! 
\brief Types of tasks */ diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index bd0143daffd..e4c5dc56511 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -5,6 +5,13 @@ #ifndef LIGHTGBM_DATASET_H_ #define LIGHTGBM_DATASET_H_ +#include +#include +#include +#include +#include +#include + #include #include #include @@ -13,13 +20,6 @@ #include #include -#include -#include -#include -#include -#include -#include - namespace LightGBM { /*! \brief forward declaration */ diff --git a/include/LightGBM/dataset_loader.h b/include/LightGBM/dataset_loader.h index 88443d62472..8d5b20b481f 100644 --- a/include/LightGBM/dataset_loader.h +++ b/include/LightGBM/dataset_loader.h @@ -5,12 +5,12 @@ #ifndef LIGHTGBM_DATASET_LOADER_H_ #define LIGHTGBM_DATASET_LOADER_H_ +#include + #include #include #include -#include - namespace LightGBM { class DatasetLoader { diff --git a/include/LightGBM/feature_group.h b/include/LightGBM/feature_group.h index c21ad33b6a4..d949beec20e 100644 --- a/include/LightGBM/feature_group.h +++ b/include/LightGBM/feature_group.h @@ -5,14 +5,14 @@ #ifndef LIGHTGBM_FEATURE_GROUP_H_ #define LIGHTGBM_FEATURE_GROUP_H_ -#include -#include -#include - #include #include #include +#include +#include +#include + namespace LightGBM { class Dataset; diff --git a/include/LightGBM/metric.h b/include/LightGBM/metric.h index 56fec3aad77..61d9fc99ea8 100644 --- a/include/LightGBM/metric.h +++ b/include/LightGBM/metric.h @@ -5,9 +5,6 @@ #ifndef LIGHTGBM_METRIC_H_ #define LIGHTGBM_METRIC_H_ -#include -#include - #include #include #include @@ -15,6 +12,9 @@ #include #include +#include +#include + namespace LightGBM { /*! diff --git a/include/LightGBM/network.h b/include/LightGBM/network.h index 40373508eb5..32c24fe6984 100644 --- a/include/LightGBM/network.h +++ b/include/LightGBM/network.h @@ -5,14 +5,14 @@ #ifndef LIGHTGBM_NETWORK_H_ #define LIGHTGBM_NETWORK_H_ -#include -#include -#include - #include #include #include +#include +#include +#include + namespace LightGBM { /*! \brief forward declaration */ diff --git a/include/LightGBM/objective_function.h b/include/LightGBM/objective_function.h index 76b3f7145ef..5ea838dece2 100644 --- a/include/LightGBM/objective_function.h +++ b/include/LightGBM/objective_function.h @@ -5,13 +5,13 @@ #ifndef LIGHTGBM_OBJECTIVE_FUNCTION_H_ #define LIGHTGBM_OBJECTIVE_FUNCTION_H_ -#include -#include - #include #include #include +#include +#include + namespace LightGBM { /*! * \brief The interface of Objective Function. 
diff --git a/include/LightGBM/prediction_early_stop.h b/include/LightGBM/prediction_early_stop.h index 40db533325f..1d3e510981f 100644 --- a/include/LightGBM/prediction_early_stop.h +++ b/include/LightGBM/prediction_early_stop.h @@ -5,11 +5,11 @@ #ifndef LIGHTGBM_PREDICTION_EARLY_STOP_H_ #define LIGHTGBM_PREDICTION_EARLY_STOP_H_ +#include + #include #include -#include - namespace LightGBM { struct PredictionEarlyStopInstance { diff --git a/include/LightGBM/tree.h b/include/LightGBM/tree.h index f370bc74213..5ce3ff9b3eb 100644 --- a/include/LightGBM/tree.h +++ b/include/LightGBM/tree.h @@ -5,15 +5,15 @@ #ifndef LIGHTGBM_TREE_H_ #define LIGHTGBM_TREE_H_ +#include +#include + #include #include #include #include #include -#include -#include - namespace LightGBM { #define kCategoricalMask (1) diff --git a/include/LightGBM/tree_learner.h b/include/LightGBM/tree_learner.h index cdb3d2823b8..6c549a5ed71 100644 --- a/include/LightGBM/tree_learner.h +++ b/include/LightGBM/tree_learner.h @@ -5,13 +5,13 @@ #ifndef LIGHTGBM_TREE_LEARNER_H_ #define LIGHTGBM_TREE_LEARNER_H_ -#include -#include - #include #include #include +#include +#include + namespace LightGBM { using json11::Json; diff --git a/include/LightGBM/utils/array_args.h b/include/LightGBM/utils/array_args.h index a071247fb28..0183ecc22dd 100644 --- a/include/LightGBM/utils/array_args.h +++ b/include/LightGBM/utils/array_args.h @@ -5,13 +5,13 @@ #ifndef LIGHTGBM_UTILS_ARRAY_AGRS_H_ #define LIGHTGBM_UTILS_ARRAY_AGRS_H_ +#include +#include + #include #include #include -#include -#include - namespace LightGBM { /*! diff --git a/include/LightGBM/utils/common.h b/include/LightGBM/utils/common.h index 663ea1730d3..0e26ee84faa 100644 --- a/include/LightGBM/utils/common.h +++ b/include/LightGBM/utils/common.h @@ -5,6 +5,9 @@ #ifndef LIGHTGBM_UTILS_COMMON_FUN_H_ #define LIGHTGBM_UTILS_COMMON_FUN_H_ +#include +#include + #include #include #include diff --git a/include/LightGBM/utils/openmp_wrapper.h b/include/LightGBM/utils/openmp_wrapper.h index fdd4b3850fb..075c991371c 100644 --- a/include/LightGBM/utils/openmp_wrapper.h +++ b/include/LightGBM/utils/openmp_wrapper.h @@ -6,16 +6,16 @@ #define LIGHTGBM_OPENMP_WRAPPER_H_ #ifdef _OPENMP +#include + +#include + #include #include #include #include #include -#include - -#include - inline int OMP_NUM_THREADS() { int ret = 1; #pragma omp parallel diff --git a/include/LightGBM/utils/pipeline_reader.h b/include/LightGBM/utils/pipeline_reader.h index 4e07b8b3674..f02500c9751 100644 --- a/include/LightGBM/utils/pipeline_reader.h +++ b/include/LightGBM/utils/pipeline_reader.h @@ -5,6 +5,9 @@ #ifndef LIGHTGBM_UTILS_PIPELINE_READER_H_ #define LIGHTGBM_UTILS_PIPELINE_READER_H_ +#include +#include + #include #include #include @@ -13,9 +16,6 @@ #include #include -#include -#include - namespace LightGBM { /*! 
diff --git a/include/LightGBM/utils/text_reader.h b/include/LightGBM/utils/text_reader.h index 7aaf7f8153a..638bb268362 100644 --- a/include/LightGBM/utils/text_reader.h +++ b/include/LightGBM/utils/text_reader.h @@ -5,16 +5,16 @@ #ifndef LIGHTGBM_UTILS_TEXT_READER_H_ #define LIGHTGBM_UTILS_TEXT_READER_H_ +#include +#include +#include + #include #include #include #include #include -#include -#include -#include - namespace LightGBM { const size_t kGbs = size_t(1024) * 1024 * 1024; diff --git a/include/LightGBM/utils/threading.h b/include/LightGBM/utils/threading.h index dcf4f7608af..d293fc811eb 100644 --- a/include/LightGBM/utils/threading.h +++ b/include/LightGBM/utils/threading.h @@ -6,14 +6,14 @@ #ifndef LIGHTGBM_UTILS_THREADING_H_ #define LIGHTGBM_UTILS_THREADING_H_ -#include -#include -#include - #include #include #include +#include +#include +#include + namespace LightGBM { class Threading { diff --git a/src/application/application.cpp b/src/application/application.cpp index a46cf419c53..1b9eabf8a12 100644 --- a/src/application/application.cpp +++ b/src/application/application.cpp @@ -2,15 +2,6 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ - -#include -#include -#include -#include -#include -#include -#include - #include #include @@ -24,6 +15,14 @@ #include #include +#include +#include +#include +#include +#include +#include +#include + #include "predictor.hpp" #ifdef USE_CUDA diff --git a/src/application/predictor.hpp b/src/application/predictor.hpp index ab775d42913..1c56cfa5eb2 100644 --- a/src/application/predictor.hpp +++ b/src/application/predictor.hpp @@ -5,6 +5,12 @@ #ifndef LIGHTGBM_PREDICTOR_HPP_ #define LIGHTGBM_PREDICTOR_HPP_ +#include +#include +#include +#include +#include + #include #include #include @@ -15,12 +21,6 @@ #include #include -#include -#include -#include -#include -#include - namespace LightGBM { /*! diff --git a/src/boosting/dart.hpp b/src/boosting/dart.hpp index b9dca6a78f2..e2481e79772 100644 --- a/src/boosting/dart.hpp +++ b/src/boosting/dart.hpp @@ -5,14 +5,14 @@ #ifndef LIGHTGBM_BOOSTING_DART_H_ #define LIGHTGBM_BOOSTING_DART_H_ +#include + #include #include #include #include #include -#include - #include "gbdt.h" #include "score_updater.hpp" diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 6199d726df9..24264c3c175 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -2,13 +2,8 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. 
*/ - #include "gbdt.h" -#include -#include -#include - #include #include #include @@ -16,6 +11,10 @@ #include #include +#include +#include +#include + namespace LightGBM { #ifdef USE_CUDA diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index 420d5479e04..02476f810a8 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -5,6 +5,12 @@ #ifndef LIGHTGBM_BOOSTING_GBDT_H_ #define LIGHTGBM_BOOSTING_GBDT_H_ +#include +#include +#include +#include +#include + #include #include #include @@ -16,12 +22,6 @@ #include #include -#include -#include -#include -#include -#include - #include "score_updater.hpp" #ifdef USE_CUDA diff --git a/src/boosting/gbdt_model_text.cpp b/src/boosting/gbdt_model_text.cpp index 9ac4b269ac1..5ce26bca95c 100644 --- a/src/boosting/gbdt_model_text.cpp +++ b/src/boosting/gbdt_model_text.cpp @@ -2,17 +2,16 @@ * Copyright (c) 2017 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ - -#include -#include -#include - #include #include #include #include #include +#include +#include +#include + #include "gbdt.h" namespace LightGBM { diff --git a/src/boosting/gbdt_prediction.cpp b/src/boosting/gbdt_prediction.cpp index e906bc0aaca..b4711f7c01a 100644 --- a/src/boosting/gbdt_prediction.cpp +++ b/src/boosting/gbdt_prediction.cpp @@ -2,12 +2,12 @@ * Copyright (c) 2017 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ -#include "gbdt.h" - #include #include #include +#include "gbdt.h" + namespace LightGBM { void GBDT::PredictRaw(const double* features, double* output, const PredictionEarlyStopInstance* early_stop) const { diff --git a/src/boosting/goss.hpp b/src/boosting/goss.hpp index d3a3c6a344c..2af6dee14f6 100644 --- a/src/boosting/goss.hpp +++ b/src/boosting/goss.hpp @@ -5,6 +5,10 @@ #ifndef LIGHTGBM_BOOSTING_GOSS_H_ #define LIGHTGBM_BOOSTING_GOSS_H_ +#include +#include +#include + #include #include #include @@ -12,10 +16,6 @@ #include #include -#include -#include -#include - #include "gbdt.h" #include "score_updater.hpp" diff --git a/src/boosting/prediction_early_stop.cpp b/src/boosting/prediction_early_stop.cpp index 7eda08f00d6..7e21141f685 100644 --- a/src/boosting/prediction_early_stop.cpp +++ b/src/boosting/prediction_early_stop.cpp @@ -2,16 +2,15 @@ * Copyright (c) 2017 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ +#include + +#include #include #include #include #include -#include - -#include - namespace LightGBM { PredictionEarlyStopInstance CreateNone(const PredictionEarlyStopConfig&) { diff --git a/src/boosting/rf.hpp b/src/boosting/rf.hpp index 8bbc1a3ebee..e64bf6cb4d8 100644 --- a/src/boosting/rf.hpp +++ b/src/boosting/rf.hpp @@ -5,6 +5,9 @@ #ifndef LIGHTGBM_BOOSTING_RF_H_ #define LIGHTGBM_BOOSTING_RF_H_ +#include +#include + #include #include #include @@ -12,9 +15,6 @@ #include #include -#include -#include - #include "gbdt.h" #include "score_updater.hpp" diff --git a/src/boosting/score_updater.hpp b/src/boosting/score_updater.hpp index 231de245068..7446691a470 100644 --- a/src/boosting/score_updater.hpp +++ b/src/boosting/score_updater.hpp @@ -5,15 +5,15 @@ #ifndef LIGHTGBM_BOOSTING_SCORE_UPDATER_HPP_ #define LIGHTGBM_BOOSTING_SCORE_UPDATER_HPP_ -#include -#include - #include #include #include #include #include +#include +#include + namespace LightGBM { /*! 
* \brief Used to store and update score for data diff --git a/src/c_api.cpp b/src/c_api.cpp index 979ab104b74..4820690fd7e 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -2,15 +2,6 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ - -#include -#include -#include -#include -#include -#include -#include - #include #include diff --git a/src/io/bin.cpp b/src/io/bin.cpp index 9ead232fda0..367edaa3f7b 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -2,18 +2,17 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ - -#include -#include -#include -#include - #include #include #include #include +#include +#include +#include +#include + #include "dense_bin.hpp" #include "multi_val_dense_bin.hpp" #include "multi_val_sparse_bin.hpp" diff --git a/src/io/config.cpp b/src/io/config.cpp index 4e4d8dbc794..18c0562a676 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -2,15 +2,14 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ - -#include - #include #include #include #include +#include + namespace LightGBM { void Config::KV2Map(std::unordered_map* params, const char* kv) { diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 1001a9432ce..3d4e29be1fe 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -4,9 +4,12 @@ * license information. */ -#ifdef USE_CUDA -#include -#endif +#include + +#include +#include +#include +#include #include #include @@ -14,12 +17,9 @@ #include #include -#include - -#include -#include -#include -#include +#ifdef USE_CUDA +#include +#endif namespace LightGBM { diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index 33ce1df7eb0..c0b2edf1a8c 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -2,9 +2,6 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ - -#include - #include #include @@ -13,6 +10,8 @@ #include #include +#include + namespace LightGBM { using json11::Json; diff --git a/src/io/dense_bin.hpp b/src/io/dense_bin.hpp index 10d988b68e0..99feadf9f7f 100644 --- a/src/io/dense_bin.hpp +++ b/src/io/dense_bin.hpp @@ -6,6 +6,8 @@ #ifndef LIGHTGBM_IO_DENSE_BIN_HPP_ #define LIGHTGBM_IO_DENSE_BIN_HPP_ +#include + #include #include #include @@ -16,8 +18,6 @@ #include // LGBM_CUDA -#include - namespace LightGBM { template diff --git a/src/io/file_io.cpp b/src/io/file_io.cpp index 67a719de0a8..a205964287e 100644 --- a/src/io/file_io.cpp +++ b/src/io/file_io.cpp @@ -3,15 +3,14 @@ * Licensed under the MIT License. See LICENSE file in the project root for * license information. */ +#include + +#include #include #include #include -#include - -#include - #ifdef USE_HDFS #include #endif diff --git a/src/io/json11.cpp b/src/io/json11.cpp index a3fec7724b5..db21c6aab54 100644 --- a/src/io/json11.cpp +++ b/src/io/json11.cpp @@ -18,16 +18,15 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. 
*/ +#include + +#include #include #include #include #include -#include - -#include - namespace json11 { static const int max_depth = 200; diff --git a/src/io/metadata.cpp b/src/io/metadata.cpp index 9b540045650..ea0d5b08def 100644 --- a/src/io/metadata.cpp +++ b/src/io/metadata.cpp @@ -2,13 +2,12 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ +#include +#include #include #include -#include -#include - namespace LightGBM { Metadata::Metadata() { diff --git a/src/io/multi_val_dense_bin.hpp b/src/io/multi_val_dense_bin.hpp index d7c6599c381..7645530d774 100644 --- a/src/io/multi_val_dense_bin.hpp +++ b/src/io/multi_val_dense_bin.hpp @@ -5,14 +5,14 @@ #ifndef LIGHTGBM_IO_MULTI_VAL_DENSE_BIN_HPP_ #define LIGHTGBM_IO_MULTI_VAL_DENSE_BIN_HPP_ +#include +#include + #include #include #include #include -#include -#include - namespace LightGBM { template diff --git a/src/io/multi_val_sparse_bin.hpp b/src/io/multi_val_sparse_bin.hpp index 09c13420c66..ec3f64a11a0 100644 --- a/src/io/multi_val_sparse_bin.hpp +++ b/src/io/multi_val_sparse_bin.hpp @@ -5,14 +5,14 @@ #ifndef LIGHTGBM_IO_MULTI_VAL_SPARSE_BIN_HPP_ #define LIGHTGBM_IO_MULTI_VAL_SPARSE_BIN_HPP_ +#include +#include + #include #include #include #include -#include -#include - namespace LightGBM { template diff --git a/src/io/parser.cpp b/src/io/parser.cpp index c30da4305f9..df14ea87a99 100644 --- a/src/io/parser.cpp +++ b/src/io/parser.cpp @@ -2,6 +2,7 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ +#include "parser.hpp" #include #include @@ -10,8 +11,6 @@ #include #include -#include "parser.hpp" - namespace LightGBM { void GetStatistic(const char* str, int* comma_cnt, int* tab_cnt, int* colon_cnt) { diff --git a/src/io/parser.hpp b/src/io/parser.hpp index 43764a68e75..1cfde0635a5 100644 --- a/src/io/parser.hpp +++ b/src/io/parser.hpp @@ -5,14 +5,14 @@ #ifndef LIGHTGBM_IO_PARSER_HPP_ #define LIGHTGBM_IO_PARSER_HPP_ -#include -#include -#include - #include #include #include +#include +#include +#include + namespace LightGBM { class CSVParser: public Parser { diff --git a/src/io/sparse_bin.hpp b/src/io/sparse_bin.hpp index 730ea161a2a..c56cd6da99d 100644 --- a/src/io/sparse_bin.hpp +++ b/src/io/sparse_bin.hpp @@ -6,6 +6,10 @@ #ifndef LIGHTGBM_IO_SPARSE_BIN_HPP_ #define LIGHTGBM_IO_SPARSE_BIN_HPP_ +#include +#include +#include + #include #include #include @@ -13,10 +17,6 @@ #include #include -#include -#include -#include - namespace LightGBM { template diff --git a/src/io/tree.cpp b/src/io/tree.cpp index 759d334ef98..63641311787 100644 --- a/src/io/tree.cpp +++ b/src/io/tree.cpp @@ -2,17 +2,16 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ - -#include -#include -#include - #include #include #include #include +#include +#include +#include + namespace LightGBM { Tree::Tree(int max_leaves, bool track_branch_features) diff --git a/src/main.cpp b/src/main.cpp index 0a8931ae4ff..ef277ac0c1f 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -2,11 +2,10 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. 
*/ +#include #include -#include - #include "network/linkers.h" int main(int argc, char** argv) { diff --git a/src/metric/binary_metric.hpp b/src/metric/binary_metric.hpp index 5bde453cdba..00a51d6cd24 100644 --- a/src/metric/binary_metric.hpp +++ b/src/metric/binary_metric.hpp @@ -5,15 +5,15 @@ #ifndef LIGHTGBM_METRIC_BINARY_METRIC_HPP_ #define LIGHTGBM_METRIC_BINARY_METRIC_HPP_ +#include +#include +#include + #include #include #include #include -#include -#include -#include - namespace LightGBM { /*! diff --git a/src/metric/dcg_calculator.cpp b/src/metric/dcg_calculator.cpp index cd477612bdc..58843d89f9e 100644 --- a/src/metric/dcg_calculator.cpp +++ b/src/metric/dcg_calculator.cpp @@ -2,14 +2,13 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ +#include +#include #include #include #include -#include -#include - namespace LightGBM { /*! \brief Declaration for some static members */ diff --git a/src/metric/map_metric.hpp b/src/metric/map_metric.hpp index b373a02b56f..18539ee44ee 100644 --- a/src/metric/map_metric.hpp +++ b/src/metric/map_metric.hpp @@ -5,16 +5,16 @@ #ifndef LIGHTGBM_METRIC_MAP_METRIC_HPP_ #define LIGHTGBM_METRIC_MAP_METRIC_HPP_ -#include -#include -#include -#include - #include #include #include #include +#include +#include +#include +#include + namespace LightGBM { class MapMetric:public Metric { diff --git a/src/metric/multiclass_metric.hpp b/src/metric/multiclass_metric.hpp index f60588cac3f..59548cd3a79 100644 --- a/src/metric/multiclass_metric.hpp +++ b/src/metric/multiclass_metric.hpp @@ -5,14 +5,14 @@ #ifndef LIGHTGBM_METRIC_MULTICLASS_METRIC_HPP_ #define LIGHTGBM_METRIC_MULTICLASS_METRIC_HPP_ +#include +#include + #include #include #include #include -#include -#include - namespace LightGBM { /*! * \brief Metric for multiclass task. diff --git a/src/metric/rank_metric.hpp b/src/metric/rank_metric.hpp index d9227502009..3b3afb547eb 100644 --- a/src/metric/rank_metric.hpp +++ b/src/metric/rank_metric.hpp @@ -5,15 +5,15 @@ #ifndef LIGHTGBM_METRIC_RANK_METRIC_HPP_ #define LIGHTGBM_METRIC_RANK_METRIC_HPP_ -#include -#include -#include - #include #include #include #include +#include +#include +#include + namespace LightGBM { class NDCGMetric:public Metric { diff --git a/src/metric/regression_metric.hpp b/src/metric/regression_metric.hpp index 3085bc941b3..4d1a3662142 100644 --- a/src/metric/regression_metric.hpp +++ b/src/metric/regression_metric.hpp @@ -5,14 +5,14 @@ #ifndef LIGHTGBM_METRIC_REGRESSION_METRIC_HPP_ #define LIGHTGBM_METRIC_REGRESSION_METRIC_HPP_ +#include +#include + #include #include #include #include -#include -#include - namespace LightGBM { /*! * \brief Metric for regression task. diff --git a/src/metric/xentropy_metric.hpp b/src/metric/xentropy_metric.hpp index 1b86e60e640..bec611d28e5 100644 --- a/src/metric/xentropy_metric.hpp +++ b/src/metric/xentropy_metric.hpp @@ -5,16 +5,16 @@ #ifndef LIGHTGBM_METRIC_XENTROPY_METRIC_HPP_ #define LIGHTGBM_METRIC_XENTROPY_METRIC_HPP_ -#include -#include -#include -#include - #include #include #include #include +#include +#include +#include +#include + /* * Implements three related metrics: * diff --git a/src/network/linker_topo.cpp b/src/network/linker_topo.cpp index 102fdc993cd..1d7b2990f0e 100644 --- a/src/network/linker_topo.cpp +++ b/src/network/linker_topo.cpp @@ -2,15 +2,14 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. 
See LICENSE file in the project root for license information. */ +#include +#include +#include #include #include #include -#include -#include -#include - namespace LightGBM { diff --git a/src/network/linkers.h b/src/network/linkers.h index 5a91d40f73d..8ddbb902a15 100644 --- a/src/network/linkers.h +++ b/src/network/linkers.h @@ -5,6 +5,11 @@ #ifndef LIGHTGBM_NETWORK_LINKERS_H_ #define LIGHTGBM_NETWORK_LINKERS_H_ +#include +#include +#include +#include + #include #include #include @@ -13,11 +18,6 @@ #include #include -#include -#include -#include -#include - #ifdef USE_SOCKET #include "socket_wrapper.hpp" #endif diff --git a/src/network/linkers_socket.cpp b/src/network/linkers_socket.cpp index 708e3ae9bad..40c6de7aab6 100644 --- a/src/network/linkers_socket.cpp +++ b/src/network/linkers_socket.cpp @@ -4,6 +4,10 @@ */ #ifdef USE_SOCKET +#include +#include +#include + #include #include #include @@ -12,10 +16,6 @@ #include #include -#include -#include -#include - #include "linkers.h" namespace LightGBM { diff --git a/src/network/network.cpp b/src/network/network.cpp index cbc241d88c6..3976d72a692 100644 --- a/src/network/network.cpp +++ b/src/network/network.cpp @@ -2,14 +2,13 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ - -#include -#include - #include #include +#include +#include + #include "linkers.h" namespace LightGBM { diff --git a/src/network/socket_wrapper.hpp b/src/network/socket_wrapper.hpp index 87bd88f934a..70f9586b99c 100644 --- a/src/network/socket_wrapper.hpp +++ b/src/network/socket_wrapper.hpp @@ -6,13 +6,13 @@ #define LIGHTGBM_NETWORK_SOCKET_WRAPPER_HPP_ #ifdef USE_SOCKET +#include + #include #include #include #include -#include - #if defined(_WIN32) #ifdef _MSC_VER diff --git a/src/objective/binary_objective.hpp b/src/objective/binary_objective.hpp index dff3a9c1097..4861bd1b83f 100644 --- a/src/objective/binary_objective.hpp +++ b/src/objective/binary_objective.hpp @@ -5,15 +5,15 @@ #ifndef LIGHTGBM_OBJECTIVE_BINARY_OBJECTIVE_HPP_ #define LIGHTGBM_OBJECTIVE_BINARY_OBJECTIVE_HPP_ +#include +#include + #include #include #include #include #include -#include -#include - namespace LightGBM { /*! * \brief Objective function for binary classification diff --git a/src/objective/multiclass_objective.hpp b/src/objective/multiclass_objective.hpp index da604baeb49..c133e1f75fd 100644 --- a/src/objective/multiclass_objective.hpp +++ b/src/objective/multiclass_objective.hpp @@ -5,6 +5,9 @@ #ifndef LIGHTGBM_OBJECTIVE_MULTICLASS_OBJECTIVE_HPP_ #define LIGHTGBM_OBJECTIVE_MULTICLASS_OBJECTIVE_HPP_ +#include +#include + #include #include #include @@ -12,9 +15,6 @@ #include #include -#include -#include - #include "binary_objective.hpp" namespace LightGBM { diff --git a/src/objective/rank_objective.hpp b/src/objective/rank_objective.hpp index 730d358f030..1b20721e238 100644 --- a/src/objective/rank_objective.hpp +++ b/src/objective/rank_objective.hpp @@ -6,6 +6,9 @@ #ifndef LIGHTGBM_OBJECTIVE_RANK_OBJECTIVE_HPP_ #define LIGHTGBM_OBJECTIVE_RANK_OBJECTIVE_HPP_ +#include +#include + #include #include #include @@ -14,9 +17,6 @@ #include #include -#include -#include - namespace LightGBM { /*! 
diff --git a/src/objective/regression_objective.hpp b/src/objective/regression_objective.hpp index fe391f2c15f..53fa7020be5 100644 --- a/src/objective/regression_objective.hpp +++ b/src/objective/regression_objective.hpp @@ -5,14 +5,14 @@ #ifndef LIGHTGBM_OBJECTIVE_REGRESSION_OBJECTIVE_HPP_ #define LIGHTGBM_OBJECTIVE_REGRESSION_OBJECTIVE_HPP_ -#include -#include -#include - #include #include #include +#include +#include +#include + namespace LightGBM { #define PercentileFun(T, data_reader, cnt_data, alpha) \ diff --git a/src/objective/xentropy_objective.hpp b/src/objective/xentropy_objective.hpp index 3a145bd2cbb..cad2b8faafd 100644 --- a/src/objective/xentropy_objective.hpp +++ b/src/objective/xentropy_objective.hpp @@ -5,16 +5,16 @@ #ifndef LIGHTGBM_OBJECTIVE_XENTROPY_OBJECTIVE_HPP_ #define LIGHTGBM_OBJECTIVE_XENTROPY_OBJECTIVE_HPP_ +#include +#include +#include + #include #include #include #include #include -#include -#include -#include - /* * Implements gradients and hessians for the following point losses. * Target y is anything in interval [0, 1]. diff --git a/src/treelearner/col_sampler.hpp b/src/treelearner/col_sampler.hpp index 4876827eb39..cd288481255 100644 --- a/src/treelearner/col_sampler.hpp +++ b/src/treelearner/col_sampler.hpp @@ -6,9 +6,6 @@ #ifndef LIGHTGBM_TREELEARNER_COL_SAMPLER_HPP_ #define LIGHTGBM_TREELEARNER_COL_SAMPLER_HPP_ -#include -#include - #include #include #include diff --git a/src/treelearner/cost_effective_gradient_boosting.hpp b/src/treelearner/cost_effective_gradient_boosting.hpp index fda2bffc820..fad966e6487 100644 --- a/src/treelearner/cost_effective_gradient_boosting.hpp +++ b/src/treelearner/cost_effective_gradient_boosting.hpp @@ -5,13 +5,13 @@ #ifndef LIGHTGBM_TREELEARNER_COST_EFFECTIVE_GRADIENT_BOOSTING_HPP_ #define LIGHTGBM_TREELEARNER_COST_EFFECTIVE_GRADIENT_BOOSTING_HPP_ -#include - #include #include #include #include +#include + #include "data_partition.hpp" #include "serial_tree_learner.h" #include "split_info.hpp" diff --git a/src/treelearner/data_partition.hpp b/src/treelearner/data_partition.hpp index bdae960c1d9..01c5d2606e7 100644 --- a/src/treelearner/data_partition.hpp +++ b/src/treelearner/data_partition.hpp @@ -5,15 +5,15 @@ #ifndef LIGHTGBM_TREELEARNER_DATA_PARTITION_HPP_ #define LIGHTGBM_TREELEARNER_DATA_PARTITION_HPP_ -#include -#include -#include - #include #include #include #include +#include +#include +#include + namespace LightGBM { /*! * \brief DataPartition is used to store the the partition of data on tree. 
diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp index 4dd5f29af0a..8916ee48fd4 100644 --- a/src/treelearner/feature_histogram.hpp +++ b/src/treelearner/feature_histogram.hpp @@ -6,6 +6,10 @@ #ifndef LIGHTGBM_TREELEARNER_FEATURE_HISTOGRAM_HPP_ #define LIGHTGBM_TREELEARNER_FEATURE_HISTOGRAM_HPP_ +#include +#include +#include + #include #include #include @@ -13,10 +17,6 @@ #include #include -#include -#include -#include - #include "monotone_constraints.hpp" #include "split_info.hpp" diff --git a/src/treelearner/gpu_tree_learner.cpp b/src/treelearner/gpu_tree_learner.cpp index 42967e138c7..fad02e1c044 100644 --- a/src/treelearner/gpu_tree_learner.cpp +++ b/src/treelearner/gpu_tree_learner.cpp @@ -6,12 +6,12 @@ #include "gpu_tree_learner.h" -#include - #include #include #include +#include + #include "../io/dense_bin.hpp" #define GPU_DEBUG 0 diff --git a/src/treelearner/gpu_tree_learner.h b/src/treelearner/gpu_tree_learner.h index c0607cbe4a2..598e8d40ac9 100644 --- a/src/treelearner/gpu_tree_learner.h +++ b/src/treelearner/gpu_tree_learner.h @@ -5,6 +5,12 @@ #ifndef LIGHTGBM_TREELEARNER_GPU_TREE_LEARNER_H_ #define LIGHTGBM_TREELEARNER_GPU_TREE_LEARNER_H_ +#include +#include +#include +#include +#include + #include #include #include @@ -12,12 +18,6 @@ #include #include -#include -#include -#include -#include -#include - #include "data_partition.hpp" #include "feature_histogram.hpp" #include "leaf_splits.hpp" diff --git a/src/treelearner/leaf_splits.hpp b/src/treelearner/leaf_splits.hpp index 5c94846eb7b..b0a753eafe1 100644 --- a/src/treelearner/leaf_splits.hpp +++ b/src/treelearner/leaf_splits.hpp @@ -5,11 +5,11 @@ #ifndef LIGHTGBM_TREELEARNER_LEAF_SPLITS_HPP_ #define LIGHTGBM_TREELEARNER_LEAF_SPLITS_HPP_ +#include + #include #include -#include - #include "data_partition.hpp" namespace LightGBM { diff --git a/src/treelearner/parallel_tree_learner.h b/src/treelearner/parallel_tree_learner.h index 5fbad7352e0..2fdf542d421 100644 --- a/src/treelearner/parallel_tree_learner.h +++ b/src/treelearner/parallel_tree_learner.h @@ -5,13 +5,13 @@ #ifndef LIGHTGBM_TREELEARNER_PARALLEL_TREE_LEARNER_H_ #define LIGHTGBM_TREELEARNER_PARALLEL_TREE_LEARNER_H_ +#include +#include + #include #include #include -#include -#include - #include "gpu_tree_learner.h" #include "serial_tree_learner.h" #include "cuda_tree_learner.h" diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 9aac1bdf1fd..e5b6626a6bd 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -4,16 +4,16 @@ */ #include "serial_tree_learner.h" -#include -#include -#include -#include - #include #include #include #include +#include +#include +#include +#include + #include "cost_effective_gradient_boosting.hpp" namespace LightGBM { diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 6fc0c2cb968..367c262192c 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -5,13 +5,6 @@ #ifndef LIGHTGBM_TREELEARNER_SERIAL_TREE_LEARNER_H_ #define LIGHTGBM_TREELEARNER_SERIAL_TREE_LEARNER_H_ -#include -#include -#include -#include -#include -#include - #include #include #include @@ -19,6 +12,13 @@ #include #include +#include +#include +#include +#include +#include +#include + #include "col_sampler.hpp" #include "data_partition.hpp" #include "feature_histogram.hpp" diff --git a/src/treelearner/split_info.hpp b/src/treelearner/split_info.hpp index 
72dd3fa324c..492434d5160 100644 --- a/src/treelearner/split_info.hpp +++ b/src/treelearner/split_info.hpp @@ -5,6 +5,8 @@ #ifndef LIGHTGBM_TREELEARNER_SPLIT_INFO_HPP_ #define LIGHTGBM_TREELEARNER_SPLIT_INFO_HPP_ +#include + #include #include #include @@ -12,8 +14,6 @@ #include #include -#include - namespace LightGBM { /*! diff --git a/src/treelearner/voting_parallel_tree_learner.cpp b/src/treelearner/voting_parallel_tree_learner.cpp index 043bf0e11b3..58f5b88d6b0 100644 --- a/src/treelearner/voting_parallel_tree_learner.cpp +++ b/src/treelearner/voting_parallel_tree_learner.cpp @@ -2,13 +2,12 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ +#include #include #include #include -#include - #include "parallel_tree_learner.h" namespace LightGBM { From 55f24ccbb3b3d2e8f598f8d2616c217f556e7084 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Thu, 11 Jun 2020 19:11:12 +0000 Subject: [PATCH 067/119] Missing change from previous rebase --- src/c_api.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/c_api.cpp b/src/c_api.cpp index 4820690fd7e..38957a13fc2 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -642,12 +642,6 @@ const char* LGBM_GetLastError() { return LastErrorMsg(); } -int LGBM_RegisterLogCallback(void (*callback)(const char*)) { - API_BEGIN(); - Log::ResetCallBack(callback); - API_END(); -} - int LGBM_GetDeviceType() { #ifdef USE_GPU return 1; From 8e028f3319ed2819814b39e28facaa33b569cba3 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Thu, 11 Jun 2020 19:38:52 +0000 Subject: [PATCH 068/119] Minor cleanup and removal of development scripts. --- CMakeLists.txt | 8 ++------ build_LGBM.232.sh | 7 ------- install_LGBM.232.sh | 7 ------- src/boosting/gbdt.cpp | 5 ----- test_LGBM.232.sh | 5 ----- 5 files changed, 2 insertions(+), 30 deletions(-) delete mode 100755 build_LGBM.232.sh delete mode 100755 install_LGBM.232.sh delete mode 100755 test_LGBM.232.sh diff --git a/CMakeLists.txt b/CMakeLists.txt index 7bcd068f3ca..3e237da69c9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -173,9 +173,6 @@ if(USE_CUDA) -DIGNORE_INDICES ) - #string(REPLACE ";" " " BASE_DEFINES "${BASE_DEFINES}") - #string(REPLACE ";" " " ALLFEATS_DEFINES "${ALLFEATS_DEFINES}") - #string(REPLACE ";" " " FULLDATA_DEFINES "${FULLDATA_DEFINES}") message(STATUS ALLFEATS_DEFINES: ${ALLFEATS_DEFINES}) message(STATUS FULLDATA_DEFINES: ${FULLDATA_DEFINES}) @@ -193,7 +190,6 @@ if(USE_CUDA) ) endfunction() - #foreach (hsize 16 64 256) foreach (hsize _16_64_256) add_histogram("${hsize}" "_sp_const" "True" "1" "${BASE_DEFINES}") add_histogram("${hsize}" "_sp" "True" "0" "${BASE_DEFINES}") @@ -310,9 +306,9 @@ file(GLOB SOURCES src/objective/*.cpp src/network/*.cpp src/treelearner/*.cpp -#ifdef USE_CUDA +if(USE_CUDA) src/treelearner/*.cu -#endif +endif(USE_CUDA) ) add_executable(lightgbm src/main.cpp ${SOURCES}) diff --git a/build_LGBM.232.sh b/build_LGBM.232.sh deleted file mode 100755 index f785d6556e6..00000000000 --- a/build_LGBM.232.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/usr/bin/bash -rm -rf build -mkdir build -cd build -cmake -DUSE_CUDA=1 .. -#cmake .. -make -j40 diff --git a/install_LGBM.232.sh b/install_LGBM.232.sh deleted file mode 100755 index 7af586f4722..00000000000 --- a/install_LGBM.232.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/usr/bin/bash -cd python-package -python setup.py bdist_wheel -pip uninstall -y lightgbm -cd dist -pip install lightgbm-*.whl -cd ../.. 
diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 24264c3c175..854e2af240c 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -68,11 +68,6 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective if (config_->device_type == std::string("cuda")) { // LGBM_config_::current_device=lgbm_device_cuda; moved to application.cpp LGBM_config_::current_learner=use_cuda_learner; - - /* Following are needed to ensure bagging required by the CUDA implementation */ -// if (config_->bagging_fraction == 1.0){config_->bagging_fraction = 0.8;} moved to application.cpp -// if (config_->bagging_freq == 0) {config_->bagging_freq = 1;} moved to application.cpp - } #endif diff --git a/test_LGBM.232.sh b/test_LGBM.232.sh deleted file mode 100755 index cd5146f959f..00000000000 --- a/test_LGBM.232.sh +++ /dev/null @@ -1,5 +0,0 @@ -python -m unittest tests/python_package_test/test_basic.py -python -m unittest tests/python_package_test/test_consistency.py -python -m unittest tests/python_package_test/test_engine.py -python -m unittest tests/python_package_test/test_plotting.py -python -m unittest tests/python_package_test/test_sklearn.py From f4725e1a1307a565422152f1411ee5f598fa1371 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 15 Jun 2020 17:20:04 +0000 Subject: [PATCH 069/119] Only set gpu_use_dp on by default for CUDA. Other minor change. --- src/boosting/gbdt.cpp | 5 ++++- src/io/config_auto.cpp | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 854e2af240c..546da0c0cbe 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -270,7 +270,10 @@ void GBDT::Bagging(int iter) { tmp_hessians_.resize(total_size); } - tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(), bag_data_cnt_, false); + tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(), + bag_data_cnt_, false); + tree_learner_->SetBaggingData(tmp_subset_.get(), bag_data_indices_.data(), + bag_data_cnt_); tree_learner_->ResetTrainingData(tmp_subset_.get(), is_constant_hessian_); } diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index 9408a97c70f..b0cd57deb69 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -613,9 +613,9 @@ void Config::GetMembersFromString(const std::unordered_map 0); From 0e84c152869d0d5b9b6440718cb9dbc770e7b874 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 15 Jun 2020 18:03:32 +0000 Subject: [PATCH 070/119] Fix python lint indentation problem. 
--- tests/python_package_test/test_engine.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 51ab32a239b..791ef94a4be 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -991,7 +991,7 @@ def train_and_get_predictions(features, labels): 'min_data': 5, } if lgb.get_device_type() == 2: - lgb_params["device"] = "cuda" + lgb_params["device"] = "cuda" gbm = lgb.train( params=lgb_params, train_set=dataset, @@ -1723,10 +1723,10 @@ def train_booster(params=params_obj_verbose, **kwargs): params_obj_class_3_verbose["device"] = "cuda" params_obj_class_1_verbose = {'objective': obj_multi_alias, 'num_class': 1, 'verbose': -1} if lgb.get_device_type() == 2: - params_obj_class_1_verbose["device"] = "cuda" + params_obj_class_1_verbose["device"] = "cuda" params_obj_verbose = {'objective': obj_multi_alias, 'verbose': -1} if lgb.get_device_type() == 2: - params_obj_verbose["device"] = "cuda" + params_obj_verbose["device"] = "cuda" # multiclass default metric res = get_cv_result(params_obj_class_3_verbose) self.assertEqual(len(res), 2) From ccf7602e7cb824d26128801f0329589524e8d482 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 15 Jun 2020 18:36:53 +0000 Subject: [PATCH 071/119] More python lint issues. --- python-package/setup.py | 2 +- tests/python_package_test/test_engine.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/python-package/setup.py b/python-package/setup.py index eca56783713..1e0500f180c 100644 --- a/python-package/setup.py +++ b/python-package/setup.py @@ -225,7 +225,7 @@ def initialize_options(self): self.openmp_library = None self.mpi = 0 self.hdfs = 0 - #self.precompile = 0 #TODO: revert this + # self.precompile = 0 # TODO: revert this self.precompile = 1 self.nomp = 0 self.bit32 = 0 diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 791ef94a4be..b5de6a9a4c7 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1429,6 +1429,7 @@ def test_metrics(self): def get_cv_result(params=params_obj_verbose, **kwargs): return lgb.cv(params, lgb_train, num_boost_round=2, verbose_eval=False, **kwargs) + def train_booster(params=params_obj_verbose, **kwargs): lgb.train(params, lgb_train, num_boost_round=2, From c41771585212b2535673b96f3024633224a9e97d Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 15 Jun 2020 21:35:40 +0000 Subject: [PATCH 072/119] Big lint cleanup - more to come. 
--- include/LightGBM/application.h | 2 +- include/LightGBM/cuda/cuda_utils.h | 14 +- include/LightGBM/cuda/vector_cudahost.h | 71 ++- include/LightGBM/tree_learner.h | 2 +- include/LightGBM/utils/common.h | 2 - src/application/application.cpp | 11 +- src/boosting/gbdt.cpp | 20 +- src/boosting/gbdt.h | 10 +- src/c_api.cpp | 13 +- src/io/config.cpp | 2 +- src/io/dataset.cpp | 8 +- src/io/dense_bin.hpp | 18 +- src/treelearner/cuda_kernel_launcher.cu | 4 +- src/treelearner/cuda_kernel_launcher.h | 50 +- src/treelearner/cuda_tree_learner.cpp | 257 +++++----- src/treelearner/cuda_tree_learner.h | 484 +++++++++--------- .../data_parallel_tree_learner.cpp | 6 +- .../feature_parallel_tree_learner.cpp | 6 +- src/treelearner/gpu_tree_learner.cpp | 2 +- src/treelearner/gpu_tree_learner.h | 2 +- .../kernels/histogram_16_64_256.cu | 32 +- .../kernels/histogram_16_64_256.hu | 4 +- src/treelearner/parallel_tree_learner.h | 6 +- src/treelearner/serial_tree_learner.cpp | 8 +- src/treelearner/serial_tree_learner.h | 10 +- src/treelearner/tree_learner.cpp | 4 +- .../voting_parallel_tree_learner.cpp | 6 +- 27 files changed, 513 insertions(+), 541 deletions(-) diff --git a/include/LightGBM/application.h b/include/LightGBM/application.h index 911dedd7d94..7ce8956a555 100644 --- a/include/LightGBM/application.h +++ b/include/LightGBM/application.h @@ -38,7 +38,7 @@ class Application { // LGBM_CUDA /*! \brief call to get configuration */ - Config GetConfig() {return config_ ;} ; + Config GetConfig() {return config_ ;} private: /*! \brief Load parameters from command line and config file*/ diff --git a/include/LightGBM/cuda/cuda_utils.h b/include/LightGBM/cuda/cuda_utils.h index e57d3746a21..7ff7b28c8f1 100644 --- a/include/LightGBM/cuda/cuda_utils.h +++ b/include/LightGBM/cuda/cuda_utils.h @@ -5,7 +5,7 @@ #ifndef LGBM_CUDA_UTILS_H #define LGBM_CUDA_UTILS_H -//LGBM_CUDA +// LGBM_CUDA #ifdef USE_CUDA @@ -14,13 +14,11 @@ #include #define CUDASUCCESS_OR_FATAL(ans) { gpuAssert((ans), __FILE__, __LINE__); } -inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) - { - LightGBM::Log::Fatal("CUDA_RUNTIME: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } +inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) { + if (code != cudaSuccess) { + LightGBM::Log::Fatal("CUDA_RUNTIME: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } } #endif /* USE_CUDA */ diff --git a/include/LightGBM/cuda/vector_cudahost.h b/include/LightGBM/cuda/vector_cudahost.h index 61d6e464970..b964fa4ad1f 100644 --- a/include/LightGBM/cuda/vector_cudahost.h +++ b/include/LightGBM/cuda/vector_cudahost.h @@ -9,7 +9,7 @@ #include #include -//LGBM_CUDA +// LGBM_CUDA namespace LightGBM { @@ -22,60 +22,55 @@ namespace LightGBM { #define use_cuda_learner 2 class LGBM_config_ { - public: - static int current_device; // Default: lgbm_device_cpu - static int current_learner; // Default: use_cpu_learner + public: + static int current_device; // Default: lgbm_device_cpu + static int current_learner; // Default: use_cpu_learner }; -} // namespace LightGBM +} // namespace LightGBM template struct CHAllocator { - typedef T value_type; - CHAllocator() {} - template CHAllocator(const CHAllocator& other); - T* allocate(std::size_t n) - { - T* ptr; - if (n == 0) return NULL; - #ifdef USE_CUDA - if (LightGBM::LGBM_config_::current_device == lgbm_device_cuda){ - cudaError_t ret= cudaHostAlloc(&ptr, n*sizeof(T), 
cudaHostAllocPortable); - if (ret != cudaSuccess){ -fprintf(stderr, " TROUBLE: defaulting to malloc in CHAllocator!!!\n"); fflush(stderr); - ptr = (T*) malloc(n*sizeof(T)); - } - } - else{ - ptr = (T*) malloc(n*sizeof(T)); + typedef T value_type; + CHAllocator() {} + template CHAllocator(const CHAllocator& other); + T* allocate(std::size_t n) { + T* ptr; + if (n == 0) return NULL; + #ifdef USE_CUDA + if (LightGBM::LGBM_config_::current_device == lgbm_device_cuda) { + cudaError_t ret = cudaHostAlloc(&ptr, n*sizeof(T), cudaHostAllocPortable); + if (ret != cudaSuccess) { + fprintf(stderr, " TROUBLE: defaulting to malloc in CHAllocator!!!\n"); fflush(stderr); + ptr = reinterpret_cast(malloc(n*sizeof(T))); + } + } else { + ptr = reinterpret_cast(malloc(n*sizeof(T))); } #else - ptr = (T*) malloc(n*sizeof(T)); + ptr = reinterpret_cast(malloc(n*sizeof(T))); #endif - return ptr; - } + return ptr; + } - void deallocate(T* p, std::size_t n) - { + void deallocate(T* p, std::size_t n) { (void)n; // UNUSED - if (p==NULL) return; + if (p == NULL) return; #ifdef USE_CUDA - if (LightGBM::LGBM_config_::current_device == lgbm_device_cuda){ - cudaPointerAttributes attributes; - cudaPointerGetAttributes (&attributes, p); - if ((attributes.type == cudaMemoryTypeHost) && (attributes.devicePointer != NULL)){ - cudaFreeHost(p); - } - } - else{ + if (LightGBM::LGBM_config_::current_device == lgbm_device_cuda) { + cudaPointerAttributes attributes; + cudaPointerGetAttributes (&attributes, p); + if ((attributes.type == cudaMemoryTypeHost) && (attributes.devicePointer != NULL)) { + cudaFreeHost(p); + } + } else { free(p); } #else - free(p); + free(p); #endif } - }; template bool operator==(const CHAllocator&, const CHAllocator&); diff --git a/include/LightGBM/tree_learner.h b/include/LightGBM/tree_learner.h index 6c549a5ed71..2ea30ac63b2 100644 --- a/include/LightGBM/tree_learner.h +++ b/include/LightGBM/tree_learner.h @@ -57,7 +57,7 @@ class TreeLearner { * \param is_constant_hessian True if all hessians share the same value * \return A trained tree */ - virtual Tree* Train(const score_t* gradients, const score_t* hessians, bool is_constant_hessian, Json& forced_split_json) = 0; + virtual Tree* Train(const score_t* gradients, const score_t* hessians, bool is_constant_hessian, const Json& forced_split_json) = 0; /*! * \brief use an existing tree to fit the new gradients and hessians. 
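The CHAllocator template introduced in include/LightGBM/cuda/vector_cudahost.h above is a drop-in allocator for std::vector: when LGBM_config_::current_device selects the CUDA device it obtains pinned (page-locked) host memory via cudaHostAlloc(), and it falls back to plain malloc() otherwise, which is what allows the gradient and hessian buffers to be streamed to the GPU with cudaMemcpyAsync(). The following is a minimal usage sketch, not part of the patches; it assumes a USE_CUDA build with the patched headers on the include path and linking against lib_lightgbm (which defines the LGBM_config_ statics), and the buffer name and size are illustrative only.

    // Minimal sketch: std::vector backed by pinned host memory via CHAllocator.
    #include <cstddef>
    #include <vector>
    #include <LightGBM/cuda/vector_cudahost.h>  // CHAllocator, LGBM_config_, lgbm_device_cuda

    int main() {
      // Route allocations through cudaHostAlloc(); this static is defined inside lib_lightgbm.
      LightGBM::LGBM_config_::current_device = lgbm_device_cuda;

      const std::size_t num_data = 1024;  // illustrative size
      // Pinned host buffer; deallocate() later releases it with cudaFreeHost().
      std::vector<float, CHAllocator<float>> gradients(num_data, 0.0f);

      // gradients.data() can now be handed to cudaMemcpyAsync() for asynchronous host-to-device copies.
      return 0;
    }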
diff --git a/include/LightGBM/utils/common.h b/include/LightGBM/utils/common.h index 0e26ee84faa..bdc769e5222 100644 --- a/include/LightGBM/utils/common.h +++ b/include/LightGBM/utils/common.h @@ -30,8 +30,6 @@ #include #pragma intrinsic(_BitScanReverse) #endif -#include -#include #if defined(_MSC_VER) #include diff --git a/src/application/application.cpp b/src/application/application.cpp index 1b9eabf8a12..5c61b323654 100644 --- a/src/application/application.cpp +++ b/src/application/application.cpp @@ -43,17 +43,16 @@ Application::Application(int argc, char** argv) { Log::Fatal("No training/prediction data, application quit"); } -//LGBM_CUDA +// LGBM_CUDA #ifdef USE_CUDA - if (config_.device_type == std::string("cuda")){ - LightGBM::LGBM_config_::current_device=lgbm_device_cuda; + if (config_.device_type == std::string("cuda")) { + LightGBM::LGBM_config_::current_device = lgbm_device_cuda; config_.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ - if (config_.bagging_fraction == 1.0){config_.bagging_fraction = 0.8;} - if (config_.bagging_freq == 0) {config_.bagging_freq = 1;} + if (config_.bagging_fraction == 1.0) { config_.bagging_fraction = 0.8; } + if (config_.bagging_freq == 0) { config_.bagging_freq = 1; } } #endif - } Application::~Application() { diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 546da0c0cbe..d102f6aedc1 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -18,8 +18,8 @@ namespace LightGBM { #ifdef USE_CUDA -int LGBM_config_::current_device=lgbm_device_cpu; -int LGBM_config_::current_learner=use_cpu_learner; +int LGBM_config_::current_device = lgbm_device_cpu; +int LGBM_config_::current_learner = use_cpu_learner; #endif GBDT::GBDT() @@ -66,8 +66,8 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective // LGBM_CUDA #ifdef USE_CUDA if (config_->device_type == std::string("cuda")) { - // LGBM_config_::current_device=lgbm_device_cuda; moved to application.cpp - LGBM_config_::current_learner=use_cuda_learner; + // LGBM_config_::current_device = lgbm_device_cuda; moved to application.cpp + LGBM_config_::current_learner = use_cuda_learner; } #endif @@ -260,7 +260,7 @@ void GBDT::Bagging(int iter) { // set bagging data to tree learner if (!is_use_subset_) { tree_learner_->SetBaggingData(nullptr, bag_data_indices_.data(), bag_data_cnt_); - } else { // LGBM_CUDA + } else { // LGBM_CUDA // NEW get subset bool resized= tmp_subset_->ReSize(bag_data_cnt_); @@ -284,7 +284,7 @@ void GBDT::Train(int snapshot_freq, const std::string& model_output_path) { Common::FunctionTimer fun_timer("GBDT::Train", global_timer); bool is_finished = false; - //LGBM_CUDA + // LGBM_CUDA auto start_time = std::chrono::steady_clock::now(); for (int iter = 0; iter < config_->num_iterations && !is_finished; ++iter) { @@ -437,8 +437,8 @@ bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { #pragma omp parallel for schedule(static) // LGBM_CUDA for (int i = 0; i < bag_data_cnt_; ++i) { - tmp_grad[i] = grad[bag_data_indices_[i]]; // LGBM_CUDA - tmp_hess[i] = hess[bag_data_indices_[i]]; // LGBM_CUDA + tmp_grad[i] = grad[bag_data_indices_[i]]; // LGBM_CUDA + tmp_hess[i] = hess[bag_data_indices_[i]]; // LGBM_CUDA } } @@ -509,7 +509,7 @@ bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { - if (config_->device_type == std::string("cuda")){ //LGBM_CUDA + if (config_->device_type 
== std::string("cuda")) { // LGBM_CUDA return TrainOneIterCUDA(gradients, hessians); } @@ -965,7 +965,7 @@ void GBDT::ResetBaggingConfig(const Config* config, bool is_change_dataset) { } } else { bag_data_cnt_ = num_data_; - if (config_->device_type == std::string("cuda")){ // LGBM_CUDA + if (config_->device_type == std::string("cuda")) { // LGBM_CUDA if (tmp_subset_ == nullptr){ tmp_subset_.reset(new Dataset(bag_data_cnt_)); tmp_subset_->CopyFeatureMapperFrom(train_data_); diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index 02476f810a8..99bf64a6fb0 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -25,7 +25,7 @@ #include "score_updater.hpp" #ifdef USE_CUDA -#include //LGBM_CUDA +#include // LGBM_CUDA #endif namespace LightGBM { @@ -478,11 +478,11 @@ class GBDT : public GBDTBase { #ifdef USE_CUDA /*! \brief First order derivative of training data */ - std::vector> gradients_; // LGBM_CUDA - std::vector> tmp_gradients_; // LGBM_CUDA + std::vector> gradients_; // LGBM_CUDA + std::vector> tmp_gradients_; // LGBM_CUDA /*! \brief Second order derivative of training data */ - std::vector> hessians_; // LGBM_CUDA - std::vector> tmp_hessians_; // LGBM_CUDA + std::vector> hessians_; // LGBM_CUDA + std::vector> tmp_hessians_; // LGBM_CUDA #else /*! \brief First order derivative of training data */ std::vector> gradients_; diff --git a/src/c_api.cpp b/src/c_api.cpp index 38957a13fc2..0ce92342fb6 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -43,16 +43,15 @@ inline int LGBM_APIHandleException(const std::string& ex) { return -1; } -//LGBM_CUDA -inline void AdditionalConfig(Config *config) -{ +// LGBM_CUDA +inline void AdditionalConfig(Config *config) { #ifdef USE_CUDA - if (config->device_type == std::string("cuda")){ - LightGBM::LGBM_config_::current_device=lgbm_device_cuda; + if (config->device_type == std::string("cuda")) { + LightGBM::LGBM_config_::current_device = lgbm_device_cuda; config->is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ - if (config->bagging_fraction == 1.0){config->bagging_fraction = 0.8;} - if (config->bagging_freq == 0) {config->bagging_freq = 1;} + if (config->bagging_fraction == 1.0) { config->bagging_fraction = 0.8; } + if (config->bagging_freq == 0) { config->bagging_freq = 1; } } #else (void)(config); // UNUSED diff --git a/src/io/config.cpp b/src/io/config.cpp index 18c0562a676..ed643204c91 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -126,7 +126,7 @@ void GetDeviceType(const std::unordered_map& params, s *device_type = "cpu"; } else if (value == std::string("gpu")) { *device_type = "gpu"; - } else if (value == std::string("cuda")) { // LGBM_CUDA + } else if (value == std::string("cuda")) { // LGBM_CUDA *device_type = "cuda"; } else { Log::Fatal("Unknown device type %s", value.c_str()); diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 3d4e29be1fe..df18ef6f838 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -345,16 +345,16 @@ void Dataset::Construct(std::vector>* bin_mappers, } auto features_in_group = NoGroup(used_features); -//LGBM_CUDA +// LGBM_CUDA #ifdef USE_CUDA - if (io_config.device_type == std::string("cuda")){ - LightGBM::LGBM_config_::current_device=lgbm_device_cuda; + if (io_config.device_type == std::string("cuda")) { + LightGBM::LGBM_config_::current_device = lgbm_device_cuda; } #endif std::vector group_is_multi_val(used_features.size(), 0); if (io_config.enable_bundle && !used_features.empty()) { - bool lgbm_is_gpu_used = io_config.device_type == 
std::string("gpu") || io_config.device_type == std::string("cuda"); // LGBM_CUDA + bool lgbm_is_gpu_used = io_config.device_type == std::string("gpu") || io_config.device_type == std::string("cuda"); // LGBM_CUDA features_in_group = FastFeatureBundling(*bin_mappers, sample_non_zero_indices, sample_values, diff --git a/src/io/dense_bin.hpp b/src/io/dense_bin.hpp index 99feadf9f7f..89f29a99bdc 100644 --- a/src/io/dense_bin.hpp +++ b/src/io/dense_bin.hpp @@ -13,10 +13,10 @@ #include #ifdef USE_CUDA -#include // LGBM_CUDA +#include // LGBM_CUDA #endif -#include // LGBM_CUDA +#include // LGBM_CUDA namespace LightGBM { @@ -368,7 +368,7 @@ class DenseBin : public Bin { data_size_t num_data() const override { return num_data_; } - // LGBM_CUDA + // LGBM_CUDA void* get_data() override { return data_.data(); } void FinishLoad() override { @@ -464,16 +464,16 @@ class DenseBin : public Bin { DenseBin* Clone() override; private: - data_size_t num_data_; + data_size_t num_data_; #ifdef USE_CUDA - std::vector> data_; // LGBM_CUDA + std::vector> data_; // LGBM_CUDA #else - std::vector> data_; + std::vector> data_; #endif - std::vector buf_; + std::vector buf_; - DenseBin(const DenseBin& other) - : num_data_(other.num_data_), data_(other.data_) {} + DenseBin(const DenseBin& other) + : num_data_(other.num_data_), data_(other.data_) {} }; template diff --git a/src/treelearner/cuda_kernel_launcher.cu b/src/treelearner/cuda_kernel_launcher.cu index dad8b6c563b..6e3149dae06 100644 --- a/src/treelearner/cuda_kernel_launcher.cu +++ b/src/treelearner/cuda_kernel_launcher.cu @@ -8,7 +8,7 @@ using namespace LightGBM; void cuda_histogram( - int histogram_size, + int histogram_size, data_size_t leaf_num_data, data_size_t num_data, bool use_all_features, @@ -25,7 +25,7 @@ score_t arg6_const, char* arg7, volatile int* arg8, - void* arg9, + void* arg9, size_t exp_workgroups_per_feature) { if (histogram_size == 16) { diff --git a/src/treelearner/cuda_kernel_launcher.h b/src/treelearner/cuda_kernel_launcher.h index 1241a9cafb9..efe8e4b0d4a 100644 --- a/src/treelearner/cuda_kernel_launcher.h +++ b/src/treelearner/cuda_kernel_launcher.h @@ -1,9 +1,13 @@ +/*! + * Copyright (c) 2020 IBM Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + */ #ifndef LGBM_KERNEL_LAUNCHER #define LGBM_KERNEL_LAUNCHER #ifdef USE_CUDA // what should I include?? 
-#include "kernels/histogram_16_64_256.hu" // kernel, acc_type, data_size_t, uchar, score_t +#include "kernels/histogram_16_64_256.hu" // kernel, acc_type, data_size_t, uchar, score_t #include struct ThreadData { @@ -19,9 +23,9 @@ struct ThreadData { cudaStream_t stream; uint8_t* device_features; uint8_t* device_feature_masks; - //data_size_t num_data; + // data_size_t num_data; data_size_t* device_data_indices; - //data_size_t leaf_num_data; + // data_size_t leaf_num_data; score_t* device_gradients; score_t* device_hessians; score_t hessians_const; @@ -41,26 +45,26 @@ struct ThreadData { void cuda_histogram( - int histogram_size, - data_size_t leaf_num_data, - data_size_t num_data, - bool use_all_features, - bool is_constant_hessian, - int num_workgroups, - cudaStream_t stream, - uint8_t* arg0, - uint8_t* arg1, - data_size_t arg2, - data_size_t* arg3, - data_size_t arg4, - score_t* arg5, - score_t* arg6, - score_t arg6_const, - char* arg7, - volatile int* arg8, - void* arg9, - size_t exp_workgroups_per_feature); + int histogram_size, + data_size_t leaf_num_data, + data_size_t num_data, + bool use_all_features, + bool is_constant_hessian, + int num_workgroups, + cudaStream_t stream, + uint8_t* arg0, + uint8_t* arg1, + data_size_t arg2, + data_size_t* arg3, + data_size_t arg4, + score_t* arg5, + score_t* arg6, + score_t arg6_const, + char* arg7, + volatile int* arg8, + void* arg9, + size_t exp_workgroups_per_feature); -#endif //USE_CUDA +#endif // USE_CUDA #endif // LGBM_KERNEL_LAUNCHER diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index f45319ae818..0588ad14fe0 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -1,4 +1,13 @@ +/*! + * Copyright (c) 2020 IBM Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. 
+ */ #ifdef USE_CUDA +#include +#include + +#include + #include "cuda_tree_learner.h" #include "../io/dense_bin.hpp" @@ -6,20 +15,15 @@ #include #include -#include -#include - #include -#define cudaMemcpy_DEBUG 0 // 1: DEBUG cudaMemcpy -#define ResetTrainingData_DEBUG 0 // 1: Debug ResetTrainingData - -#include +#define cudaMemcpy_DEBUG 0 // 1: DEBUG cudaMemcpy +#define ResetTrainingData_DEBUG 0 // 1: Debug ResetTrainingData #define GPU_DEBUG 0 static void *launch_cuda_histogram(void *thread_data) { - ThreadData td = *(ThreadData*)thread_data; + ThreadData td = *(reinterpret_cast(thread_data)); int device_id = td.device_id; CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); @@ -54,7 +58,7 @@ CUDATreeLearner::CUDATreeLearner(const Config* config) :SerialTreeLearner(config) { use_bagging_ = false; nthreads_ = 0; - if(config->gpu_use_dp && USE_DP_FLOAT) Log::Info("LightGBM-CUDA using CUDA trainer with DP float!!"); + if (config->gpu_use_dp && USE_DP_FLOAT) Log::Info("LightGBM-CUDA using CUDA trainer with DP float!!"); else Log::Info("LightGBM-CUDA using CUDA trainer with SP float!!"); } @@ -75,7 +79,7 @@ void CUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, is_use_subset_ = is_use_subset; // Initialize GPU buffers and kernels & LGBM_CUDA: get device info - InitGPU(config_->num_gpu); // LGBM_CUDA + InitGPU(config_->num_gpu); // LGBM_CUDA } // some functions used for debugging the GPU histogram construction @@ -104,7 +108,7 @@ int CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int d int i; int retval = 0; printf("Comparing Histograms, feature_id = %d, size = %d\n", feature_id, (int) size); - if (dp_flag) { // double precision + if (dp_flag) { // double precision double af, bf; long long int ai, bi; for (i = 0; i < (int) size; ++i) { @@ -121,8 +125,7 @@ int CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int d printf("i = %5d, h1.hess %lld, h2.hess %lld\n", i, ai, bi); ++retval; } - } - else { + } else { af = GET_HESS(h1, i); bf = GET_HESS(h2, i); if (((std::fabs(af - bf))/af) >= 1e-6) { @@ -131,8 +134,7 @@ int CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int d } } } - } - else { // single precision + } else { // single precision float af, bf; int ai, bi; for (i = 0; i < (int) size; ++i) { @@ -149,8 +151,7 @@ int CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int d printf("i = %5d, h1.hess %d, h2.hess %d\n", i, ai, bi); ++retval; } - } - else { + } else { af = GET_HESS(h1, i); bf = GET_HESS(h2, i); if (((std::fabs(af - bf))/af) >= 1e-5) { @@ -199,7 +200,7 @@ void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featu std::vector num_gpu_workgroups; ThreadData *thread_data = (ThreadData*)malloc(sizeof(ThreadData) * num_gpu_); - for(int device_id = 0; device_id < num_gpu_; ++device_id) { + for (int device_id = 0; device_id < num_gpu_; ++device_id) { int num_gpu_feature_groups = num_gpu_feature_groups_[device_id]; int num_workgroups = (1 << exp_workgroups_per_feature) * num_gpu_feature_groups; num_gpu_workgroups.push_back(num_workgroups); @@ -213,8 +214,8 @@ void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featu num_workgroups, exp_workgroups_per_feature); } - for(int device_id = 0; device_id < num_gpu_; ++device_id) { - if (pthread_create(cpu_threads_[device_id], NULL, launch_cuda_histogram, (void *)(&thread_data[device_id]))){ + for (int device_id = 0; device_id < num_gpu_; ++device_id) { + if 
(pthread_create(cpu_threads_[device_id], NULL, launch_cuda_histogram, (void *)(&thread_data[device_id]))) { fprintf(stderr, "Error in creating threads. Exiting\n"); exit(0); } @@ -222,14 +223,14 @@ void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featu /* Wait for the threads to finish */ - for(int device_id = 0; device_id < num_gpu_; ++device_id) { - if (pthread_join(*(cpu_threads_[device_id]), NULL)){ + for (int device_id = 0; device_id < num_gpu_; ++device_id) { + if (pthread_join(*(cpu_threads_[device_id]), NULL)) { fprintf(stderr, "Error in joining threads. Exiting\n"); exit(0); } } - for(int device_id = 0; device_id < num_gpu_; ++device_id) { + for (int device_id = 0; device_id < num_gpu_; ++device_id) { // copy the results asynchronously. Size depends on if double precision is used @@ -247,7 +248,7 @@ void CUDATreeLearner::WaitAndGetHistograms(FeatureHistogram* leaf_histogram_arra HistType* hist_outputs = (HistType*) host_histogram_outputs_; #pragma omp parallel for schedule(static, num_gpu_) - for(int device_id = 0; device_id < num_gpu_; ++device_id) { + for (int device_id = 0; device_id < num_gpu_; ++device_id) { // auto start_time = std::chrono::steady_clock::now(); @@ -257,7 +258,7 @@ void CUDATreeLearner::WaitAndGetHistograms(FeatureHistogram* leaf_histogram_arra } #pragma omp parallel for schedule(static) - for(int i = 0; i < num_dense_feature_groups_; ++i) { + for (int i = 0; i < num_dense_feature_groups_; ++i) { if (!feature_masks_[i]) { continue; } @@ -286,13 +287,11 @@ void CUDATreeLearner::CountDenseFeatureGroups() { if (!num_dense_feature_groups_) { Log::Warning("GPU acceleration is disabled because no non-trival dense features can be found"); } - } // LGBM_CUDA void CUDATreeLearner::prevAllocateGPUMemory() { - // how many feature-group tuples we have // leave some safe margin for prefetching // 256 work-items per workgroup. Each work-item prefetches one tuple for that feature @@ -317,7 +316,7 @@ void CUDATreeLearner::prevAllocateGPUMemory() { int offset = 0; - for(int i = 0; i < num_gpu_; ++i) { + for (int i = 0; i < num_gpu_; ++i) { offset_gpu_feature_groups_.at(i) = offset; num_gpu_feature_groups_.at(i) = (i < remain_features)? num_features_per_gpu + 1 : num_features_per_gpu; offset += num_gpu_feature_groups_.at(i); @@ -329,7 +328,7 @@ void CUDATreeLearner::prevAllocateGPUMemory() { cudaPointerAttributes attributes; cudaPointerGetAttributes (&attributes, feature_masks_.data()); - if ((attributes.type == cudaMemoryTypeHost) && (attributes.devicePointer != NULL)){ + if ((attributes.type == cudaMemoryTypeHost) && (attributes.devicePointer != NULL)) { CUDASUCCESS_OR_FATAL(cudaHostUnregister(feature_masks_.data())); } } @@ -343,7 +342,7 @@ void CUDATreeLearner::prevAllocateGPUMemory() { memset(ptr_pinned_feature_masks_, 0, num_dense_feature_groups_); // histogram bin entry size depends on the precision (single/double) - hist_bin_entry_sz_ = 2 * (config_->gpu_use_dp ? sizeof(hist_t) : sizeof(gpu_hist_t)); // two elements in this "size" + hist_bin_entry_sz_ = 2 * (config_->gpu_use_dp ? 
sizeof(hist_t) : sizeof(gpu_hist_t)); // two elements in this "size" // host_size histogram outputs // host_histogram_outputs_ = malloc(num_dense_feature_groups_ * device_bin_size_ * hist_bin_entry_sz_); @@ -360,7 +359,7 @@ void CUDATreeLearner::AllocateGPUMemory() { #pragma omp parallel for schedule(static, num_gpu_) - for(int device_id = 0; device_id < num_gpu_; ++device_id) { + for (int device_id = 0; device_id < num_gpu_; ++device_id) { // do nothing it there is no gpu feature int num_gpu_feature_groups = num_gpu_feature_groups_[device_id]; if (num_gpu_feature_groups) { @@ -377,15 +376,15 @@ void CUDATreeLearner::AllocateGPUMemory() { // allocate space for gradients and hessians on device // we will copy gradients and hessians in after ordered_gradients_ and ordered_hessians_ are constructed - if (device_gradients_[device_id] != NULL){ + if (device_gradients_[device_id] != NULL) { CUDASUCCESS_OR_FATAL(cudaFree(device_gradients_[device_id])); } - if (device_hessians_[device_id] != NULL){ + if (device_hessians_[device_id] != NULL) { CUDASUCCESS_OR_FATAL(cudaFree(device_hessians_[device_id])); } - if (device_feature_masks_[device_id] != NULL){ + if (device_feature_masks_[device_id] != NULL) { CUDASUCCESS_OR_FATAL(cudaFree(device_feature_masks_[device_id])); } @@ -396,7 +395,7 @@ void CUDATreeLearner::AllocateGPUMemory() { // copy indices to the device - if (device_data_indices_[device_id] != NULL){ + if (device_data_indices_[device_id] != NULL) { CUDASUCCESS_OR_FATAL(cudaFree(device_data_indices_[device_id])); } @@ -427,7 +426,6 @@ void CUDATreeLearner::AllocateGPUMemory() { CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_histogram_outputs_[device_id]), (size_t) num_gpu_feature_groups * device_bin_size_ * hist_bin_entry_sz_)); } } - } void CUDATreeLearner::ResetGPUMemory() { @@ -435,14 +433,13 @@ void CUDATreeLearner::ResetGPUMemory() { // clear sparse/dense maps dense_feature_group_map_.clear(); sparse_feature_group_map_.clear(); - } // LGBM_CUDA void CUDATreeLearner::copyDenseFeature() { - if (num_feature_groups_ == 0){ - LGBM_config_::current_learner=use_cpu_learner; + if (num_feature_groups_ == 0) { + LGBM_config_::current_learner = use_cpu_learner; return; } @@ -462,29 +459,27 @@ void CUDATreeLearner::copyDenseFeature() { dense_feature_group_map_.push_back(i); auto sizes_in_byte = train_data_->FeatureGroupSizesInByte(i); void* tmp_data = train_data_->FeatureGroupData(i); - Log::Debug("Started copying dense features from CPU to GPU - 2"); + Log::Debug("Started copying dense features from CPU to GPU - 2"); CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(&device_features[copied_feature * num_data_], tmp_data, sizes_in_byte, cudaMemcpyHostToDevice, stream_[device_id])); - Log::Debug("Started copying dense features from CPU to GPU - 3"); + Log::Debug("Started copying dense features from CPU to GPU - 3"); copied_feature++; // reset device info - if(copied_feature == (size_t) num_gpu_feature_groups_[device_id]) { + if (copied_feature == (size_t) num_gpu_feature_groups_[device_id]) { CUDASUCCESS_OR_FATAL(cudaEventRecord(features_future_[device_id], stream_[device_id])); device_id += 1; copied_feature = 0; - if(device_id < num_gpu_) { + if (device_id < num_gpu_) { device_features = device_features_[device_id]; CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); } } - } - else { + } else { sparse_feature_group_map_.push_back(i); } } // data transfer time // LGBM_CUDA: async copy, so it is not the real data transfer time // std::chrono::duration end_time = std::chrono::steady_clock::now() - start_time; - } @@ 
-513,35 +508,31 @@ void CUDATreeLearner::InitGPU(int num_gpu) { #endif if (max_num_bin_ <= 16) { - device_bin_size_ = 16; //LGBM_CUDA + device_bin_size_ = 16; // LGBM_CUDA histogram_size_ = 16; - dword_features_ = 1; // LGBM_CUDA - } - else if (max_num_bin_ <= 64) { - device_bin_size_ = 64; //LGBM_CUDA + dword_features_ = 1; // LGBM_CUDA + } else if (max_num_bin_ <= 64) { + device_bin_size_ = 64; // LGBM_CUDA histogram_size_ = 64; - dword_features_ = 1; // LGBM_CUDA - } - else if ( max_num_bin_ <= 256) { + dword_features_ = 1; // LGBM_CUDA + } else if ( max_num_bin_ <= 256) { Log::Debug("device_bin_size_ = 256"); device_bin_size_ = 256; histogram_size_ = 256; - dword_features_ = 1; // LGBM_CUDA - } - else { + dword_features_ = 1; // LGBM_CUDA + } else { Log::Fatal("bin size %d cannot run on GPU", max_num_bin_); } - if(max_num_bin_ == 65) { + if (max_num_bin_ == 65) { Log::Warning("Setting max_bin to 63 is sugguested for best performance"); } - if(max_num_bin_ == 17) { + if (max_num_bin_ == 17) { Log::Warning("Setting max_bin to 15 is sugguested for best performance"); } // LGBM_CUDA: get num_dense_feature_groups_ CountDenseFeatureGroups(); - if (num_gpu > num_dense_feature_groups_) num_gpu = num_dense_feature_groups_; // LGBM_CUDA: initialize GPU @@ -552,7 +543,7 @@ void CUDATreeLearner::InitGPU(int num_gpu) { // LGBM_CUDA: set cpu threads cpu_threads_ = (pthread_t **)malloc(sizeof(pthread_t *)*num_gpu_); - for(int device_id = 0; device_id < num_gpu_; ++device_id) { + for (int device_id = 0; device_id < num_gpu_; ++device_id) { cpu_threads_[device_id] = (pthread_t *)malloc(sizeof(pthread_t)); } @@ -582,7 +573,7 @@ void CUDATreeLearner::InitGPU(int num_gpu) { kernel_input_wait_time_.resize(num_gpu_, std::chrono::milliseconds(0)); //kernel_output_wait_time_.resize(num_gpu_, std::chrono::milliseconds(0)); - for(int i = 0; i < num_gpu_; ++i) { + for (int i = 0; i < num_gpu_; ++i) { CUDASUCCESS_OR_FATAL(cudaSetDevice(i)); CUDASUCCESS_OR_FATAL(cudaStreamCreate(&(stream_[i]))); CUDASUCCESS_OR_FATAL(cudaEventCreate(&(hessians_future_[i]))); @@ -603,13 +594,12 @@ void CUDATreeLearner::InitGPU(int num_gpu) { if (!is_use_subset_) { Log::Debug("copyDenseFeature at the initialization\n"); - copyDenseFeature(); // LGBM_CUDA + copyDenseFeature(); // LGBM_CUDA } - } Tree* CUDATreeLearner::Train(const score_t* gradients, const score_t *hessians, - bool is_constant_hessian, Json& forced_split_json) { + bool is_constant_hessian, const Json& forced_split_json) { // check if we need to recompile the GPU kernel (is_constant_hessian changed) // this should rarely occur @@ -692,47 +682,47 @@ void CUDATreeLearner::BeforeTrain() { // Copy initial full hessians and gradients to GPU. // We start copying as early as possible, instead of at ConstructHistogram(). 
- if ((hessians_ != NULL) && (gradients_ != NULL)){ - if (!use_bagging_ && num_dense_feature_groups_) { + if ((hessians_ != NULL) && (gradients_ != NULL)) { + if (!use_bagging_ && num_dense_feature_groups_) { - Log::Debug("CudaTreeLearner::BeforeTrain() No baggings, dense_feature_groups_=%d", num_dense_feature_groups_); + Log::Debug("CudaTreeLearner::BeforeTrain() No baggings, dense_feature_groups_=%d", num_dense_feature_groups_); - for(int device_id = 0; device_id < num_gpu_; ++device_id) { - if (!is_constant_hessian_) { - Log::Debug("CUDATreeLearner::BeforeTrain(): Starting hessians_ -> device_hessians_"); + for (int device_id = 0; device_id < num_gpu_; ++device_id) { + if (!is_constant_hessian_) { + Log::Debug("CUDATreeLearner::BeforeTrain(): Starting hessians_ -> device_hessians_"); - #if cudaMemcpy_DEBUG == 1 // LGBM_CUDA - auto start_device_hessians_time = std::chrono::steady_clock::now(); - #endif + #if cudaMemcpy_DEBUG == 1 // LGBM_CUDA + auto start_device_hessians_time = std::chrono::steady_clock::now(); + #endif - //const data_size_t* indices = data_partition_->indices(); - //data_size_t cnt = data_partition_->leaf_count(0); + // const data_size_t* indices = data_partition_->indices(); + // data_size_t cnt = data_partition_->leaf_count(0); - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_hessians_[device_id], hessians_, num_data_*sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_hessians_[device_id], hessians_, num_data_*sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); - CUDASUCCESS_OR_FATAL(cudaEventRecord(hessians_future_[device_id], stream_[device_id])); + CUDASUCCESS_OR_FATAL(cudaEventRecord(hessians_future_[device_id], stream_[device_id])); - #if cudaMemcpy_DEBUG == 1 // LGBM_CUDA - device_hessians_time = std::chrono::steady_clock::now() - start_device_hessians_time; - #endif + #if cudaMemcpy_DEBUG == 1 // LGBM_CUDA + device_hessians_time = std::chrono::steady_clock::now() - start_device_hessians_time; + #endif - Log::Debug("queued copy of device_hessians_"); - } + Log::Debug("queued copy of device_hessians_"); + } - #if cudaMemcpy_DEBUG == 1 // LGBM_CUDA - auto start_device_gradients_time = std::chrono::steady_clock::now(); - #endif + #if cudaMemcpy_DEBUG == 1 // LGBM_CUDA + auto start_device_gradients_time = std::chrono::steady_clock::now(); + #endif - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_gradients_[device_id], gradients_, num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); - CUDASUCCESS_OR_FATAL(cudaEventRecord(gradients_future_[device_id], stream_[device_id])); + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_gradients_[device_id], gradients_, num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); + CUDASUCCESS_OR_FATAL(cudaEventRecord(gradients_future_[device_id], stream_[device_id])); - #if cudaMemcpy_DEBUG == 1 // LGBM_CUDA - device_gradients_time = std::chrono::steady_clock::now() - start_device_gradients_time; - #endif + #if cudaMemcpy_DEBUG == 1 // LGBM_CUDA + device_gradients_time = std::chrono::steady_clock::now() - start_device_gradients_time; + #endif - Log::Debug("CUDATreeLearner::BeforeTrain: issued gradients_ -> device_gradients_"); - } - } + Log::Debug("CUDATreeLearner::BeforeTrain: issued gradients_ -> device_gradients_"); + } + } } #if 0 @@ -740,33 +730,33 @@ void CUDATreeLearner::BeforeTrain() { #endif // use bagging - if ((hessians_ != NULL) && (gradients_ != NULL)){ - if (data_partition_->leaf_count(0) != num_data_ && 
num_dense_feature_groups_) { + if ((hessians_ != NULL) && (gradients_ != NULL)) { + if (data_partition_->leaf_count(0) != num_data_ && num_dense_feature_groups_) { - // On GPU, we start copying indices, gradients and hessians now, instead at ConstructHistogram() - // copy used gradients and hessians to ordered buffer + // On GPU, we start copying indices, gradients and hessians now, instead at ConstructHistogram() + // copy used gradients and hessians to ordered buffer - const data_size_t* indices = data_partition_->indices(); - data_size_t cnt = data_partition_->leaf_count(0); + const data_size_t* indices = data_partition_->indices(); + data_size_t cnt = data_partition_->leaf_count(0); - // transfer the indices to GPU - for(int device_id = 0; device_id < num_gpu_; ++device_id) { + // transfer the indices to GPU + for (int device_id = 0; device_id < num_gpu_; ++device_id) { - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_data_indices_[device_id], indices, cnt * sizeof(*indices), cudaMemcpyHostToDevice, stream_[device_id])); - CUDASUCCESS_OR_FATAL(cudaEventRecord(indices_future_[device_id], stream_[device_id])); + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_data_indices_[device_id], indices, cnt * sizeof(*indices), cudaMemcpyHostToDevice, stream_[device_id])); + CUDASUCCESS_OR_FATAL(cudaEventRecord(indices_future_[device_id], stream_[device_id])); - if (!is_constant_hessian_) { + if (!is_constant_hessian_) { - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_hessians_[device_id], (void *) &(hessians_[0]), num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); - CUDASUCCESS_OR_FATAL(cudaEventRecord(hessians_future_[device_id], stream_[device_id])); + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_hessians_[device_id], (void *) &(hessians_[0]), num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); + CUDASUCCESS_OR_FATAL(cudaEventRecord(hessians_future_[device_id], stream_[device_id])); - } + } - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_gradients_[device_id], (void *) &(gradients_[0]), num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); - CUDASUCCESS_OR_FATAL(cudaEventRecord(gradients_future_[device_id], stream_[device_id])); + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_gradients_[device_id], (void *) &(gradients_[0]), num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); + CUDASUCCESS_OR_FATAL(cudaEventRecord(gradients_future_[device_id], stream_[device_id])); + } } } - } } bool CUDATreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) { @@ -798,7 +788,7 @@ bool CUDATreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int r #if GPU_DEBUG >= 2 #endif - for(int device_id = 0; device_id < num_gpu_; ++device_id) { + for (int device_id = 0; device_id < num_gpu_; ++device_id) { CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_data_indices_[device_id], &indices[begin], (end-begin) * sizeof(data_size_t), cudaMemcpyHostToDevice, stream_[device_id])); CUDASUCCESS_OR_FATAL(cudaEventRecord(indices_future_[device_id], stream_[device_id])); } @@ -813,12 +803,10 @@ bool CUDATreeLearner::ConstructGPUHistogramsAsync( const std::vector& is_feature_used, const data_size_t* data_indices, data_size_t num_data) { - if (num_data <= 0) { return false; } - // do nothing if no features can be processed on GPU if (!num_dense_feature_groups_) { Log::Debug("no dense feature groups, returning"); @@ -828,7 +816,7 @@ bool CUDATreeLearner::ConstructGPUHistogramsAsync( // copy data indices if it is not null if 
(data_indices != nullptr && num_data != num_data_) { - for(int device_id = 0; device_id < num_gpu_; ++device_id) { + for (int device_id = 0; device_id < num_gpu_; ++device_id) { CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_data_indices_[device_id], data_indices, num_data * sizeof(data_size_t), cudaMemcpyHostToDevice, stream_[device_id])); CUDASUCCESS_OR_FATAL(cudaEventRecord(indices_future_[device_id], stream_[device_id])); @@ -841,9 +829,9 @@ bool CUDATreeLearner::ConstructGPUHistogramsAsync( #pragma omp parallel for schedule(static,1024) if (num_features_ >= 2048) for (int i = 0; i < num_features_; ++i) { - if(is_feature_used[i]) { - int feature_group = train_data_->Feature2Group(i); // LGBM_CUDA - is_feature_group_used[feature_group] = (train_data_->FeatureGroupNumBin(feature_group)<=16)? 2 : 1; // LGBM_CUDA + if (is_feature_used[i]) { + int feature_group = train_data_->Feature2Group(i); // LGBM_CUDA + is_feature_group_used[feature_group] = (train_data_->FeatureGroupNumBin(feature_group)<=16)? 2 : 1; // LGBM_CUDA } } @@ -855,8 +843,7 @@ bool CUDATreeLearner::ConstructGPUHistogramsAsync( //feature_masks_[i] = 1; feature_masks_[i] = is_feature_group_used[dense_feature_group_map_[i]]; ++used_dense_feature_groups; - } - else { + } else { feature_masks_[i] = 0; } } @@ -872,11 +859,11 @@ bool CUDATreeLearner::ConstructGPUHistogramsAsync( // LGBM_CUDA We now copy even if all features are used. - #pragma omp parallel for schedule(static, num_gpu_) - for(int device_id = 0; device_id < num_gpu_; ++device_id) { - int offset = offset_gpu_feature_groups_[device_id]; - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_feature_masks_[device_id], ptr_pinned_feature_masks_ + offset, num_gpu_feature_groups_[device_id] , cudaMemcpyHostToDevice, stream_[device_id])); - } + #pragma omp parallel for schedule(static, num_gpu_) + for (int device_id = 0; device_id < num_gpu_; ++device_id) { + int offset = offset_gpu_feature_groups_[device_id]; + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_feature_masks_[device_id], ptr_pinned_feature_masks_ + offset, num_gpu_feature_groups_[device_id] , cudaMemcpyHostToDevice, stream_[device_id])); + } // All data have been prepared, now run the GPU kernel @@ -902,8 +889,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ if (train_data_->IsMultiGroup(train_data_->Feature2Group(feature_index))) { is_sparse_feature_used[feature_index] = 1; num_sparse_features++; - } - else { + } else { is_dense_feature_used[feature_index] = 1; num_dense_features++; } @@ -916,7 +902,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ int exp_workgroups_per_feature = GetNumWorkgroupsPerFeature(smaller_leaf_splits_->num_data_in_leaf()); // if the workgroup per feature is 1 (2^0), return as the work is too small for a GPU - if (exp_workgroups_per_feature == 0){ + if (exp_workgroups_per_feature == 0) { return SerialTreeLearner::ConstructHistograms(is_feature_used, use_subtract); } @@ -926,7 +912,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ // then construct sparse features on CPU // We set data_indices to null to avoid rebuilding ordered gradients/hessians - if (num_sparse_features > 0){ + if (num_sparse_features > 0) { train_data_->ConstructHistograms(is_sparse_feature_used, smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(), gradients_, hessians_, @@ -940,8 +926,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ if (config_->gpu_use_dp) { // use double precision 
WaitAndGetHistograms(smaller_leaf_histogram_array_); - } - else { + } else { // use single precision WaitAndGetHistograms(smaller_leaf_histogram_array_); } @@ -1011,13 +996,12 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ retval = CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index, config_->gpu_use_dp, is_constant_hessian_); if (num_data == num_data_) { printf("CompareHistograms reports %d errors\n", retval); - } - else { + } else { printf("CompareHistograms reports %d errors\n", retval); } std::copy(gpu_histogram, gpu_histogram + size * 2, current_histogram); delete [] gpu_histogram; - //break; // LGBM_CUDA: see only first feature info + //break; // LGBM_CUDA: see only first feature info } printf("End Comparing Histogram between GPU and CPU\n"); fflush(stderr); @@ -1037,7 +1021,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ // then construct sparse features on CPU // We set data_indices to null to avoid rebuilding ordered gradients/hessians - if (num_sparse_features > 0){ + if (num_sparse_features > 0) { //train_data_->ConstructHistograms(is_sparse_feature_used, // nullptr, larger_leaf_splits_->num_data_in_leaf(), // larger_leaf_splits_->leaf_index(), @@ -1058,8 +1042,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ if (config_->gpu_use_dp) { // use double precision WaitAndGetHistograms(larger_leaf_histogram_array_); - } - else { + } else { // use single precision WaitAndGetHistograms(larger_leaf_histogram_array_); } diff --git a/src/treelearner/cuda_tree_learner.h b/src/treelearner/cuda_tree_learner.h index 7b256345c82..384ec57f66a 100644 --- a/src/treelearner/cuda_tree_learner.h +++ b/src/treelearner/cuda_tree_learner.h @@ -2,6 +2,12 @@ #ifndef LGBM_TREELEARNER_CUDA_TREE_LEARNER_H_ #define LGBM_TREELEARNET_CUDA_TREE_LEARNER_H_ +#include +#include +#include +#include +#include + #include #include #include @@ -13,12 +19,6 @@ #include "split_info.hpp" #include "leaf_splits.hpp" -#include -#include -#include -#include -#include - #ifdef USE_CUDA #include @@ -34,261 +34,257 @@ namespace LightGBM { * \brief CUDA-based parallel learning algorithm. 
*/ class CUDATreeLearner: public SerialTreeLearner { -public: - explicit CUDATreeLearner(const Config* tree_config); - ~CUDATreeLearner(); - // LGBM_CUDA: is_use_subset is used by CUDA only - void Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) override; - void ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) override; - Tree* Train(const score_t* gradients, const score_t *hessians, - bool is_constant_hessian, Json& forced_split_json) override; + public: + explicit CUDATreeLearner(const Config* tree_config); + ~CUDATreeLearner(); + // LGBM_CUDA: is_use_subset is used by CUDA only + void Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) override; + void ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) override; + Tree* Train(const score_t* gradients, const score_t *hessians, + bool is_constant_hessian, const Json& forced_split_json) override; - void SetBaggingData(const Dataset* subset, const data_size_t* used_indices, data_size_t num_data) override { - SerialTreeLearner::SetBaggingData(subset, used_indices, num_data); - // determine if we are using bagging before we construct the data partition - // thus we can start data movement to GPU earlier - if (subset == nullptr && used_indices != nullptr) { - if (num_data != num_data_) { - use_bagging_ = true; - return; + void SetBaggingData(const Dataset* subset, const data_size_t* used_indices, data_size_t num_data) override { + SerialTreeLearner::SetBaggingData(subset, used_indices, num_data); + // determine if we are using bagging before we construct the data partition + // thus we can start data movement to GPU earlier + if (subset == nullptr && used_indices != nullptr) { + if (num_data != num_data_) { + use_bagging_ = true; + return; + } } + use_bagging_ = false; } - use_bagging_ = false; - } -protected: - void BeforeTrain() override; - bool BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) override; - void FindBestSplits() override; - void Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) override; - void ConstructHistograms(const std::vector& is_feature_used, bool use_subtract) override; -private: - /*! \brief 4-byte feature tuple used by GPU kernels */ - //struct Feature4 { - // uint8_t s[4]; - //}; + protected: + void BeforeTrain() override; + bool BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) override; + void FindBestSplits() override; + void Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) override; + void ConstructHistograms(const std::vector& is_feature_used, bool use_subtract) override; + private: + /*! \brief 4-byte feature tuple used by GPU kernels */ + //struct Feature4 { + // uint8_t s[4]; + //}; - typedef float gpu_hist_t; + typedef float gpu_hist_t; - /*! - * \brief Find the best number of workgroups processing one feature for maximizing efficiency - * \param leaf_num_data The number of data examples on the current leaf being processed - * \return Log2 of the best number for workgroups per feature, in range 0...kMaxLogWorkgroupsPerFeature - */ - int GetNumWorkgroupsPerFeature(data_size_t leaf_num_data); + /*! 
+ * \brief Find the best number of workgroups processing one feature for maximizing efficiency + * \param leaf_num_data The number of data examples on the current leaf being processed + * \return Log2 of the best number for workgroups per feature, in range 0...kMaxLogWorkgroupsPerFeature + */ + int GetNumWorkgroupsPerFeature(data_size_t leaf_num_data); - /*! - * \brief Initialize GPU device - * \LGBM_CUDA: param num_gpu: number of maximum gpus - */ - void InitGPU(int num_gpu); + /*! + * \brief Initialize GPU device + * \LGBM_CUDA: param num_gpu: number of maximum gpus + */ + void InitGPU(int num_gpu); - /*! - * \brief Allocate memory for GPU computation // LGBM_CUDA: alloc only - */ - void CountDenseFeatureGroups(); // compute num_dense_feature_group - void prevAllocateGPUMemory(); // compute CPU-side param calculation & Pin HostMemory - void AllocateGPUMemory(); + /*! + * \brief Allocate memory for GPU computation // LGBM_CUDA: alloc only + */ + void CountDenseFeatureGroups(); // compute num_dense_feature_group + void prevAllocateGPUMemory(); // compute CPU-side param calculation & Pin HostMemory + void AllocateGPUMemory(); - /*! - * \ LGBM_CUDA: ResetGPUMemory - */ - void ResetGPUMemory(); + /*! + * \ LGBM_CUDA: ResetGPUMemory + */ + void ResetGPUMemory(); - /*! - * \ LGBM_CUDA: copy dense feature from CPU to GPU - */ - void copyDenseFeature(); + /*! + * \ LGBM_CUDA: copy dense feature from CPU to GPU + */ + void copyDenseFeature(); - - /*! - * \brief Compute GPU feature histogram for the current leaf. - * Indices, gradients and hessians have been copied to the device. - * \param leaf_num_data Number of data on current leaf - * \param use_all_features Set to true to not use feature masks, with a faster kernel - */ - void GPUHistogram(data_size_t leaf_num_data, bool use_all_features); + /*! + * \brief Compute GPU feature histogram for the current leaf. + * Indices, gradients and hessians have been copied to the device. 
+ * \param leaf_num_data Number of data on current leaf + * \param use_all_features Set to true to not use feature masks, with a faster kernel + */ + void GPUHistogram(data_size_t leaf_num_data, bool use_all_features); - void SetThreadData(ThreadData* thread_data, int device_id, int histogram_size, - int leaf_num_data, bool use_all_features, - int num_workgroups, int exp_workgroups_per_feature) { - ThreadData* td = &thread_data[device_id]; - td->device_id = device_id; - td->histogram_size = histogram_size; - td->leaf_num_data = leaf_num_data; - td->num_data = num_data_; - td->use_all_features = use_all_features; - td->is_constant_hessian = is_constant_hessian_; - td->num_workgroups = num_workgroups; - td->stream = stream_[device_id]; - td->device_features = device_features_[device_id]; - td->device_feature_masks = reinterpret_cast(device_feature_masks_[device_id]); - td->device_data_indices = reinterpret_cast(device_data_indices_[device_id]); - td->device_gradients = device_gradients_[device_id]; - td->device_hessians = device_hessians_[device_id]; - td->hessians_const = hessians_[0]; - td->device_subhistograms = device_subhistograms_[device_id]; - td->sync_counters = sync_counters_[device_id]; - td->device_histogram_outputs= device_histogram_outputs_[device_id]; - td->exp_workgroups_per_feature = exp_workgroups_per_feature; + void SetThreadData(ThreadData* thread_data, int device_id, int histogram_size, + int leaf_num_data, bool use_all_features, + int num_workgroups, int exp_workgroups_per_feature) { + ThreadData* td = &thread_data[device_id]; + td->device_id = device_id; + td->histogram_size = histogram_size; + td->leaf_num_data = leaf_num_data; + td->num_data = num_data_; + td->use_all_features = use_all_features; + td->is_constant_hessian = is_constant_hessian_; + td->num_workgroups = num_workgroups; + td->stream = stream_[device_id]; + td->device_features = device_features_[device_id]; + td->device_feature_masks = reinterpret_cast(device_feature_masks_[device_id]); + td->device_data_indices = reinterpret_cast(device_data_indices_[device_id]); + td->device_gradients = device_gradients_[device_id]; + td->device_hessians = device_hessians_[device_id]; + td->hessians_const = hessians_[0]; + td->device_subhistograms = device_subhistograms_[device_id]; + td->sync_counters = sync_counters_[device_id]; + td->device_histogram_outputs = device_histogram_outputs_[device_id]; + td->exp_workgroups_per_feature = exp_workgroups_per_feature; - td->kernel_start = &(kernel_start_[device_id]); - td->kernel_wait_obj = &(kernel_wait_obj_[device_id]); - td->kernel_input_wait_time = &(kernel_input_wait_time_[device_id]); + td->kernel_start = &(kernel_start_[device_id]); + td->kernel_wait_obj = &(kernel_wait_obj_[device_id]); + td->kernel_input_wait_time = &(kernel_input_wait_time_[device_id]); - size_t output_size = num_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_; - size_t host_output_offset = offset_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_; - td->output_size = output_size; - td->host_histogram_output = (char*)host_histogram_outputs_ + host_output_offset; - td->histograms_wait_obj = &(histograms_wait_obj_[device_id]); - } + size_t output_size = num_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_; + size_t host_output_offset = offset_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_; + td->output_size = output_size; + 
td->host_histogram_output = (char*)host_histogram_outputs_ + host_output_offset; + td->histograms_wait_obj = &(histograms_wait_obj_[device_id]); + } + // LGBM_CUDA: thread work + // typedef void * (*THREADFUNCPTR)(void *); + // void* launch_gpu_kernel(void *td); - // LGBM_CUDA: thread work - //typedef void * (*THREADFUNCPTR)(void *); - //void* launch_gpu_kernel(void *td); - - /*! - * \brief Wait for GPU kernel execution and read histogram - * \param histograms Destination of histogram results from GPU. - */ - template - void WaitAndGetHistograms(FeatureHistogram* leaf_histogram_array); - - /*! - * \brief Construct GPU histogram asynchronously. - * Interface is similar to Dataset::ConstructHistograms(). - * \param is_feature_used A predicate vector for enabling each feature - * \param data_indices Array of data example IDs to be included in histogram, will be copied to GPU. - * Set to nullptr to skip copy to GPU. - * \param num_data Number of data examples to be included in histogram - * \param gradients Array of gradients for all examples. - * \param hessians Array of hessians for all examples. - * \param ordered_gradients Ordered gradients will be generated and copied to GPU when gradients is not nullptr, - * Set gradients to nullptr to skip copy to GPU. - * \param ordered_hessians Ordered hessians will be generated and copied to GPU when hessians is not nullptr, - * Set hessians to nullptr to skip copy to GPU. - * \return true if GPU kernel is launched, false if GPU is not used - */ - // LGBM_CUDA v5.2 - bool ConstructGPUHistogramsAsync( - const std::vector& is_feature_used, - const data_size_t* data_indices, data_size_t num_data); + /*! + * \brief Wait for GPU kernel execution and read histogram + * \param histograms Destination of histogram results from GPU. + */ + template + void WaitAndGetHistograms(FeatureHistogram* leaf_histogram_array); + /*! + * \brief Construct GPU histogram asynchronously. + * Interface is similar to Dataset::ConstructHistograms(). + * \param is_feature_used A predicate vector for enabling each feature + * \param data_indices Array of data example IDs to be included in histogram, will be copied to GPU. + * Set to nullptr to skip copy to GPU. + * \param num_data Number of data examples to be included in histogram + * \param gradients Array of gradients for all examples. + * \param hessians Array of hessians for all examples. + * \param ordered_gradients Ordered gradients will be generated and copied to GPU when gradients is not nullptr, + * Set gradients to nullptr to skip copy to GPU. + * \param ordered_hessians Ordered hessians will be generated and copied to GPU when hessians is not nullptr, + * Set hessians to nullptr to skip copy to GPU. + * \return true if GPU kernel is launched, false if GPU is not used + */ + // LGBM_CUDA v5.2 + bool ConstructGPUHistogramsAsync( + const std::vector& is_feature_used, + const data_size_t* data_indices, data_size_t num_data); - /*! brief Log2 of max number of workgroups per feature*/ - const int kMaxLogWorkgroupsPerFeature = 10; // 2^10 - /*! brief Max total number of workgroups with preallocated workspace. - * If we use more than this number of workgroups, we have to reallocate subhistograms */ - //int preallocd_max_num_wg_ = 1024; - std::vector preallocd_max_num_wg_; + /*! brief Log2 of max number of workgroups per feature*/ + const int kMaxLogWorkgroupsPerFeature = 10; // 2^10 + /*! brief Max total number of workgroups with preallocated workspace. 
+ * If we use more than this number of workgroups, we have to reallocate subhistograms */ + // int preallocd_max_num_wg_ = 1024; + std::vector preallocd_max_num_wg_; - /*! \brief True if bagging is used */ - bool use_bagging_; + /*! \brief True if bagging is used */ + bool use_bagging_; - /*! \brief GPU device object */ - //int* dev_; - /*! \brief GPU command queue object */ - std::vector stream_; + /*! \brief GPU device object */ + // int* dev_; + /*! \brief GPU command queue object */ + std::vector stream_; - /*! \brief total number of feature-groups */ - int num_feature_groups_; - /*! \brief total number of dense feature-groups, which will be processed on GPU */ - int num_dense_feature_groups_; - std::vector num_gpu_feature_groups_; // LGBM_CUDA - std::vector offset_gpu_feature_groups_; // LGBM_CUDA - /*! \brief On GPU we read one DWORD (4-byte) of features of one example once. - * With bin size > 16, there are 4 features per DWORD. - * With bin size <=16, there are 8 features per DWORD. - * */ - int dword_features_; - /*! \brief total number of dense feature-group tuples on GPU. - * Each feature tuple is 4-byte (4 features if each feature takes a byte) */ - //int num_dense_feature4_; - /*! \brief Max number of bins of training data, used to determine - * which GPU kernel to use */ - int max_num_bin_; - /*! \brief Used GPU kernel bin size (64, 256) */ - int histogram_size_; - int device_bin_size_; - /*! \brief Size of histogram bin entry, depending if single or double precision is used */ - size_t hist_bin_entry_sz_; - /*! \brief Indices of all dense feature-groups */ - std::vector dense_feature_group_map_; - /*! \brief Indices of all sparse feature-groups */ - std::vector sparse_feature_group_map_; - /*! \brief Multipliers of all dense feature-groups, used for redistributing bins */ - //std::vector device_bin_mults_; - /*! \brief GPU memory object holding the training data */ - //uint8_t *device_features_; - std::vector device_features_; - /*! \brief GPU memory object holding the ordered gradient */ - //score_t *device_gradients_; - std::vector device_gradients_; - /*! \brief Pointer to pinned memory of ordered gradient */ - void * ptr_pinned_gradients_ = nullptr; - /*! \brief GPU memory object holding the ordered hessian */ - //score_t *device_hessians_; - std::vector device_hessians_; - /*! \brief Pointer to pinned memory of ordered hessian */ - void * ptr_pinned_hessians_ = nullptr; - /*! \brief A vector of feature mask. 1 = feature used, 0 = feature not used */ -// std::vector> feature_masks_; - std::vector feature_masks_; - /*! \brief GPU memory object holding the feature masks */ - //void *device_feature_masks_; - std::vector device_feature_masks_; - /*! \brief Pointer to pinned memory of feature masks */ - char* ptr_pinned_feature_masks_ = nullptr; - /*! \brief GPU memory object holding indices of the leaf being processed */ - //data_size_t *device_data_indices_; - std::vector device_data_indices_; - /*! \brief GPU memory object holding counters for workgroup coordination */ - //int *sync_counters_; - std::vector sync_counters_; - /*! \brief GPU memory object holding temporary sub-histograms per workgroup */ - //char *device_subhistograms_; - std::vector device_subhistograms_; - /*! \brief Host memory object for histogram output (GPU will write to Host memory directly) */ - // FIXME: is this cuda mapped - //void *device_histogram_outputs_; - std::vector device_histogram_outputs_; - /*! \brief Host memory pointer for histogram outputs */ - void *host_histogram_outputs_; - /*! 
\LGBM_CUDA: CUDA waitlist object for waiting for data transfer before kernel execution */ - //cudaEvent_t kernel_wait_obj_; - std::vector kernel_wait_obj_; - /*! \LGBM_CUDA: CUDA waitlist object for reading output histograms after kernel execution */ - //cudaEvent_t histograms_wait_obj_; - std::vector histograms_wait_obj_; - /*! \LGBM_CUDA: CUDA Asynchronous waiting object for copying indices */ - //cudaEvent_t indices_future_; - std::vector indices_future_; - /*! \LGBM_CUDA: Asynchronous waiting object for copying gradients */ - //cudaEvent_t gradients_future_; - std::vector gradients_future_; - /*! \LGBM_CUDA: Asynchronous waiting object for copying hessians */ - //cudaEvent_t hessians_future_; - std::vector hessians_future_; - // LGBM_CUDA:\brief Asynchronous waiting object for copying dense features - //cudaEvent_t features_future_; - std::vector features_future_; + /*! \brief total number of feature-groups */ + int num_feature_groups_; + /*! \brief total number of dense feature-groups, which will be processed on GPU */ + int num_dense_feature_groups_; + std::vector num_gpu_feature_groups_; // LGBM_CUDA + std::vector offset_gpu_feature_groups_; // LGBM_CUDA + /*! \brief On GPU we read one DWORD (4-byte) of features of one example once. + * With bin size > 16, there are 4 features per DWORD. + * With bin size <=16, there are 8 features per DWORD. + */ + int dword_features_; + /*! \brief total number of dense feature-group tuples on GPU. + * Each feature tuple is 4-byte (4 features if each feature takes a byte) */ + // int num_dense_feature4_; + /*! \brief Max number of bins of training data, used to determine + * which GPU kernel to use */ + int max_num_bin_; + /*! \brief Used GPU kernel bin size (64, 256) */ + int histogram_size_; + int device_bin_size_; + /*! \brief Size of histogram bin entry, depending if single or double precision is used */ + size_t hist_bin_entry_sz_; + /*! \brief Indices of all dense feature-groups */ + std::vector dense_feature_group_map_; + /*! \brief Indices of all sparse feature-groups */ + std::vector sparse_feature_group_map_; + /*! \brief Multipliers of all dense feature-groups, used for redistributing bins */ + //std::vector device_bin_mults_; + /*! \brief GPU memory object holding the training data */ + //uint8_t *device_features_; + std::vector device_features_; + /*! \brief GPU memory object holding the ordered gradient */ + //score_t *device_gradients_; + std::vector device_gradients_; + /*! \brief Pointer to pinned memory of ordered gradient */ + void * ptr_pinned_gradients_ = nullptr; + /*! \brief GPU memory object holding the ordered hessian */ + // score_t *device_hessians_; + std::vector device_hessians_; + /*! \brief Pointer to pinned memory of ordered hessian */ + void * ptr_pinned_hessians_ = nullptr; + /*! \brief A vector of feature mask. 1 = feature used, 0 = feature not used */ + // std::vector> feature_masks_; + std::vector feature_masks_; + /*! \brief GPU memory object holding the feature masks */ + //void *device_feature_masks_; + std::vector device_feature_masks_; + /*! \brief Pointer to pinned memory of feature masks */ + char* ptr_pinned_feature_masks_ = nullptr; + /*! \brief GPU memory object holding indices of the leaf being processed */ + // data_size_t *device_data_indices_; + std::vector device_data_indices_; + /*! \brief GPU memory object holding counters for workgroup coordination */ + // int *sync_counters_; + std::vector sync_counters_; + /*! 
\brief GPU memory object holding temporary sub-histograms per workgroup */ + // char *device_subhistograms_; + std::vector device_subhistograms_; + /*! \brief Host memory object for histogram output (GPU will write to Host memory directly) */ + // FIXME: is this cuda mapped + // void *device_histogram_outputs_; + std::vector device_histogram_outputs_; + /*! \brief Host memory pointer for histogram outputs */ + void *host_histogram_outputs_; + /*! \LGBM_CUDA: CUDA waitlist object for waiting for data transfer before kernel execution */ + // cudaEvent_t kernel_wait_obj_; + std::vector kernel_wait_obj_; + /*! \LGBM_CUDA: CUDA waitlist object for reading output histograms after kernel execution */ + // cudaEvent_t histograms_wait_obj_; + std::vector histograms_wait_obj_; + /*! \LGBM_CUDA: CUDA Asynchronous waiting object for copying indices */ + // cudaEvent_t indices_future_; + std::vector indices_future_; + /*! \LGBM_CUDA: Asynchronous waiting object for copying gradients */ + // cudaEvent_t gradients_future_; + std::vector gradients_future_; + /*! \LGBM_CUDA: Asynchronous waiting object for copying hessians */ + // cudaEvent_t hessians_future_; + std::vector hessians_future_; + // LGBM_CUDA:\brief Asynchronous waiting object for copying dense features + // cudaEvent_t features_future_; + std::vector features_future_; - // LGBM_CUDA: use subset of training data for bagging - bool is_use_subset_; + // LGBM_CUDA: use subset of training data for bagging + bool is_use_subset_; - // LGBM_CUDA: host-side buffer for converting feature data into featre4 data - //std::vector host_vecs_; - int nthreads_; // number of Feature4* vector on host4_vecs_ - //cudaEvent_t kernel_start_; // event for kernel start - std::vector kernel_start_; - std::vector kernel_time_; // measure histogram kernel time - std::vector> kernel_input_wait_time_; - int num_gpu_; - int allocated_num_data_; // allocated data instances - pthread_t **cpu_threads_; // pthread, 1 cpu thread / gpu + // LGBM_CUDA: host-side buffer for converting feature data into featre4 data + // std::vector host_vecs_; + int nthreads_; // number of Feature4* vector on host4_vecs_ + // cudaEvent_t kernel_start_; // event for kernel start + std::vector kernel_start_; + std::vector kernel_time_; // measure histogram kernel time + std::vector> kernel_input_wait_time_; + int num_gpu_; + int allocated_num_data_; // allocated data instances + pthread_t **cpu_threads_; // pthread, 1 cpu thread / gpu }; - } // namespace LightGBM #else // USE_CUDA @@ -297,15 +293,15 @@ class CUDATreeLearner: public SerialTreeLearner { namespace LightGBM { class CUDATreeLearner: public SerialTreeLearner { -public: - #pragma warning(disable : 4702) - explicit CUDATreeLearner(const Config* tree_config) : SerialTreeLearner(tree_config) { - Log::Fatal("CUDA Tree Learner was not enabled in this build.\n" - "Please recompile with CMake option -DUSE_CUDA=1"); - } + public: + #pragma warning(disable : 4702) + explicit CUDATreeLearner(const Config* tree_config) : SerialTreeLearner(tree_config) { + Log::Fatal("CUDA Tree Learner was not enabled in this build.\n" + "Please recompile with CMake option -DUSE_CUDA=1"); + } }; } -#endif //USE_CUDA +#endif // USE_CUDA #endif // LGBM_TREELEARNER_CUDA_TREE_LEARNER_H_ diff --git a/src/treelearner/data_parallel_tree_learner.cpp b/src/treelearner/data_parallel_tree_learner.cpp index 0624bb96249..31425c77cd3 100644 --- a/src/treelearner/data_parallel_tree_learner.cpp +++ b/src/treelearner/data_parallel_tree_learner.cpp @@ -20,9 +20,9 @@ 
DataParallelTreeLearner::~DataParallelTreeLearner() { } template -void DataParallelTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) { //LGBM_CUDA +void DataParallelTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) { // LGBM_CUDA // initialize SerialTreeLearner - TREELEARNER_T::Init(train_data, is_constant_hessian, is_use_subset); //LGBM_CUDA + TREELEARNER_T::Init(train_data, is_constant_hessian, is_use_subset); // LGBM_CUDA // Get local rank and global machine size rank_ = Network::rank(); num_machines_ = Network::num_machines(); @@ -256,7 +256,7 @@ void DataParallelTreeLearner::Split(Tree* tree, int best_Leaf, in } // instantiate template classes, otherwise linker cannot find the code -template class DataParallelTreeLearner; // LGBM_CUDA +template class DataParallelTreeLearner; // LGBM_CUDA template class DataParallelTreeLearner; template class DataParallelTreeLearner; diff --git a/src/treelearner/feature_parallel_tree_learner.cpp b/src/treelearner/feature_parallel_tree_learner.cpp index 5cf660ab9c9..5a820328ddb 100644 --- a/src/treelearner/feature_parallel_tree_learner.cpp +++ b/src/treelearner/feature_parallel_tree_learner.cpp @@ -19,9 +19,9 @@ template FeatureParallelTreeLearner::~FeatureParallelTreeLearner() { } -template //LGBM_CUDA +template // LGBM_CUDA void FeatureParallelTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) { - TREELEARNER_T::Init(train_data, is_constant_hessian, is_use_subset); //LGBM_CUDA + TREELEARNER_T::Init(train_data, is_constant_hessian, is_use_subset); // LGBM_CUDA rank_ = Network::rank(); num_machines_ = Network::num_machines(); @@ -77,7 +77,7 @@ void FeatureParallelTreeLearner::FindBestSplitsFromHistograms( } // instantiate template classes, otherwise linker cannot find the code -template class FeatureParallelTreeLearner; // LGBM_CUDA +template class FeatureParallelTreeLearner; // LGBM_CUDA template class FeatureParallelTreeLearner; template class FeatureParallelTreeLearner; } // namespace LightGBM diff --git a/src/treelearner/gpu_tree_learner.cpp b/src/treelearner/gpu_tree_learner.cpp index fad02e1c044..7fb2a340a65 100644 --- a/src/treelearner/gpu_tree_learner.cpp +++ b/src/treelearner/gpu_tree_learner.cpp @@ -735,7 +735,7 @@ void GPUTreeLearner::InitGPU(int platform_id, int device_id) { } Tree* GPUTreeLearner::Train(const score_t* gradients, const score_t *hessians, - bool is_constant_hessian, Json& forced_split_json) { + bool is_constant_hessian, const Json& forced_split_json) { return SerialTreeLearner::Train(gradients, hessians, is_constant_hessian, forced_split_json); } diff --git a/src/treelearner/gpu_tree_learner.h b/src/treelearner/gpu_tree_learner.h index 598e8d40ac9..2ed29bcd1f7 100644 --- a/src/treelearner/gpu_tree_learner.h +++ b/src/treelearner/gpu_tree_learner.h @@ -49,7 +49,7 @@ class GPUTreeLearner: public SerialTreeLearner { void ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) override; void ResetIsConstantHessian(bool is_constant_hessian) override; Tree* Train(const score_t* gradients, const score_t *hessians, - bool is_constant_hessian, Json& forced_split_json) override; + bool is_constant_hessian, const Json& forced_split_json) override; void SetBaggingData(const Dataset* subset, const data_size_t* used_indices, data_size_t num_data) override { SerialTreeLearner::SetBaggingData(subset, used_indices, num_data); diff --git 
a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index a0780f913c9..7831159160b 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -1,8 +1,8 @@ -/* +/*! * ibmGBT: IBM CUDA Accelerated LightGBM * * IBM Confidential - * (C) Copyright IBM Corp. 2019. All Rights Reserved. + * Copyright (c) 2019 IBM Corporation. All rights reserved. * * The source code for this program is not published or otherwise * divested of its trade secrets, irrespective of what has been @@ -15,7 +15,7 @@ #include "histogram_16_64_256.hu" #include "stdio.h" -#define PRINT(b,t,fmt,...) \ +#define PRINT(b, t, fmt, ...) \ if (b == gtid && t == ltid) { \ printf(fmt, __VA_ARGS__); \ } @@ -132,7 +132,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, __shared__ float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)]; const uint gtid = blockIdx.x * blockDim.x + threadIdx.x; const ushort ltid = threadIdx.x; - const ushort lsize = NUM_BINS; // get_local_size(0); + const ushort lsize = NUM_BINS; // get_local_size(0); const ushort group_id = blockIdx.x; // local memory per workgroup is 3 KB @@ -185,7 +185,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, if (!feature_mask) { return; } else { - feature_mask = feature_mask - 1; // LGBM_CUDA: feature_mask is used for get feature (1: 4bit feature, 0: 8bit feature) + feature_mask = feature_mask - 1; // LGBM_CUDA: feature_mask is used for get feature (1: 4bit feature, 0: 8bit feature) } // STAGE 1: read feature data, and gradient and hessian @@ -291,7 +291,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #if CONST_HESSIAN == 1 // make a final reduction gh_hist[ltid * 2] += gh_hist[ltid * 2 + 1]; - gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // LGBM_CUDA: counter move to this position + gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // LGBM_CUDA: counter move to this position __syncthreads(); #endif @@ -358,7 +358,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // only 1 work group, no need to increase counter // the reduction will become a simple copy if (1) { - uint old_val; // dummy + uint old_val; // dummy #endif // locate our feature's block in output memory uint output_offset = (feature_id << power_feature_workgroups); @@ -486,7 +486,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, __shared__ float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)]; const uint gtid = blockIdx.x * blockDim.x + threadIdx.x; const ushort ltid = threadIdx.x; - const ushort lsize = NUM_BINS; // get_local_size(0); + const ushort lsize = NUM_BINS; // get_local_size(0); const ushort group_id = blockIdx.x; // local memory per workgroup is 3 KB @@ -539,7 +539,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, if (!feature_mask) { return; } else { - feature_mask = feature_mask - 1; // LGBM_CUDA: feature_mask is used for get feature (1: 4bit feature, 0: 8bit feature) + feature_mask = feature_mask - 1; // LGBM_CUDA: feature_mask is used for get feature (1: 4bit feature, 0: 8bit feature) } // STAGE 1: read feature data, and gradient and hessian @@ -645,7 +645,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #if CONST_HESSIAN == 1 // make a final reduction gh_hist[ltid * 2] += gh_hist[ltid * 2 + 1]; - gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // LGBM_CUDA: counter move to this position + gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // LGBM_CUDA: counter move to 
this position __syncthreads(); #endif @@ -712,7 +712,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // only 1 work group, no need to increase counter // the reduction will become a simple copy if (1) { - uint old_val; // dummy + uint old_val; // dummy #endif // locate our feature's block in output memory uint output_offset = (feature_id << power_feature_workgroups); @@ -841,7 +841,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, __shared__ float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)]; const uint gtid = blockIdx.x * blockDim.x + threadIdx.x; const ushort ltid = threadIdx.x; - const ushort lsize = NUM_BINS; // get_local_size(0); + const ushort lsize = NUM_BINS; // get_local_size(0); const ushort group_id = blockIdx.x; // local memory per workgroup is 3 KB @@ -893,7 +893,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, if (!feature_mask) { return; } else { - feature_mask = feature_mask - 1; // LGBM_CUDA: feature_mask is used for get feature (1: 4bit feature, 0: 8bit feature) + feature_mask = feature_mask - 1; // LGBM_CUDA: feature_mask is used for get feature (1: 4bit feature, 0: 8bit feature) } // STAGE 1: read feature data, and gradient and hessian @@ -905,7 +905,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, ushort bin; feature = feature_data[ind >> feature_mask]; - if (feature_mask) { + if (feature_mask) { feature = (feature >> ((ind & 1) << 2)) & 0xf; } bin = feature; @@ -997,7 +997,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #if CONST_HESSIAN == 1 // make a final reduction gh_hist[ltid * 2] += gh_hist[ltid * 2 + 1]; - gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // LGBM_CUDA: counter move to this position + gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // LGBM_CUDA: counter move to this position __syncthreads(); #endif @@ -1064,7 +1064,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // only 1 work group, no need to increase counter // the reduction will become a simple copy if (1) { - uint old_val; // dummy + uint old_val; // dummy #endif // locate our feature's block in output memory uint output_offset = (feature_id << power_feature_workgroups); diff --git a/src/treelearner/kernels/histogram_16_64_256.hu b/src/treelearner/kernels/histogram_16_64_256.hu index 1a875588cc4..4dfcb9f7895 100644 --- a/src/treelearner/kernels/histogram_16_64_256.hu +++ b/src/treelearner/kernels/histogram_16_64_256.hu @@ -1,8 +1,8 @@ -/* +/*! * ibmGBT: IBM CUDA Accelerated LightGBM * * IBM Confidential - * (C) Copyright IBM Corp. 2019. All Rights Reserved. + * Copyright (c) 2019 IBM Corporation. All rights reserved. 
* * The source code for this program is not published or otherwise * divested of its trade secrets, irrespective of what has been diff --git a/src/treelearner/parallel_tree_learner.h b/src/treelearner/parallel_tree_learner.h index 2fdf542d421..222955a3c94 100644 --- a/src/treelearner/parallel_tree_learner.h +++ b/src/treelearner/parallel_tree_learner.h @@ -28,7 +28,7 @@ class FeatureParallelTreeLearner: public TREELEARNER_T { public: explicit FeatureParallelTreeLearner(const Config* config); ~FeatureParallelTreeLearner(); - void Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) override; // LGBM_CUDA + void Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) override; // LGBM_CUDA protected: void BeforeTrain() override; @@ -55,7 +55,7 @@ class DataParallelTreeLearner: public TREELEARNER_T { public: explicit DataParallelTreeLearner(const Config* config); ~DataParallelTreeLearner(); - void Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) override; // LGBM_CUDA + void Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) override; // LGBM_CUDA void ResetConfig(const Config* config) override; protected: @@ -109,7 +109,7 @@ class VotingParallelTreeLearner: public TREELEARNER_T { public: explicit VotingParallelTreeLearner(const Config* config); ~VotingParallelTreeLearner() { } - void Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) override; //LGBM_CUDA + void Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) override; // LGBM_CUDA void ResetConfig(const Config* config) override; protected: diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index e5b6626a6bd..3fdcbca23a0 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -25,9 +25,9 @@ SerialTreeLearner::SerialTreeLearner(const Config* config) SerialTreeLearner::~SerialTreeLearner() { } -//LGBM_CUDA +// LGBM_CUDA void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) { - (void)is_use_subset; // UNUSED + (void)is_use_subset; // UNUSED train_data_ = train_data; num_data_ = train_data_->num_data(); num_features_ = train_data_->num_features(); @@ -150,7 +150,7 @@ void SerialTreeLearner::ResetConfig(const Config* config) { constraints_.reset(LeafConstraintsBase::Create(config_, config_->num_leaves)); } -Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians, bool is_constant_hessian, Json& forced_split_json) { +Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians, bool is_constant_hessian, const Json& forced_split_json) { Common::FunctionTimer fun_timer("SerialTreeLearner::Train", global_timer); gradients_ = gradients; hessians_ = hessians; @@ -441,7 +441,7 @@ void SerialTreeLearner::FindBestSplitsFromHistograms( } } -int32_t SerialTreeLearner::ForceSplits(Tree* tree, Json& forced_split_json, int* left_leaf, +int32_t SerialTreeLearner::ForceSplits(Tree* tree, const Json& forced_split_json, int* left_leaf, int* right_leaf, int *cur_depth, bool *aborted_last_force_split) { (void)aborted_last_force_split; diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 367c262192c..23fc75659ad 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -80,7 +80,7 @@ class SerialTreeLearner: public TreeLearner { } Tree* Train(const 
score_t* gradients, const score_t *hessians, bool is_constant_hessian, - Json& forced_split_json) override; + const Json& forced_split_json) override; Tree* FitByExistingTree(const Tree* old_tree, const score_t* gradients, const score_t* hessians) const override; @@ -163,7 +163,7 @@ class SerialTreeLearner: public TreeLearner { bool update_cnt); /* Force splits with forced_split_json dict and then return num splits forced.*/ - virtual int32_t ForceSplits(Tree* tree, Json& forced_split_json, int* left_leaf, + virtual int32_t ForceSplits(Tree* tree, const Json& forced_split_json, int* left_leaf, int* right_leaf, int* cur_depth, bool *aborted_last_force_split); @@ -209,11 +209,11 @@ class SerialTreeLearner: public TreeLearner { std::vector> ordered_gradients_; /*! \brief hessians of current iteration, ordered for cache optimized, aligned to 4K page */ std::vector> ordered_hessians_; -#elif USE_CUDA //LGBM_CUDA +#elif USE_CUDA // LGBM_CUDA /*! \brief gradients of current iteration, ordered for cache optimized */ - std::vector> ordered_gradients_; + std::vector> ordered_gradients_; /*! \brief hessians of current iteration, ordered for cache optimized */ - std::vector> ordered_hessians_; + std::vector> ordered_hessians_; #else /*! \brief gradients of current iteration, ordered for cache optimized */ std::vector> ordered_gradients_; diff --git a/src/treelearner/tree_learner.cpp b/src/treelearner/tree_learner.cpp index df7231e91df..d47b469f950 100644 --- a/src/treelearner/tree_learner.cpp +++ b/src/treelearner/tree_learner.cpp @@ -5,7 +5,7 @@ #include #include "gpu_tree_learner.h" -#include "cuda_tree_learner.h" // LGBM_CUDA +#include "cuda_tree_learner.h" // LGBM_CUDA #include "parallel_tree_learner.h" #include "serial_tree_learner.h" @@ -32,7 +32,7 @@ TreeLearner* TreeLearner::CreateTreeLearner(const std::string& learner_type, con } else if (learner_type == std::string("voting")) { return new VotingParallelTreeLearner(config); } - } else if (device_type == std::string("cuda")) { // LGBM_CUDA + } else if (device_type == std::string("cuda")) { // LGBM_CUDA if (learner_type == std::string("serial")) { return new CUDATreeLearner(config); } else if (learner_type == std::string("feature")) { diff --git a/src/treelearner/voting_parallel_tree_learner.cpp b/src/treelearner/voting_parallel_tree_learner.cpp index 58f5b88d6b0..4b120975c26 100644 --- a/src/treelearner/voting_parallel_tree_learner.cpp +++ b/src/treelearner/voting_parallel_tree_learner.cpp @@ -19,8 +19,8 @@ VotingParallelTreeLearner::VotingParallelTreeLearner(const Config } template -void VotingParallelTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) { // LGBM_CUDA - TREELEARNER_T::Init(train_data, is_constant_hessian, is_use_subset); // LGBM_CUDA +void VotingParallelTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) { // LGBM_CUDA + TREELEARNER_T::Init(train_data, is_constant_hessian, is_use_subset); // LGBM_CUDA rank_ = Network::rank(); num_machines_ = Network::num_machines(); @@ -454,7 +454,7 @@ void VotingParallelTreeLearner::Split(Tree* tree, int best_Leaf, } // instantiate template classes, otherwise linker cannot find the code -template class VotingParallelTreeLearner; // LGBM_CUDA +template class VotingParallelTreeLearner; // LGBM_CUDA template class VotingParallelTreeLearner; template class VotingParallelTreeLearner; } // namespace LightGBM From bdcbeaa03563e330eb3935df2fc255134629bddc Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 15 Jun 2020 
23:18:58 +0000 Subject: [PATCH 073/119] Another large lint cleanup - more to come. --- include/LightGBM/cuda/cuda_utils.h | 2 +- include/LightGBM/cuda/vector_cudahost.h | 16 +- src/boosting/gbdt.cpp | 51 ++--- src/boosting/gbdt.h | 4 +- src/c_api.cpp | 2 +- src/io/config_auto.cpp | 3 +- src/io/dataset.cpp | 24 +-- src/io/dense_bin.hpp | 12 +- src/treelearner/cuda_kernel_launcher.cu | 185 +++++++++--------- src/treelearner/cuda_tree_learner.cpp | 64 +++--- src/treelearner/cuda_tree_learner.h | 36 ++-- .../feature_parallel_tree_learner.cpp | 2 +- src/treelearner/serial_tree_learner.cpp | 9 +- src/treelearner/serial_tree_learner.h | 2 +- 14 files changed, 195 insertions(+), 217 deletions(-) diff --git a/include/LightGBM/cuda/cuda_utils.h b/include/LightGBM/cuda/cuda_utils.h index 7ff7b28c8f1..2fb45384f0c 100644 --- a/include/LightGBM/cuda/cuda_utils.h +++ b/include/LightGBM/cuda/cuda_utils.h @@ -14,7 +14,7 @@ #include #define CUDASUCCESS_OR_FATAL(ans) { gpuAssert((ans), __FILE__, __LINE__); } -inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) { +inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) { if (code != cudaSuccess) { LightGBM::Log::Fatal("CUDA_RUNTIME: %s %s %d\n", cudaGetErrorString(code), file, line); if (abort) exit(code); diff --git a/include/LightGBM/cuda/vector_cudahost.h b/include/LightGBM/cuda/vector_cudahost.h index b964fa4ad1f..41a27c349bd 100644 --- a/include/LightGBM/cuda/vector_cudahost.h +++ b/include/LightGBM/cuda/vector_cudahost.h @@ -22,9 +22,9 @@ namespace LightGBM { #define use_cuda_learner 2 class LGBM_config_ { - public: - static int current_device; // Default: lgbm_device_cpu - static int current_learner; // Default: use_cpu_learner + public: + static int current_device; // Default: lgbm_device_cpu + static int current_learner; // Default: use_cpu_learner }; } // namespace LightGBM @@ -48,9 +48,9 @@ struct CHAllocator { } else { ptr = reinterpret_cast(malloc(n*sizeof(T))); } - #else + #else ptr = reinterpret_cast(malloc(n*sizeof(T))); - #endif + #endif return ptr; } @@ -60,17 +60,17 @@ struct CHAllocator { #ifdef USE_CUDA if (LightGBM::LGBM_config_::current_device == lgbm_device_cuda) { cudaPointerAttributes attributes; - cudaPointerGetAttributes (&attributes, p); + cudaPointerGetAttributes(&attributes, p); if ((attributes.type == cudaMemoryTypeHost) && (attributes.devicePointer != NULL)) { cudaFreeHost(p); } - } else { + } else { free(p); } #else free(p); #endif - } + } }; template bool operator==(const CHAllocator&, const CHAllocator&); diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index d102f6aedc1..baeacc6beb9 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -128,8 +128,7 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective if (config_->device_type == std::string("cuda")) { if (is_use_subset_) { tree_learner_->Init(tmp_subset_.get(), is_constant_hessian_, is_use_subset_); - } - else { + } else { tree_learner_->Init(train_data_, is_constant_hessian_, is_use_subset_); } } else { @@ -262,7 +261,7 @@ void GBDT::Bagging(int iter) { tree_learner_->SetBaggingData(nullptr, bag_data_indices_.data(), bag_data_cnt_); } else { // LGBM_CUDA // NEW get subset - bool resized= tmp_subset_->ReSize(bag_data_cnt_); + bool resized = tmp_subset_->ReSize(bag_data_cnt_); if (resized && (config_->device_type == std::string("cuda"))) { size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; @@ -284,7 +283,7 @@ void GBDT::Train(int 
snapshot_freq, const std::string& model_output_path) { Common::FunctionTimer fun_timer("GBDT::Train", global_timer); bool is_finished = false; - // LGBM_CUDA + // LGBM_CUDA auto start_time = std::chrono::steady_clock::now(); for (int iter = 0; iter < config_->num_iterations && !is_finished; ++iter) { @@ -379,13 +378,11 @@ double GBDT::BoostFromAverage(int class_id, bool update_scorer) { // LGBM_CUDA bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { + // LGBM_CUDA invoke bagging during the first iteration + if (config_->device_type == std::string("cuda") && (iter_ == 0)) { + // auto start_time = std::chrono::steady_clock::now(); - // LGBM_CUDA invoke bagging during the first iteration - if (config_->device_type == std::string("cuda") && (iter_ == 0)) { - -// auto start_time = std::chrono::steady_clock::now(); - - Bagging(iter_); + Bagging(iter_); } std::vector init_scores(num_tree_per_iteration_, 0.0); @@ -397,7 +394,7 @@ bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { } // LGBM_CUDA -// auto start_time = std::chrono::steady_clock::now(); + // auto start_time = std::chrono::steady_clock::now(); Boosting(); @@ -406,11 +403,10 @@ bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { } // LGBM_CUDA bagging logic - // Bagging(iter_); + // Bagging(iter_); bool should_continue = false; for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { - // LGBM_CUDA // auto start_time = std::chrono::steady_clock::now(); @@ -418,7 +414,6 @@ bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { std::unique_ptr new_tree(new Tree(2)); if (class_need_train_[cur_tree_id] && train_data_->num_features() > 0) { - auto grad = gradients + offset; auto hess = hessians + offset; @@ -434,8 +429,7 @@ bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { // need to copy gradients for bagging subset. 
if (is_use_subset_ && bag_data_cnt_ < num_data_) { - - #pragma omp parallel for schedule(static) // LGBM_CUDA + #pragma omp parallel for schedule(static) // LGBM_CUDA for (int i = 0; i < bag_data_cnt_; ++i) { tmp_grad[i] = grad[bag_data_indices_[i]]; // LGBM_CUDA tmp_hess[i] = hess[bag_data_indices_[i]]; // LGBM_CUDA @@ -482,11 +476,10 @@ bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { int iter_next = iter_ + 1; if (iter_next < config_->num_iterations) { -// auto start_time = std::chrono::steady_clock::now(); + // auto start_time = std::chrono::steady_clock::now(); // bagging logic Bagging(iter_next); - } } // add model @@ -508,7 +501,6 @@ bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { } bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { - if (config_->device_type == std::string("cuda")) { // LGBM_CUDA return TrainOneIterCUDA(gradients, hessians); } @@ -966,17 +958,16 @@ void GBDT::ResetBaggingConfig(const Config* config, bool is_change_dataset) { } else { bag_data_cnt_ = num_data_; if (config_->device_type == std::string("cuda")) { // LGBM_CUDA - if (tmp_subset_ == nullptr){ - tmp_subset_.reset(new Dataset(bag_data_cnt_)); - tmp_subset_->CopyFeatureMapperFrom(train_data_); - size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; - tmp_gradients_.resize(total_size); - tmp_hessians_.resize(total_size); - is_use_subset_ = false; - bag_data_indices_.clear(); - } - } - else { + if (tmp_subset_ == nullptr){ + tmp_subset_.reset(new Dataset(bag_data_cnt_)); + tmp_subset_->CopyFeatureMapperFrom(train_data_); + size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; + tmp_gradients_.resize(total_size); + tmp_hessians_.resize(total_size); + is_use_subset_ = false; + bag_data_indices_.clear(); + } + } else { bag_data_indices_.clear(); bagging_runner_.ReSize(0); is_use_subset_ = false; diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index 99bf64a6fb0..d22b6687766 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -478,8 +478,8 @@ class GBDT : public GBDTBase { #ifdef USE_CUDA /*! \brief First order derivative of training data */ - std::vector> gradients_; // LGBM_CUDA - std::vector> tmp_gradients_; // LGBM_CUDA + std::vector> gradients_; // LGBM_CUDA + std::vector> tmp_gradients_; // LGBM_CUDA /*! 
\brief Second order derivative of training data */ std::vector> hessians_; // LGBM_CUDA std::vector> tmp_hessians_; // LGBM_CUDA diff --git a/src/c_api.cpp b/src/c_api.cpp index 0ce92342fb6..6cdebc34aed 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -132,7 +132,7 @@ class Booster { #ifdef USE_CUDA // Only use CUDA when the data is large (2048 == 256 bins each with at least 8 elements) - if (train_data->num_data() < 2048){ + if (train_data->num_data() < 2048) { config_.device_type = std::string("cpu"); } #endif diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index b0cd57deb69..ba9c07cb547 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -618,9 +618,8 @@ void Config::GetMembersFromString(const std::unordered_map 0); + CHECK_GT(num_gpu, 0); #endif - } std::string Config::SaveMembersToString() const { diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index df18ef6f838..416d66695a3 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -239,14 +239,14 @@ std::vector> FindGroups( } std::vector> FastFeatureBundling(const std::vector>& bin_mappers, - int** sample_indices, - double** sample_values, + int** sample_indices, + double** sample_values, const int* num_per_col, - int num_sample_col, + int num_sample_col, data_size_t total_sample_cnt, - const std::vector& used_features, + const std::vector& used_features, data_size_t num_data, - bool is_sparse, + bool is_sparse, std::vector* multi_val_group, bool is_use_gpu) { Common::FunctionTimer fun_timer("Dataset::FastFeatureBundling", global_timer); @@ -355,15 +355,15 @@ void Dataset::Construct(std::vector>* bin_mappers, std::vector group_is_multi_val(used_features.size(), 0); if (io_config.enable_bundle && !used_features.empty()) { bool lgbm_is_gpu_used = io_config.device_type == std::string("gpu") || io_config.device_type == std::string("cuda"); // LGBM_CUDA - features_in_group = FastFeatureBundling(*bin_mappers, - sample_non_zero_indices, - sample_values, + features_in_group = FastFeatureBundling(*bin_mappers, + sample_non_zero_indices, + sample_values, num_per_col, - num_sample_col, + num_sample_col, static_cast(total_sample_cnt), - used_features, - num_data_, - io_config.is_enable_sparse, + used_features, + num_data_, + io_config.is_enable_sparse, &group_is_multi_val, lgbm_is_gpu_used); } diff --git a/src/io/dense_bin.hpp b/src/io/dense_bin.hpp index 89f29a99bdc..0eb37043842 100644 --- a/src/io/dense_bin.hpp +++ b/src/io/dense_bin.hpp @@ -464,16 +464,16 @@ class DenseBin : public Bin { DenseBin* Clone() override; private: - data_size_t num_data_; + data_size_t num_data_; #ifdef USE_CUDA - std::vector> data_; // LGBM_CUDA + std::vector> data_; // LGBM_CUDA #else - std::vector> data_; + std::vector> data_; #endif - std::vector buf_; + std::vector buf_; - DenseBin(const DenseBin& other) - : num_data_(other.num_data_), data_(other.data_) {} + DenseBin(const DenseBin& other) + : num_data_(other.num_data_), data_(other.data_) {} }; template diff --git a/src/treelearner/cuda_kernel_launcher.cu b/src/treelearner/cuda_kernel_launcher.cu index 6e3149dae06..f8c3effeb6c 100644 --- a/src/treelearner/cuda_kernel_launcher.cu +++ b/src/treelearner/cuda_kernel_launcher.cu @@ -1,13 +1,17 @@ - #ifdef USE_CUDA - - #include "cuda_kernel_launcher.h" - #include - #include - #include - - using namespace LightGBM; - - void cuda_histogram( +/*! + * Copyright (c) 2020 IBM Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. 
+ */ +#ifdef USE_CUDA + +#include "cuda_kernel_launcher.h" +#include +#include +#include + +using namespace LightGBM; + +void cuda_histogram( int histogram_size, data_size_t leaf_num_data, data_size_t num_data, @@ -28,147 +32,136 @@ void* arg9, size_t exp_workgroups_per_feature) { - if (histogram_size == 16) { - if (leaf_num_data == num_data) { - if (use_all_features) { - if (!is_constant_hessian) - histogram16<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, + if (histogram_size == 16) { + if (leaf_num_data == num_data) { + if (use_all_features) { + if (!is_constant_hessian) + histogram16<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram16<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, + else + histogram16<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } - else { - if (!is_constant_hessian) - histogram16_fulldata<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, + } else { + if (!is_constant_hessian) + histogram16_fulldata<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram16_fulldata<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, + else + histogram16_fulldata<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } } - else { - if (use_all_features) { - // seems all features is always enabled, so this should be the same as fulldata - if (!is_constant_hessian) - histogram16<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, + } else { + if (use_all_features) { + // seems all features is always enabled, so this should be the same as fulldata + if (!is_constant_hessian) + histogram16<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram16<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, + else + histogram16<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } - else { - if (!is_constant_hessian) - histogram16<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, + } else { + if (!is_constant_hessian) + histogram16<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else + else histogram16<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } } - } - else if (histogram_size == 64) { - if (leaf_num_data == num_data) { - if (use_all_features) { - if (!is_constant_hessian) - histogram64<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, + } + } else if (histogram_size == 64) { + if (leaf_num_data == num_data) { + if (use_all_features) { + if (!is_constant_hessian) + histogram64<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - 
histogram64<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, + else + histogram64<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } - else { - if (!is_constant_hessian) - histogram64_fulldata<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, + } else { + if (!is_constant_hessian) + histogram64_fulldata<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram64_fulldata<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, + else + histogram64_fulldata<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } } - else { - if (use_all_features) { - // seems all features is always enabled, so this should be the same as fulldata - if (!is_constant_hessian) - histogram64<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, + } else { + if (use_all_features) { + // seems all features is always enabled, so this should be the same as fulldata + if (!is_constant_hessian) + histogram64<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram64<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, + else + histogram64<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } - else { - if (!is_constant_hessian) - histogram64<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, + } else { + if (!is_constant_hessian) + histogram64<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram64<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, + else + histogram64<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } } - } - else { - if (leaf_num_data == num_data) { - if (use_all_features) { - if (!is_constant_hessian) - histogram256<<>>( arg0, arg1, arg2, + } + } else { + if (leaf_num_data == num_data) { + if (use_all_features) { + if (!is_constant_hessian) + histogram256<<>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram256<<>>( arg0, arg1, arg2, + else + histogram256<<>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } - else { - if (!is_constant_hessian) - histogram256_fulldata<<>>( arg0, arg1, arg2, + } else { + if (!is_constant_hessian) + histogram256_fulldata<<>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram256_fulldata<<>>( arg0, arg1, arg2, + else + histogram256_fulldata<<>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } } - else { - if (use_all_features) { - // seems all features is always enabled, so this should be the same as fulldata - if (!is_constant_hessian) 
- histogram256<<>>( arg0, arg1, arg2, + } else { + if (use_all_features) { + // seems all features is always enabled, so this should be the same as fulldata + if (!is_constant_hessian) + histogram256<<>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram256<<>>( arg0, arg1, arg2, + else + histogram256<<>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } - else { - if (!is_constant_hessian) - histogram256<<>>( arg0, arg1, arg2, + } else { + if (!is_constant_hessian) + histogram256<<>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram256<<>>( arg0, arg1, arg2, + else + histogram256<<>>( arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } } - } + } + } } #endif // USE_CUDA diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 0588ad14fe0..b61cddf4d3e 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -3,12 +3,13 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. */ #ifdef USE_CUDA +#include "cuda_tree_learner.h" + #include #include #include -#include "cuda_tree_learner.h" #include "../io/dense_bin.hpp" #include @@ -58,8 +59,11 @@ CUDATreeLearner::CUDATreeLearner(const Config* config) :SerialTreeLearner(config) { use_bagging_ = false; nthreads_ = 0; - if (config->gpu_use_dp && USE_DP_FLOAT) Log::Info("LightGBM-CUDA using CUDA trainer with DP float!!"); - else Log::Info("LightGBM-CUDA using CUDA trainer with SP float!!"); + if (config->gpu_use_dp && USE_DP_FLOAT) { + Log::Info("LightGBM-CUDA using CUDA trainer with DP float!!"); + } else { + Log::Info("LightGBM-CUDA using CUDA trainer with SP float!!"); + } } CUDATreeLearner::~CUDATreeLearner() { @@ -67,13 +71,11 @@ CUDATreeLearner::~CUDATreeLearner() { void CUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) { - // initialize SerialTreeLearner SerialTreeLearner::Init(train_data, is_constant_hessian, is_use_subset); // some additional variables needed for GPU trainer num_feature_groups_ = train_data_->num_feature_groups(); - // LGBM_CUDA: use subset of training data for bagging is_use_subset_ = is_use_subset; @@ -107,11 +109,11 @@ union Float_t { int CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int dp_flag, int const_flag) { int i; int retval = 0; - printf("Comparing Histograms, feature_id = %d, size = %d\n", feature_id, (int) size); + printf("Comparing Histograms, feature_id = %d, size = %d\n", feature_id, static_cast(size)); if (dp_flag) { // double precision double af, bf; - long long int ai, bi; - for (i = 0; i < (int) size; ++i) { + int64 ai, bi; + for (i = 0; i < static_cast(size); ++i) { af = GET_GRAD(h1, i); bf = GET_GRAD(h2, i); if ((((std::fabs(af - bf))/af) >= 1e-6) && ((std::fabs(af - bf)) >= 1e-6)) { @@ -119,8 +121,8 @@ int CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int d ++retval; } if (const_flag) { - ai = GET_HESS(((long long int *) h1), i); - bi = GET_HESS(((long long int *) h2), i); + ai = GET_HESS(((int64 *) h1), i); + bi = GET_HESS(((int64 *) h2), i); if (ai != bi) { printf("i = %5d, h1.hess %lld, h2.hess %lld\n", i, ai, bi); ++retval; 
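Both CompareHistograms hunks here keep the same tolerance rule: a bin's gradient sum is only flagged when the absolute difference and the relative difference each reach 1e-6. A self-contained sketch of that check, assuming plain double buffers and a hypothetical CountHistogramMismatches helper (this is not the patched function itself):

// compare_sketch.cc -- illustrative only; names and buffer layout are assumptions.
#include <cmath>
#include <cstdio>

// Count bins whose gradient sums differ by at least 1e-6 both absolutely and
// relatively, mirroring the double test used by CompareHistograms above.
int CountHistogramMismatches(const double* gpu, const double* cpu, int num_bins) {
  int mismatches = 0;
  for (int i = 0; i < num_bins; ++i) {
    const double diff = std::fabs(gpu[i] - cpu[i]);
    if (diff >= 1e-6 && diff / std::fabs(cpu[i]) >= 1e-6) {
      std::printf("bin %d: gpu=%g cpu=%g\n", i, gpu[i], cpu[i]);
      ++mismatches;
    }
  }
  return mismatches;
}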
@@ -137,7 +139,7 @@ int CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int d } else { // single precision float af, bf; int ai, bi; - for (i = 0; i < (int) size; ++i) { + for (i = 0; i < static_cast(size); ++i) { af = GET_GRAD(h1, i); bf = GET_GRAD(h2, i); if ((((std::fabs(af - bf))/af) >= 1e-6) && ((std::fabs(af - bf)) >= 1e-6)) { @@ -167,7 +169,6 @@ int CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int d #endif int CUDATreeLearner::GetNumWorkgroupsPerFeature(data_size_t leaf_num_data) { - // we roughly want 256 workgroups per device, and we have num_dense_feature4_ feature tuples. // also guarantee that there are at least 2K examples per workgroup @@ -177,9 +178,9 @@ int CUDATreeLearner::GetNumWorkgroupsPerFeature(data_size_t leaf_num_data) { double t = leaf_num_data / 1024.0; Log::Debug("We can have at most %d workgroups per feature4 for efficiency reasons" - "Best workgroup size per feature for full utilization is %d\n", (int)ceil(t), (1 << exp_workgroups_per_feature)); + "Best workgroup size per feature for full utilization is %d\n", static_cast(ceil(t)), (1 << exp_workgroups_per_feature)); - exp_workgroups_per_feature = std::min(exp_workgroups_per_feature, (int)ceil(log((double)t)/log(2.0))); + exp_workgroups_per_feature = std::min(exp_workgroups_per_feature, static_cast(ceil(log((double)t)/log(2.0)))); if (exp_workgroups_per_feature < 0) exp_workgroups_per_feature = 0; if (exp_workgroups_per_feature > kMaxLogWorkgroupsPerFeature) @@ -189,7 +190,6 @@ int CUDATreeLearner::GetNumWorkgroupsPerFeature(data_size_t leaf_num_data) { } void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_features) { - // we have already copied ordered gradients, ordered hessians and indices to GPU // decide the best number of workgroups working on one feature4 tuple // set work group size based on feature size @@ -198,7 +198,7 @@ void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featu int exp_workgroups_per_feature = GetNumWorkgroupsPerFeature(leaf_num_data); std::vector num_gpu_workgroups; - ThreadData *thread_data = (ThreadData*)malloc(sizeof(ThreadData) * num_gpu_); + ThreadData *thread_data = reinterpret_cast(malloc(sizeof(ThreadData) * num_gpu_)); for (int device_id = 0; device_id < num_gpu_; ++device_id) { int num_gpu_feature_groups = num_gpu_feature_groups_[device_id]; @@ -209,13 +209,13 @@ void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featu CUDASUCCESS_OR_FATAL(cudaFree(device_subhistograms_[device_id])); CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_subhistograms_[device_id]), (size_t) num_workgroups * dword_features_ * device_bin_size_ * (3 * hist_bin_entry_sz_ / 2))); } - //set thread_data + // set thread_data SetThreadData(thread_data, device_id, histogram_size_, leaf_num_data, use_all_features, num_workgroups, exp_workgroups_per_feature); } - + for (int device_id = 0; device_id < num_gpu_; ++device_id) { - if (pthread_create(cpu_threads_[device_id], NULL, launch_cuda_histogram, (void *)(&thread_data[device_id]))) { + if (pthread_create(cpu_threads_[device_id], NULL, launch_cuda_histogram, reinterpret_cast(&thread_data[device_id]))) { fprintf(stderr, "Error in creating threads. Exiting\n"); exit(0); } @@ -231,13 +231,12 @@ void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featu } for (int device_id = 0; device_id < num_gpu_; ++device_id) { - // copy the results asynchronously. 
Size depends on if double precision is used size_t output_size = num_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_; size_t host_output_offset = offset_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_; - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync((char*)host_histogram_outputs_ + host_output_offset, device_histogram_outputs_[device_id], output_size, cudaMemcpyDeviceToHost, stream_[device_id])); + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(reinterpret_cast(host_histogram_outputs_) + host_output_offset, device_histogram_outputs_[device_id], output_size, cudaMemcpyDeviceToHost, stream_[device_id])); CUDASUCCESS_OR_FATAL(cudaEventRecord(histograms_wait_obj_[device_id], stream_[device_id])); } } @@ -245,12 +244,11 @@ void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featu template void CUDATreeLearner::WaitAndGetHistograms(FeatureHistogram* leaf_histogram_array) { - HistType* hist_outputs = (HistType*) host_histogram_outputs_; + HistType* hist_outputs = reinterpret_cast(host_histogram_outputs_); #pragma omp parallel for schedule(static, num_gpu_) for (int device_id = 0; device_id < num_gpu_; ++device_id) { - -// auto start_time = std::chrono::steady_clock::now(); + // auto start_time = std::chrono::steady_clock::now(); // when the output is ready, the computation is done CUDASUCCESS_OR_FATAL(cudaEventSynchronize(histograms_wait_obj_[device_id])); @@ -263,9 +261,9 @@ void CUDATreeLearner::WaitAndGetHistograms(FeatureHistogram* leaf_histogram_arra continue; } int dense_group_index = dense_feature_group_map_[i]; - //auto old_histogram_array = histograms + train_data_->GroupBinBoundary(dense_group_index) * 2; + // auto old_histogram_array = histograms + train_data_->GroupBinBoundary(dense_group_index) * 2; auto old_histogram_array = leaf_histogram_array[dense_group_index].RawData() - kHistOffset; - int bin_size = train_data_->FeatureGroupNumBin(dense_group_index); + int bin_size = train_data_->FeatureGroupNumBin(dense_group_index); for (int j = 0; j < bin_size; ++j) { GET_GRAD(old_histogram_array, j) = GET_GRAD(hist_outputs, i * device_bin_size_+ j); @@ -276,7 +274,6 @@ void CUDATreeLearner::WaitAndGetHistograms(FeatureHistogram* leaf_histogram_arra // LGBM_CUDA void CUDATreeLearner::CountDenseFeatureGroups() { - num_dense_feature_groups_ = 0; for (int i = 0; i < num_feature_groups_; ++i) { @@ -291,7 +288,6 @@ void CUDATreeLearner::CountDenseFeatureGroups() { // LGBM_CUDA void CUDATreeLearner::prevAllocateGPUMemory() { - // how many feature-group tuples we have // leave some safe margin for prefetching // 256 work-items per workgroup. 
Each work-item prefetches one tuple for that feature @@ -325,12 +321,12 @@ void CUDATreeLearner::prevAllocateGPUMemory() { #if 0 // allocate feature mask, for disabling some feature-groups' histogram calculation if (feature_masks_.data() != NULL) { - cudaPointerAttributes attributes; - cudaPointerGetAttributes (&attributes, feature_masks_.data()); - - if ((attributes.type == cudaMemoryTypeHost) && (attributes.devicePointer != NULL)) { - CUDASUCCESS_OR_FATAL(cudaHostUnregister(feature_masks_.data())); - } + cudaPointerAttributes attributes; + cudaPointerGetAttributes(&attributes, feature_masks_.data()); + + if ((attributes.type == cudaMemoryTypeHost) && (attributes.devicePointer != NULL)) { + CUDASUCCESS_OR_FATAL(cudaHostUnregister(feature_masks_.data())); + } } #endif diff --git a/src/treelearner/cuda_tree_learner.h b/src/treelearner/cuda_tree_learner.h index 384ec57f66a..0375239049d 100644 --- a/src/treelearner/cuda_tree_learner.h +++ b/src/treelearner/cuda_tree_learner.h @@ -7,6 +7,9 @@ #include #include #include +#ifdef USE_CUDA +#include +#endif #include #include @@ -20,10 +23,8 @@ #include "leaf_splits.hpp" #ifdef USE_CUDA - #include -#include "cuda_kernel_launcher.h" // LGBM_CUDA -#include +#include "cuda_kernel_launcher.h" // LGBM_CUDA using namespace json11; @@ -34,7 +35,7 @@ namespace LightGBM { * \brief CUDA-based parallel learning algorithm. */ class CUDATreeLearner: public SerialTreeLearner { - public: + public: explicit CUDATreeLearner(const Config* tree_config); ~CUDATreeLearner(); // LGBM_CUDA: is_use_subset is used by CUDA only @@ -53,21 +54,20 @@ class CUDATreeLearner: public SerialTreeLearner { return; } } - use_bagging_ = false; + use_bagging_ = false; } - protected: + protected: void BeforeTrain() override; bool BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) override; void FindBestSplits() override; void Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) override; void ConstructHistograms(const std::vector& is_feature_used, bool use_subtract) override; - private: + private: /*! \brief 4-byte feature tuple used by GPU kernels */ - //struct Feature4 { + // struct Feature4 { // uint8_t s[4]; - //}; - + // }; typedef float gpu_hist_t; /*! @@ -109,7 +109,7 @@ class CUDATreeLearner: public SerialTreeLearner { void GPUHistogram(data_size_t leaf_num_data, bool use_all_features); void SetThreadData(ThreadData* thread_data, int device_id, int histogram_size, - int leaf_num_data, bool use_all_features, + int leaf_num_data, bool use_all_features, int num_workgroups, int exp_workgroups_per_feature) { ThreadData* td = &thread_data[device_id]; td->device_id = device_id; @@ -171,7 +171,7 @@ class CUDATreeLearner: public SerialTreeLearner { // LGBM_CUDA v5.2 bool ConstructGPUHistogramsAsync( const std::vector& is_feature_used, - const data_size_t* data_indices, data_size_t num_data); + const data_size_t* data_indices, data_size_t num_data); /*! brief Log2 of max number of workgroups per feature*/ const int kMaxLogWorkgroupsPerFeature = 10; // 2^10 @@ -215,12 +215,12 @@ class CUDATreeLearner: public SerialTreeLearner { /*! \brief Indices of all sparse feature-groups */ std::vector sparse_feature_group_map_; /*! \brief Multipliers of all dense feature-groups, used for redistributing bins */ - //std::vector device_bin_mults_; + // std::vector device_bin_mults_; /*! \brief GPU memory object holding the training data */ - //uint8_t *device_features_; + // uint8_t *device_features_; std::vector device_features_; /*! 
\brief GPU memory object holding the ordered gradient */ - //score_t *device_gradients_; + // score_t *device_gradients_; std::vector device_gradients_; /*! \brief Pointer to pinned memory of ordered gradient */ void * ptr_pinned_gradients_ = nullptr; @@ -230,10 +230,10 @@ class CUDATreeLearner: public SerialTreeLearner { /*! \brief Pointer to pinned memory of ordered hessian */ void * ptr_pinned_hessians_ = nullptr; /*! \brief A vector of feature mask. 1 = feature used, 0 = feature not used */ - // std::vector> feature_masks_; + // std::vector> feature_masks_; std::vector feature_masks_; /*! \brief GPU memory object holding the feature masks */ - //void *device_feature_masks_; + // void *device_feature_masks_; std::vector device_feature_masks_; /*! \brief Pointer to pinned memory of feature masks */ char* ptr_pinned_feature_masks_ = nullptr; @@ -293,7 +293,7 @@ class CUDATreeLearner: public SerialTreeLearner { namespace LightGBM { class CUDATreeLearner: public SerialTreeLearner { - public: + public: #pragma warning(disable : 4702) explicit CUDATreeLearner(const Config* tree_config) : SerialTreeLearner(tree_config) { Log::Fatal("CUDA Tree Learner was not enabled in this build.\n" diff --git a/src/treelearner/feature_parallel_tree_learner.cpp b/src/treelearner/feature_parallel_tree_learner.cpp index 5a820328ddb..3dde7f0f39b 100644 --- a/src/treelearner/feature_parallel_tree_learner.cpp +++ b/src/treelearner/feature_parallel_tree_learner.cpp @@ -19,7 +19,7 @@ template FeatureParallelTreeLearner::~FeatureParallelTreeLearner() { } -template // LGBM_CUDA +template // LGBM_CUDA void FeatureParallelTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) { TREELEARNER_T::Init(train_data, is_constant_hessian, is_use_subset); // LGBM_CUDA rank_ = Network::rank(); diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 3fdcbca23a0..96882732a92 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -336,11 +336,10 @@ void SerialTreeLearner::FindBestSplits(const Tree* tree) { bool use_subtract = parent_leaf_histogram_array_ != nullptr; #ifdef USE_CUDA - if (LGBM_config_::current_learner == use_cpu_learner){ - SerialTreeLearner::ConstructHistograms(is_feature_used, use_subtract); - } - else{ - ConstructHistograms(is_feature_used, use_subtract); + if (LGBM_config_::current_learner == use_cpu_learner) { + SerialTreeLearner::ConstructHistograms(is_feature_used, use_subtract); + } else { + ConstructHistograms(is_feature_used, use_subtract); } #else ConstructHistograms(is_feature_used, use_subtract); diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 23fc75659ad..668b54592e7 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -209,7 +209,7 @@ class SerialTreeLearner: public TreeLearner { std::vector> ordered_gradients_; /*! \brief hessians of current iteration, ordered for cache optimized, aligned to 4K page */ std::vector> ordered_hessians_; -#elif USE_CUDA // LGBM_CUDA +#elif USE_CUDA // LGBM_CUDA /*! \brief gradients of current iteration, ordered for cache optimized */ std::vector> ordered_gradients_; /*! \brief hessians of current iteration, ordered for cache optimized */ From 930436c279caf5589c3c4ccbfebfac265c93344e Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Tue, 16 Jun 2020 13:54:24 +0000 Subject: [PATCH 074/119] Even more lint cleanup. 
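The launcher reindented below dispatches on bin count and data coverage: it picks the 16-, 64- or 256-bin kernel family, the _fulldata variant when leaf_num_data equals num_data, and the constant-hessian overload when hessians are uniform. The 16- and 64-bin kernels are launched as 16*num_workgroups blocks of 16 threads and 4*num_workgroups blocks of 64 threads, so each configuration keeps 256 threads per workgroup. A reduced sketch of that launch geometry, with a placeholder kernel standing in for the real histogram kernels (toy_histogram_kernel and launch_histogram are hypothetical names):

// dispatch_sketch.cu -- illustrative only; not the cuda_histogram() in this patch.
#include <cuda_runtime.h>

__global__ void toy_histogram_kernel(int num_bins) {
  // The real kernels accumulate per-feature gradient/hessian/count bins here.
  (void)num_bins;
}

void launch_histogram(int histogram_size, int num_workgroups, cudaStream_t stream) {
  // 16 bins -> 16*num_workgroups blocks of 16 threads, 64 bins -> 4*num_workgroups
  // blocks of 64, 256 bins -> num_workgroups blocks of 256: always 256 threads per
  // workgroup, only the block shape changes with the bin count.
  const int threads_per_block = histogram_size;              // 16, 64 or 256
  const int blocks = (256 / histogram_size) * num_workgroups;
  toy_histogram_kernel<<<blocks, threads_per_block, 0, stream>>>(histogram_size);
}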
--- src/boosting/gbdt.cpp | 9 +- src/io/dense_bin.hpp | 2 +- src/treelearner/cuda_kernel_launcher.cu | 109 +- src/treelearner/cuda_kernel_launcher.h | 12 +- src/treelearner/cuda_tree_learner.cpp | 143 +- src/treelearner/cuda_tree_learner.h | 28 +- .../kernels/histogram_16_64_256.cu | 1544 ++++++++--------- 7 files changed, 908 insertions(+), 939 deletions(-) diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index baeacc6beb9..b4c14a40e78 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -475,11 +475,10 @@ bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { // LGBM_CUDA: moved for overlapping data copy w/ other operations int iter_next = iter_ + 1; if (iter_next < config_->num_iterations) { + // auto start_time = std::chrono::steady_clock::now(); - // auto start_time = std::chrono::steady_clock::now(); - - // bagging logic - Bagging(iter_next); + // bagging logic + Bagging(iter_next); } } // add model @@ -958,7 +957,7 @@ void GBDT::ResetBaggingConfig(const Config* config, bool is_change_dataset) { } else { bag_data_cnt_ = num_data_; if (config_->device_type == std::string("cuda")) { // LGBM_CUDA - if (tmp_subset_ == nullptr){ + if (tmp_subset_ == nullptr) { tmp_subset_.reset(new Dataset(bag_data_cnt_)); tmp_subset_->CopyFeatureMapperFrom(train_data_); size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; diff --git a/src/io/dense_bin.hpp b/src/io/dense_bin.hpp index 0eb37043842..f0405bc318e 100644 --- a/src/io/dense_bin.hpp +++ b/src/io/dense_bin.hpp @@ -466,7 +466,7 @@ class DenseBin : public Bin { private: data_size_t num_data_; #ifdef USE_CUDA - std::vector> data_; // LGBM_CUDA + std::vector> data_; // LGBM_CUDA #else std::vector> data_; #endif diff --git a/src/treelearner/cuda_kernel_launcher.cu b/src/treelearner/cuda_kernel_launcher.cu index f8c3effeb6c..8b243200878 100644 --- a/src/treelearner/cuda_kernel_launcher.cu +++ b/src/treelearner/cuda_kernel_launcher.cu @@ -5,11 +5,9 @@ #ifdef USE_CUDA #include "cuda_kernel_launcher.h" -#include -#include #include - -using namespace LightGBM; +#include +#include void cuda_histogram( int histogram_size, @@ -31,46 +29,45 @@ void cuda_histogram( volatile int* arg8, void* arg9, size_t exp_workgroups_per_feature) { - if (histogram_size == 16) { if (leaf_num_data == num_data) { if (use_all_features) { - if (!is_constant_hessian) - histogram16<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, + if (!is_constant_hessian) + histogram16<<<16*num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram16<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, + else + histogram16<<<16*num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } else { - if (!is_constant_hessian) - histogram16_fulldata<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, + } else { + if (!is_constant_hessian) + histogram16_fulldata<<<16*num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram16_fulldata<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, + else + histogram16_fulldata<<<16*num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), 
exp_workgroups_per_feature); } } else { if (use_all_features) { // seems all features is always enabled, so this should be the same as fulldata - if (!is_constant_hessian) - histogram16<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, + if (!is_constant_hessian) + histogram16<<<16*num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram16<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, + else + histogram16<<<16*num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } else { - if (!is_constant_hessian) - histogram16<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, + if (!is_constant_hessian) + histogram16<<<16*num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram16<<<16*num_workgroups, 16, 0, stream>>>( arg0, arg1, arg2, + else + histogram16<<<16*num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } @@ -78,21 +75,21 @@ void cuda_histogram( } else if (histogram_size == 64) { if (leaf_num_data == num_data) { if (use_all_features) { - if (!is_constant_hessian) - histogram64<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, + if (!is_constant_hessian) + histogram64<<<4*num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram64<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, + else + histogram64<<<4*num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } else { - if (!is_constant_hessian) - histogram64_fulldata<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, + } else { + if (!is_constant_hessian) + histogram64_fulldata<<<4*num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram64_fulldata<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, + else + histogram64_fulldata<<<4*num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } @@ -100,20 +97,20 @@ void cuda_histogram( if (use_all_features) { // seems all features is always enabled, so this should be the same as fulldata if (!is_constant_hessian) - histogram64<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, + histogram64<<<4*num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram64<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, + else + histogram64<<<4*num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } else { - if (!is_constant_hessian) - histogram64<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, + if (!is_constant_hessian) + histogram64<<<4*num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2, reinterpret_cast(arg3), 
arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram64<<<4*num_workgroups, 64, 0, stream>>>( arg0, arg1, arg2, + else + histogram64<<<4*num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } @@ -121,21 +118,21 @@ void cuda_histogram( } else { if (leaf_num_data == num_data) { if (use_all_features) { - if (!is_constant_hessian) - histogram256<<>>( arg0, arg1, arg2, + if (!is_constant_hessian) + histogram256<<>>(arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram256<<>>( arg0, arg1, arg2, + else + histogram256<<>>(arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } else { - if (!is_constant_hessian) - histogram256_fulldata<<>>( arg0, arg1, arg2, + } else { + if (!is_constant_hessian) + histogram256_fulldata<<>>(arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram256_fulldata<<>>( arg0, arg1, arg2, + else + histogram256_fulldata<<>>(arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } @@ -143,25 +140,25 @@ void cuda_histogram( if (use_all_features) { // seems all features is always enabled, so this should be the same as fulldata if (!is_constant_hessian) - histogram256<<>>( arg0, arg1, arg2, + histogram256<<>>(arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram256<<>>( arg0, arg1, arg2, + else + histogram256<<>>(arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } else { - if (!is_constant_hessian) - histogram256<<>>( arg0, arg1, arg2, + if (!is_constant_hessian) + histogram256<<>>(arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram256<<>>( arg0, arg1, arg2, + else + histogram256<<>>(arg0, arg1, arg2, reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } } } } - -#endif // USE_CUDA + +#endif // USE_CUDA diff --git a/src/treelearner/cuda_kernel_launcher.h b/src/treelearner/cuda_kernel_launcher.h index efe8e4b0d4a..57c5f1bfc26 100644 --- a/src/treelearner/cuda_kernel_launcher.h +++ b/src/treelearner/cuda_kernel_launcher.h @@ -7,8 +7,8 @@ #ifdef USE_CUDA // what should I include?? 
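// ThreadData (declared below) bundles everything one host thread needs to run the
// histogram kernel on a single GPU: the device id and stream, the leaf/row counts,
// the device-side feature, gradient, hessian and output buffers, the sync counters,
// the timing/wait events, and exp_workgroups_per_feature. GPUHistogram() in
// cuda_tree_learner.cpp fills one ThreadData per device and spawns one pthread per
// GPU, each of which hands its ThreadData to cuda_histogram() on that device's stream.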
-#include "kernels/histogram_16_64_256.hu" // kernel, acc_type, data_size_t, uchar, score_t #include +#include "kernels/histogram_16_64_256.hu" // kernel, acc_type, data_size_t, uchar, score_t struct ThreadData { // device id @@ -46,10 +46,10 @@ struct ThreadData { void cuda_histogram( int histogram_size, - data_size_t leaf_num_data, + data_size_t leaf_num_data, data_size_t num_data, - bool use_all_features, - bool is_constant_hessian, + bool use_all_features, + bool is_constant_hessian, int num_workgroups, cudaStream_t stream, uint8_t* arg0, @@ -66,5 +66,5 @@ void cuda_histogram( size_t exp_workgroups_per_feature); -#endif // USE_CUDA -#endif // LGBM_KERNEL_LAUNCHER +#endif // USE_CUDA +#endif // LGBM_KERNEL_LAUNCHER diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index b61cddf4d3e..8d59e98f67f 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -5,19 +5,19 @@ #ifdef USE_CUDA #include "cuda_tree_learner.h" -#include -#include - -#include - -#include "../io/dense_bin.hpp" - #include #include #include +#include + #include +#include +#include + +#include "../io/dense_bin.hpp" + #define cudaMemcpy_DEBUG 0 // 1: DEBUG cudaMemcpy #define ResetTrainingData_DEBUG 0 // 1: Debug ResetTrainingData @@ -76,9 +76,9 @@ void CUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, // some additional variables needed for GPU trainer num_feature_groups_ = train_data_->num_feature_groups(); - + // LGBM_CUDA: use subset of training data for bagging - is_use_subset_ = is_use_subset; + is_use_subset_ = is_use_subset; // Initialize GPU buffers and kernels & LGBM_CUDA: get device info InitGPU(config_->num_gpu); // LGBM_CUDA @@ -121,8 +121,8 @@ int CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int d ++retval; } if (const_flag) { - ai = GET_HESS(((int64 *) h1), i); - bi = GET_HESS(((int64 *) h2), i); + ai = GET_HESS((reinterpret_cast(h1), i); + bi = GET_HESS((reinterpret_cast(h2), i); if (ai != bi) { printf("i = %5d, h1.hess %lld, h2.hess %lld\n", i, ai, bi); ++retval; @@ -174,13 +174,13 @@ int CUDATreeLearner::GetNumWorkgroupsPerFeature(data_size_t leaf_num_data) { double x = 256.0 / num_dense_feature_groups_; - int exp_workgroups_per_feature = (int)ceil(log2(x)); + int exp_workgroups_per_feature = static_cast(ceil(log2(x))); double t = leaf_num_data / 1024.0; Log::Debug("We can have at most %d workgroups per feature4 for efficiency reasons" "Best workgroup size per feature for full utilization is %d\n", static_cast(ceil(t)), (1 << exp_workgroups_per_feature)); - exp_workgroups_per_feature = std::min(exp_workgroups_per_feature, static_cast(ceil(log((double)t)/log(2.0)))); + exp_workgroups_per_feature = std::min(exp_workgroups_per_feature, static_cast(ceil(log(static_cast(t))/log(2.0)))); if (exp_workgroups_per_feature < 0) exp_workgroups_per_feature = 0; if (exp_workgroups_per_feature > kMaxLogWorkgroupsPerFeature) @@ -324,7 +324,7 @@ void CUDATreeLearner::prevAllocateGPUMemory() { cudaPointerAttributes attributes; cudaPointerGetAttributes(&attributes, feature_masks_.data()); - if ((attributes.type == cudaMemoryTypeHost) && (attributes.devicePointer != NULL)) { + if ((attributes.type == cudaMemoryTypeHost) && (attributes.devicePointer != NULL)) { CUDASUCCESS_OR_FATAL(cudaHostUnregister(feature_masks_.data())); } } @@ -343,7 +343,7 @@ void CUDATreeLearner::prevAllocateGPUMemory() { // host_size histogram outputs // host_histogram_outputs_ = malloc(num_dense_feature_groups_ * 
device_bin_size_ * hist_bin_entry_sz_); - CUDASUCCESS_OR_FATAL(cudaHostAlloc( (void **)&host_histogram_outputs_, (size_t)(num_dense_feature_groups_ * device_bin_size_ * hist_bin_entry_sz_),cudaHostAllocPortable)); + CUDASUCCESS_OR_FATAL(cudaHostAlloc(reinterpret_cast(&host_histogram_outputs_), (size_t)(num_dense_feature_groups_ * device_bin_size_ * hist_bin_entry_sz_), cudaHostAllocPortable)); // LGBM_CUDA nthreads_ = std::min(omp_get_max_threads(), num_dense_feature_groups_ / dword_features_); @@ -352,7 +352,6 @@ void CUDATreeLearner::prevAllocateGPUMemory() { // LGBM_CUDA: allocate GPU memory for each GPU void CUDATreeLearner::AllocateGPUMemory() { - #pragma omp parallel for schedule(static, num_gpu_) for (int device_id = 0; device_id < num_gpu_; ++device_id) { @@ -392,7 +391,7 @@ void CUDATreeLearner::AllocateGPUMemory() { // copy indices to the device if (device_data_indices_[device_id] != NULL) { - CUDASUCCESS_OR_FATAL(cudaFree(device_data_indices_[device_id])); + CUDASUCCESS_OR_FATAL(cudaFree(device_data_indices_[device_id])); } CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_data_indices_[device_id]), (size_t) allocated_num_data_ * sizeof(data_size_t))); @@ -404,17 +403,15 @@ void CUDATreeLearner::AllocateGPUMemory() { // each work group generates a sub-histogram of dword_features_ features. if (!device_subhistograms_[device_id]) { - - // only initialize once here, as this will not need to change when ResetTrainingData() is called + // only initialize once here, as this will not need to change when ResetTrainingData() is called CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_subhistograms_[device_id]), (size_t) preallocd_max_num_wg_[device_id] * dword_features_ * device_bin_size_ * (3 * hist_bin_entry_sz_ / 2))); Log::Debug("created device_subhistograms_: %p", device_subhistograms_[device_id]); - } // create atomic counters for inter-group coordination CUDASUCCESS_OR_FATAL(cudaFree(sync_counters_[device_id])); - CUDASUCCESS_OR_FATAL(cudaMalloc(&(sync_counters_[device_id]), (size_t) num_gpu_feature_groups * sizeof(int))); + CUDASUCCESS_OR_FATAL(cudaMalloc(&(sync_counters_[device_id]), (size_t) num_gpu_feature_groups * sizeof(int))); CUDASUCCESS_OR_FATAL(cudaMemsetAsync(sync_counters_[device_id], 0, num_gpu_feature_groups * sizeof(int), stream_[device_id])); // The output buffer is allocated to host directly, to overlap compute and data transfer @@ -425,7 +422,6 @@ void CUDATreeLearner::AllocateGPUMemory() { } void CUDATreeLearner::ResetGPUMemory() { - // clear sparse/dense maps dense_feature_group_map_.clear(); sparse_feature_group_map_.clear(); @@ -433,17 +429,16 @@ void CUDATreeLearner::ResetGPUMemory() { // LGBM_CUDA void CUDATreeLearner::copyDenseFeature() { - if (num_feature_groups_ == 0) { - LGBM_config_::current_learner = use_cpu_learner; - return; + LGBM_config_::current_learner = use_cpu_learner; + return; } // auto start_time = std::chrono::steady_clock::now(); Log::Debug("Started copying dense features from CPU to GPU"); // find the dense feature-groups and group then into Feature4 data structure (several feature-groups packed into 4 bytes) size_t copied_feature = 0; - // set device info + // set device info int device_id = 0; uint8_t* device_features = device_features_[device_id]; CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); @@ -466,7 +461,7 @@ void CUDATreeLearner::copyDenseFeature() { copied_feature = 0; if (device_id < num_gpu_) { device_features = device_features_[device_id]; - CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); + CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); } } 
} else { @@ -481,8 +476,7 @@ void CUDATreeLearner::copyDenseFeature() { // LGBM_CUDA: InitGPU w/ num_gpu -void CUDATreeLearner::InitGPU(int num_gpu) { - +void CUDATreeLearner::InitGPU(int num_gpu) { // Get the max bin size, used for selecting best GPU kernel max_num_bin_ = 0; @@ -511,7 +505,7 @@ void CUDATreeLearner::InitGPU(int num_gpu) { device_bin_size_ = 64; // LGBM_CUDA histogram_size_ = 64; dword_features_ = 1; // LGBM_CUDA - } else if ( max_num_bin_ <= 256) { + } else if (max_num_bin_ <= 256) { Log::Debug("device_bin_size_ = 256"); device_bin_size_ = 256; histogram_size_ = 256; @@ -530,7 +524,7 @@ void CUDATreeLearner::InitGPU(int num_gpu) { CountDenseFeatureGroups(); if (num_gpu > num_dense_feature_groups_) num_gpu = num_dense_feature_groups_; - + // LGBM_CUDA: initialize GPU int gpu_count; @@ -538,9 +532,9 @@ void CUDATreeLearner::InitGPU(int num_gpu) { num_gpu_ = (gpu_count < num_gpu)? gpu_count : num_gpu; // LGBM_CUDA: set cpu threads - cpu_threads_ = (pthread_t **)malloc(sizeof(pthread_t *)*num_gpu_); + cpu_threads_ = reinterpret_cast(malloc(sizeof(pthread_t *)*num_gpu_)); for (int device_id = 0; device_id < num_gpu_; ++device_id) { - cpu_threads_[device_id] = (pthread_t *)malloc(sizeof(pthread_t)); + cpu_threads_[device_id] = reinterpret_cast(malloc(sizeof(pthread_t))); } // LGBM_CUDA: resize device memory pointers @@ -552,7 +546,7 @@ void CUDATreeLearner::InitGPU(int num_gpu) { sync_counters_.resize(num_gpu_); device_subhistograms_.resize(num_gpu_); device_histogram_outputs_.resize(num_gpu_); - + // LGBM_CUDA: create stream & events to handle multiple GPUs preallocd_max_num_wg_.resize(num_gpu_, 1024); stream_.resize(num_gpu_); @@ -567,7 +561,7 @@ void CUDATreeLearner::InitGPU(int num_gpu) { // for debuging kernel_time_.resize(num_gpu_, 0); kernel_input_wait_time_.resize(num_gpu_, std::chrono::milliseconds(0)); - //kernel_output_wait_time_.resize(num_gpu_, std::chrono::milliseconds(0)); + // kernel_output_wait_time_.resize(num_gpu_, std::chrono::milliseconds(0)); for (int i = 0; i < num_gpu_; ++i) { CUDASUCCESS_OR_FATAL(cudaSetDevice(i)); @@ -596,7 +590,6 @@ void CUDATreeLearner::InitGPU(int num_gpu) { Tree* CUDATreeLearner::Train(const score_t* gradients, const score_t *hessians, bool is_constant_hessian, const Json& forced_split_json) { - // check if we need to recompile the GPU kernel (is_constant_hessian changed) // this should rarely occur @@ -611,21 +604,20 @@ Tree* CUDATreeLearner::Train(const score_t* gradients, const score_t *hessians, } void CUDATreeLearner::ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) { - // LGBM_CUDA: check data size - data_size_t old_allocated_num_data = allocated_num_data_; + data_size_t old_allocated_num_data = allocated_num_data_; SerialTreeLearner::ResetTrainingDataInner(train_data, is_constant_hessian, reset_multi_val_bin); - #if ResetTrainingData_DEBUG == 1 // LGBM_CUDA - serial_time = std::chrono::steady_clock::now() - start_serial_time; + #if ResetTrainingData_DEBUG == 1 // LGBM_CUDA + serial_time = std::chrono::steady_clock::now() - start_serial_time; #endif num_feature_groups_ = train_data_->num_feature_groups(); // GPU memory has to been reallocated because data may have been changed - #if ResetTrainingData_DEBUG == 1 // LGBM_CUDA + #if ResetTrainingData_DEBUG == 1 // LGBM_CUDA auto start_alloc_gpu_time = std::chrono::steady_clock::now(); #endif @@ -642,29 +634,28 @@ void CUDATreeLearner::ResetTrainingDataInner(const Dataset* train_data, bool is_ copyDenseFeature(); - #if 
ResetTrainingData_DEBUG == 1 // LGBM_CUDA + #if ResetTrainingData_DEBUG == 1 // LGBM_CUDA alloc_gpu_time = std::chrono::steady_clock::now() - start_alloc_gpu_time; #endif // setup GPU kernel arguments after we allocating all the buffers - #if ResetTrainingData_DEBUG == 1 // LGBM_CUDA + #if ResetTrainingData_DEBUG == 1 // LGBM_CUDA auto start_set_arg_time = std::chrono::steady_clock::now(); #endif - #if ResetTrainingData_DEBUG == 1 // LGBM_CUDA + #if ResetTrainingData_DEBUG == 1 // LGBM_CUDA set_arg_time = std::chrono::steady_clock::now() - start_set_arg_time; reset_training_data_time = std::chrono::steady_clock::now() - start_reset_training_data_time; Log::Info("reset_training_data_time: %f secs.", reset_training_data_time.count() * 1e-3); Log::Info("serial_time: %f secs.", serial_time.count() * 1e-3); Log::Info("alloc_gpu_time: %f secs.", alloc_gpu_time.count() * 1e-3); - Log::Info("set_arg_time: %f secs.", set_arg_time.count() * 1e-3); + Log::Info("set_arg_time: %f secs.", set_arg_time.count() * 1e-3); #endif } void CUDATreeLearner::BeforeTrain() { - - #if cudaMemcpy_DEBUG == 1 // LGBM_CUDA + #if cudaMemcpy_DEBUG == 1 // LGBM_CUDA std::chrono::duration device_hessians_time = std::chrono::milliseconds(0); std::chrono::duration device_gradients_time = std::chrono::milliseconds(0); #endif @@ -674,13 +665,12 @@ void CUDATreeLearner::BeforeTrain() { #if GPU_DEBUG >= 2 printf("CUDATreeLearner::BeforeTrain() Copying initial full gradients and hessians to device\n"); #endif - + // Copy initial full hessians and gradients to GPU. // We start copying as early as possible, instead of at ConstructHistogram(). if ((hessians_ != NULL) && (gradients_ != NULL)) { if (!use_bagging_ && num_dense_feature_groups_) { - Log::Debug("CudaTreeLearner::BeforeTrain() No baggings, dense_feature_groups_=%d", num_dense_feature_groups_); for (int device_id = 0; device_id < num_gpu_; ++device_id) { @@ -728,7 +718,6 @@ void CUDATreeLearner::BeforeTrain() { // use bagging if ((hessians_ != NULL) && (gradients_ != NULL)) { if (data_partition_->leaf_count(0) != num_data_ && num_dense_feature_groups_) { - // On GPU, we start copying indices, gradients and hessians now, instead at ConstructHistogram() // copy used gradients and hessians to ordered buffer @@ -737,18 +726,15 @@ void CUDATreeLearner::BeforeTrain() { // transfer the indices to GPU for (int device_id = 0; device_id < num_gpu_; ++device_id) { - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_data_indices_[device_id], indices, cnt * sizeof(*indices), cudaMemcpyHostToDevice, stream_[device_id])); CUDASUCCESS_OR_FATAL(cudaEventRecord(indices_future_[device_id], stream_[device_id])); if (!is_constant_hessian_) { - - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_hessians_[device_id], (void *) &(hessians_[0]), num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_hessians_[device_id], const_cast(reinterpret_cast(&(hessians_[0]))), num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); CUDASUCCESS_OR_FATAL(cudaEventRecord(hessians_future_[device_id], stream_[device_id])); - } - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_gradients_[device_id], (void *) &(gradients_[0]), num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_gradients_[device_id], const_cast(reinterpret_cast(&(gradients_[0]))), num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); 
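        // Each cudaMemcpyAsync in this loop is paired with a cudaEventRecord on the
        // same stream (indices_future_, hessians_future_, and gradients_future_ just
        // below). The events mark "data is resident on this device", so the histogram
        // launch path can wait on them later while the host keeps queueing copies for
        // the remaining GPUs; this is what lets the transfers started in BeforeTrain()
        // overlap with the rest of the iteration instead of blocking here.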
CUDASUCCESS_OR_FATAL(cudaEventRecord(gradients_future_[device_id], stream_[device_id])); } } @@ -756,7 +742,6 @@ void CUDATreeLearner::BeforeTrain() { } bool CUDATreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) { - int smaller_leaf; data_size_t num_data_in_left_child = GetGlobalDataCountInLeaf(left_leaf); @@ -798,7 +783,6 @@ bool CUDATreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int r bool CUDATreeLearner::ConstructGPUHistogramsAsync( const std::vector& is_feature_used, const data_size_t* data_indices, data_size_t num_data) { - if (num_data <= 0) { return false; } @@ -808,35 +792,32 @@ bool CUDATreeLearner::ConstructGPUHistogramsAsync( Log::Debug("no dense feature groups, returning"); return false; } - + // copy data indices if it is not null if (data_indices != nullptr && num_data != num_data_) { - for (int device_id = 0; device_id < num_gpu_; ++device_id) { - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_data_indices_[device_id], data_indices, num_data * sizeof(data_size_t), cudaMemcpyHostToDevice, stream_[device_id])); CUDASUCCESS_OR_FATAL(cudaEventRecord(indices_future_[device_id], stream_[device_id])); - } } // converted indices in is_feature_used to feature-group indices std::vector is_feature_group_used(num_feature_groups_, 0); - #pragma omp parallel for schedule(static,1024) if (num_features_ >= 2048) + #pragma omp parallel for schedule(static, 1024) if (num_features_ >= 2048) for (int i = 0; i < num_features_; ++i) { - if (is_feature_used[i]) { + if (is_feature_used[i]) { int feature_group = train_data_->Feature2Group(i); // LGBM_CUDA - is_feature_group_used[feature_group] = (train_data_->FeatureGroupNumBin(feature_group)<=16)? 2 : 1; // LGBM_CUDA + is_feature_group_used[feature_group] = (train_data_->FeatureGroupNumBin(feature_group) <= 16) ? 
2 : 1; // LGBM_CUDA } } // construct the feature masks for dense feature-groups int used_dense_feature_groups = 0; - #pragma omp parallel for schedule(static,1024) reduction(+:used_dense_feature_groups) if (num_dense_feature_groups_ >= 2048) + #pragma omp parallel for schedule(static, 1024) reduction(+:used_dense_feature_groups) if (num_dense_feature_groups_ >= 2048) for (int i = 0; i < num_dense_feature_groups_; ++i) { if (is_feature_group_used[dense_feature_group_map_[i]]) { - //feature_masks_[i] = 1; + // feature_masks_[i] = 1; feature_masks_[i] = is_feature_group_used[dense_feature_group_map_[i]]; ++used_dense_feature_groups; } else { @@ -869,17 +850,15 @@ bool CUDATreeLearner::ConstructGPUHistogramsAsync( } void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_used, bool use_subtract) { - // LGBM_CUDA -// auto start_time = std::chrono::steady_clock::now(); + // auto start_time = std::chrono::steady_clock::now(); std::vector is_sparse_feature_used(num_features_, 0); std::vector is_dense_feature_used(num_features_, 0); - int num_dense_features=0, num_sparse_features=0; + int num_dense_features = 0, num_sparse_features = 0; #pragma omp parallel for schedule(static) for (int feature_index = 0; feature_index < num_features_; ++feature_index) { - if (!col_sampler_.is_feature_used_bytree()[feature_index]) continue; if (!is_feature_used[feature_index]) continue; if (train_data_->IsMultiGroup(train_data_->Feature2Group(feature_index))) { @@ -892,7 +871,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ } // construct smaller leaf - hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - kHistOffset; + hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - kHistOffset; // Check workgroups per feature4 tuple.. 
int exp_workgroups_per_feature = GetNumWorkgroupsPerFeature(smaller_leaf_splits_->num_data_in_leaf()); @@ -931,7 +910,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ // Compare GPU histogram with CPU histogram, useful for debuggin GPU code problem // #define GPU_DEBUG_COMPARE #ifdef GPU_DEBUG_COMPARE - printf("Start Comparing_Histogram between GPU and CPU, num_dense_feature_groups_ = %d\n",num_dense_feature_groups_); + printf("Start Comparing_Histogram between GPU and CPU, num_dense_feature_groups_ = %d\n", num_dense_feature_groups_); bool compare = true; for (int i = 0; i < num_dense_feature_groups_; ++i) { if (!feature_masks_[i]) @@ -948,8 +927,8 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ if (train_data_->FeatureGroupBin(dense_feature_group_index) == nullptr) { continue; } - if ( num_data == num_data_ ) { - if ( is_constant_hessian_ ) { + if (num_data == num_data_) { + if (is_constant_hessian_) { printf("ConstructHistogram(): num_data == num_data_ is_constant_hessian_\n"); train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( 0, @@ -965,7 +944,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ current_histogram); } } else { - if ( is_constant_hessian_ ) { + if (is_constant_hessian_) { printf("ConstructHistogram(): is_constant_hessian_\n"); train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( smaller_leaf_splits_->data_indices(), @@ -973,7 +952,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ num_data, gradients_, current_histogram); - } else { + } else { printf("ConstructHistogram(): 4, num_data = %d, num_data_ = %d\n", num_data, num_data_); train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( smaller_leaf_splits_->data_indices(), @@ -984,7 +963,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ } } int retval; - if ( (num_data != num_data_) && compare ) { + if ((num_data != num_data_) && compare) { retval = CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index, config_->gpu_use_dp, is_constant_hessian_); printf("CompareHistograms reports %d errors\n", retval); compare = false; @@ -997,7 +976,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ } std::copy(gpu_histogram, gpu_histogram + size * 2, current_histogram); delete [] gpu_histogram; - //break; // LGBM_CUDA: see only first feature info + // break; // LGBM_CUDA: see only first feature info } printf("End Comparing Histogram between GPU and CPU\n"); fflush(stderr); @@ -1006,7 +985,6 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ #endif if (larger_leaf_histogram_array_ != nullptr && !use_subtract) { - // construct larger leaf hist_t* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - kHistOffset; @@ -1018,7 +996,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ // We set data_indices to null to avoid rebuilding ordered gradients/hessians if (num_sparse_features > 0) { - //train_data_->ConstructHistograms(is_sparse_feature_used, + // train_data_->ConstructHistograms(is_sparse_feature_used, // nullptr, larger_leaf_splits_->num_data_in_leaf(), // larger_leaf_splits_->leaf_index(), // ordered_bins_, gradients_, hessians_, @@ -1047,7 +1025,6 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ } void CUDATreeLearner::FindBestSplits() { - SerialTreeLearner::FindBestSplits(); 
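  // The GPU_DEBUG >= 3 block below dumps the raw histogram of every splittable
  // feature for the smaller leaf (and, when present, the larger leaf) right after
  // the split search, which is useful when chasing mismatches reported by
  // CompareHistograms().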
#if GPU_DEBUG >= 3 @@ -1058,7 +1035,7 @@ void CUDATreeLearner::FindBestSplits() { smaller_leaf_histogram_array_[feature_index].set_is_splittable(false); continue; } - size_t bin_size = train_data_->FeatureNumBin(feature_index) + 1; + size_t bin_size = train_data_->FeatureNumBin(feature_index) + 1; printf("CUDATreeLearner::FindBestSplits() Feature %d bin_size=%zd smaller leaf:\n", feature_index, bin_size); PrintHistograms(smaller_leaf_histogram_array_[feature_index].RawData() - kHistOffset, bin_size); if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->leaf_index() < 0) { continue; } @@ -1093,4 +1070,4 @@ void CUDATreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* righ } // namespace LightGBM #undef cudaMemcpy_DEBUG -#endif // USE_CUDA +#endif // USE_CUDA diff --git a/src/treelearner/cuda_tree_learner.h b/src/treelearner/cuda_tree_learner.h index 0375239049d..cd7413d3a43 100644 --- a/src/treelearner/cuda_tree_learner.h +++ b/src/treelearner/cuda_tree_learner.h @@ -1,3 +1,7 @@ +/*! + * Copyright (c) 2020 IBM Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + */ #pragma once #ifndef LGBM_TREELEARNER_CUDA_TREE_LEARNER_H_ #define LGBM_TREELEARNET_CUDA_TREE_LEARNER_H_ @@ -27,7 +31,7 @@ #include "cuda_kernel_launcher.h" // LGBM_CUDA -using namespace json11; +using json11::Json; namespace LightGBM { @@ -63,6 +67,7 @@ class CUDATreeLearner: public SerialTreeLearner { void FindBestSplits() override; void Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) override; void ConstructHistograms(const std::vector& is_feature_used, bool use_subtract) override; + private: /*! \brief 4-byte feature tuple used by GPU kernels */ // struct Feature4 { @@ -76,7 +81,7 @@ class CUDATreeLearner: public SerialTreeLearner { * \return Log2 of the best number for workgroups per feature, in range 0...kMaxLogWorkgroupsPerFeature */ int GetNumWorkgroupsPerFeature(data_size_t leaf_num_data); - + /*! 
* \brief Initialize GPU device * \LGBM_CUDA: param num_gpu: number of maximum gpus @@ -107,7 +112,7 @@ class CUDATreeLearner: public SerialTreeLearner { * \param use_all_features Set to true to not use feature masks, with a faster kernel */ void GPUHistogram(data_size_t leaf_num_data, bool use_all_features); - + void SetThreadData(ThreadData* thread_data, int device_id, int histogram_size, int leaf_num_data, bool use_all_features, int num_workgroups, int exp_workgroups_per_feature) { @@ -130,7 +135,7 @@ class CUDATreeLearner: public SerialTreeLearner { td->sync_counters = sync_counters_[device_id]; td->device_histogram_outputs = device_histogram_outputs_[device_id]; td->exp_workgroups_per_feature = exp_workgroups_per_feature; - + td->kernel_start = &(kernel_start_[device_id]); td->kernel_wait_obj = &(kernel_wait_obj_[device_id]); td->kernel_input_wait_time = &(kernel_input_wait_time_[device_id]); @@ -138,14 +143,14 @@ class CUDATreeLearner: public SerialTreeLearner { size_t output_size = num_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_; size_t host_output_offset = offset_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_; td->output_size = output_size; - td->host_histogram_output = (char*)host_histogram_outputs_ + host_output_offset; + td->host_histogram_output = reinterpret_cast(host_histogram_outputs_) + host_output_offset; td->histograms_wait_obj = &(histograms_wait_obj_[device_id]); } // LGBM_CUDA: thread work // typedef void * (*THREADFUNCPTR)(void *); // void* launch_gpu_kernel(void *td); - + /*! * \brief Wait for GPU kernel execution and read histogram * \param histograms Destination of histogram results from GPU. @@ -285,13 +290,14 @@ class CUDATreeLearner: public SerialTreeLearner { int allocated_num_data_; // allocated data instances pthread_t **cpu_threads_; // pthread, 1 cpu thread / gpu }; + } // namespace LightGBM -#else // USE_CUDA +#else // USE_CUDA // When GPU support is not compiled in, quit with an error message namespace LightGBM { - + class CUDATreeLearner: public SerialTreeLearner { public: #pragma warning(disable : 4702) @@ -301,7 +307,7 @@ class CUDATreeLearner: public SerialTreeLearner { } }; -} +} // namespace LightGBM -#endif // USE_CUDA -#endif // LGBM_TREELEARNER_CUDA_TREE_LEARNER_H_ +#endif // USE_CUDA +#endif // LGBM_TREELEARNER_CUDA_TREE_LEARNER_H_ diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index 7831159160b..64b2405a592 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -21,8 +21,7 @@ if (b == gtid && t == ltid) { \ } // atomic add for float number in local memory -inline __device__ void atomic_local_add_f(acc_type *addr, const acc_type val) -{ +inline __device__ void atomic_local_add_f(acc_type *addr, const acc_type val) { atomicAdd(addr, static_cast(val)); } @@ -31,13 +30,13 @@ inline __device__ void atomic_local_add_f(acc_type *addr, const acc_type val) #ifdef IGNORE_INDICES #define KERNEL_NAME histogram16_fulldata #else // IGNORE_INDICES -#define KERNEL_NAME histogram16 // seems like ENABLE_ALL_FEATURES is set to 1 in the header if its disabled -//#define KERNEL_NAME histogram16_allfeats -#endif // IGNORE_INDICES -#else // ENABLE_ALL_FEATURES +#define KERNEL_NAME histogram16 // seems like ENABLE_ALL_FEATURES is set to 1 in the header if its disabled +// #define KERNEL_NAME histogram16_allfeats +#endif // IGNORE_INDICES +#else // 
ENABLE_ALL_FEATURES #error "ENABLE_ALL_FEATURES should always be 1" #define KERNEL_NAME histogram16 -#endif // ENABLE_ALL_FEATURES +#endif // ENABLE_ALL_FEATURES #define NUM_BINS 16 #define LOCAL_MEM_SIZE ((sizeof(uint) + 2 * sizeof(acc_type)) * NUM_BINS) @@ -51,10 +50,10 @@ inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ acc_type* __restrict__ local_hist, const size_t power_feature_workgroups) { const ushort ltid = threadIdx.x; - // TODO: try to avoid bank conflict here + // TODO(anyone): try to avoid bank conflict here acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; - uint* __restrict__ local_cnt = (uint *)(local_hist + 2 * NUM_BINS); + uint* __restrict__ local_cnt = reinterpret_cast(local_hist + 2 * NUM_BINS); uint cont_bin; if (power_feature_workgroups != 0) { @@ -74,7 +73,7 @@ inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ } // skip the counters we already have - p += 3 * NUM_BINS; + p += 3 * NUM_BINS; for (i = i + 1; i < num_sub_hist; ++i) { grad_bin += *p; p += NUM_BINS; @@ -94,12 +93,12 @@ inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ } #if USE_CONSTANT_BUF == 1 -__kernel void KERNEL_NAME(__global const uchar* restrict feature_data_base, +__kernel void KERNEL_NAME(__global const uchar* restrict feature_data_base, __constant const uchar* restrict feature_masks __attribute__((max_constant_size(65536))), const data_size_t feature_size, - __constant const data_size_t* restrict data_indices __attribute__((max_constant_size(65536))), - const data_size_t num_data, - __constant const score_t* restrict ordered_gradients __attribute__((max_constant_size(65536))), + __constant const data_size_t* restrict data_indices __attribute__((max_constant_size(65536))), + const data_size_t num_data, + __constant const score_t* restrict ordered_gradients __attribute__((max_constant_size(65536))), #if CONST_HESSIAN == 0 __constant const score_t* restrict ordered_hessians __attribute__((max_constant_size(65536))), #else @@ -110,269 +109,266 @@ __kernel void KERNEL_NAME(__global const uchar* restrict feature_data_base, acc_type* __restrict__ hist_buf_base, const size_t power_feature_workgroups) { #else -__global__ void KERNEL_NAME(const uchar* feature_data_base, +__global__ void KERNEL_NAME(const uchar* feature_data_base, // FIXME: how to handle this __constant const uchar* __restrict__ feature_masks, const data_size_t feature_size, - const data_size_t* data_indices, - const data_size_t num_data, - const score_t* ordered_gradients, + const data_size_t* data_indices, + const data_size_t num_data, + const score_t* ordered_gradients, #if CONST_HESSIAN == 0 const score_t* ordered_hessians, #else const score_t const_hessian, #endif - char* __restrict__ output_buf, + char* __restrict__ output_buf, volatile int * sync_counters, acc_type* __restrict__ hist_buf_base, const size_t power_feature_workgroups) { #endif - // allocate the local memory array aligned with float2, to guarantee correct alignment on NVIDIA platforms - // otherwise a "Misaligned Address" exception may occur - __shared__ float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)]; - const uint gtid = blockIdx.x * blockDim.x + threadIdx.x; - const ushort ltid = threadIdx.x; - const ushort lsize = NUM_BINS; // get_local_size(0); - const ushort group_id = blockIdx.x; - - // local memory per workgroup is 3 KB - // clear local memory - uint *ptr = (uint *) shared_array; - for (int i = ltid; i < LOCAL_MEM_SIZE/sizeof(uint); 
i += lsize) { - ptr[i] = 0; - } - __syncthreads(); - // gradient/hessian histograms - // assume this starts at 32 * 4 = 128-byte boundary // LGBM_CUDA: What does it mean? boundary?? - // total size: 2 * 256 * size_of(float) = 2 KB - // organization: each feature/grad/hessian is at a different bank, - // as indepedent of the feature value as possible - acc_type *gh_hist = (acc_type *)shared_array; - - // counter histogram - // total size: 256 * size_of(uint) = 1 KB - uint *cnt_hist = (uint *)(gh_hist + 2 * NUM_BINS); - - // odd threads (1, 3, ...) compute histograms for hessians first - // even thread (0, 2, ...) compute histograms for gradients first - // etc. - uchar is_hessian_first = ltid & 1; - - ushort feature_id = group_id >> power_feature_workgroups; - - // each 2^POWER_FEATURE_WORKGROUPS workgroups process on one feature (compile-time constant) - // feature_size is the number of examples per feature - const uchar *feature_data = feature_data_base + feature_id * feature_size; - - // size of threads that process this feature4 - const uint subglobal_size = lsize * (1 << power_feature_workgroups); - - // equavalent thread ID in this subgroup for this feature4 - const uint subglobal_tid = gtid - feature_id * subglobal_size; - - - data_size_t ind; - data_size_t ind_next; - #ifdef IGNORE_INDICES - ind = subglobal_tid; - #else - ind = data_indices[subglobal_tid]; - #endif - - // extract feature mask, when a byte is set to 0, that feature is disabled - uchar feature_mask = feature_masks[feature_id]; - // exit if the feature is masked - if (!feature_mask) { - return; - } else { - feature_mask = feature_mask - 1; // LGBM_CUDA: feature_mask is used for get feature (1: 4bit feature, 0: 8bit feature) - } - - // STAGE 1: read feature data, and gradient and hessian - // first half of the threads read feature data from global memory - // We will prefetch data into the "next" variable at the beginning of each iteration - uchar feature; - uchar feature_next; - //uint8_t bin; - ushort bin; - - feature = feature_data[ind >> feature_mask]; - if (feature_mask) { - feature = (feature >> ((ind & 1) << 2)) & 0xf; - } - bin = feature; - acc_type grad_bin = 0.0f, hess_bin = 0.0f; - acc_type *addr_bin; - - // store gradient and hessian - score_t grad, hess; - score_t grad_next, hess_next; - // LGBM_CUDA v5.1 - grad = ordered_gradients[ind]; - #if CONST_HESSIAN == 0 - hess = ordered_hessians[ind]; - #endif - - - // there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4 - for (uint i = subglobal_tid; i < num_data; i += subglobal_size) { - // prefetch the next iteration variables - // we don't need bondary check because we have made the buffer large - int i_next = i + subglobal_size; - #ifdef IGNORE_INDICES - // we need to check to bounds here - ind_next = i_next < num_data ? i_next : i; - #else - ind_next = data_indices[i_next]; - #endif - - // imbGBT v5.1 - grad_next = ordered_gradients[ind_next]; - #if CONST_HESSIAN == 0 - hess_next = ordered_hessians[ind_next]; - #endif - - // STAGE 2: accumulate gradient and hessian - if (bin != feature) { - addr_bin = gh_hist + bin * 2 + is_hessian_first; - #if CONST_HESSIAN == 0 - acc_type acc_bin = is_hessian_first? hess_bin : grad_bin; - atomic_local_add_f(addr_bin, acc_bin); - - addr_bin = addr_bin + 1 - 2 * is_hessian_first; - acc_bin = is_hessian_first? 
grad_bin : hess_bin; - atomic_local_add_f(addr_bin, acc_bin); - - #elif CONST_HESSIAN == 1 - atomic_local_add_f(addr_bin, grad_bin); - #endif - - bin = feature; - grad_bin = grad; - hess_bin = hess; - } - else { - grad_bin += grad; - hess_bin += hess; - } - - // prefetch the next iteration variables - feature_next = feature_data[ind_next >> feature_mask]; - - // STAGE 3: accumulate counter - atomicAdd(cnt_hist + feature, 1); - - // STAGE 4: update next stat - grad = grad_next; - hess = hess_next; - // LGBM_CUDA: v4.2 - if (!feature_mask) { - feature = feature_next; - } else { - feature = (feature_next >> ((ind_next & 1) << 2)) & 0xf; - } - } - - - addr_bin = gh_hist + bin * 2 + is_hessian_first; - #if CONST_HESSIAN == 0 - acc_type acc_bin = is_hessian_first? hess_bin : grad_bin; - atomic_local_add_f(addr_bin, acc_bin); - - addr_bin = addr_bin + 1 - 2 * is_hessian_first; - acc_bin = is_hessian_first? grad_bin : hess_bin; - atomic_local_add_f(addr_bin, acc_bin); - - #elif CONST_HESSIAN == 1 - atomic_local_add_f(addr_bin, grad_bin); - #endif - __syncthreads(); + // allocate the local memory array aligned with float2, to guarantee correct alignment on NVIDIA platforms + // otherwise a "Misaligned Address" exception may occur + __shared__ float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)]; + const uint gtid = blockIdx.x * blockDim.x + threadIdx.x; + const ushort ltid = threadIdx.x; + const ushort lsize = NUM_BINS; // get_local_size(0); + const ushort group_id = blockIdx.x; + + // local memory per workgroup is 3 KB + // clear local memory + uint *ptr = reinterpret_cast(shared_array); + for (int i = ltid; i < LOCAL_MEM_SIZE/sizeof(uint); i += lsize) { + ptr[i] = 0; + } + __syncthreads(); + // gradient/hessian histograms + // assume this starts at 32 * 4 = 128-byte boundary // LGBM_CUDA: What does it mean? boundary?? + // total size: 2 * 256 * size_of(float) = 2 KB + // organization: each feature/grad/hessian is at a different bank, + // as indepedent of the feature value as possible + acc_type *gh_hist = reinterpret_cast(shared_array); + + // counter histogram + // total size: 256 * size_of(uint) = 1 KB + uint *cnt_hist = reinterpret_cast(gh_hist + 2 * NUM_BINS); + + // odd threads (1, 3, ...) compute histograms for hessians first + // even thread (0, 2, ...) compute histograms for gradients first + // etc. 
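
The block of new (+) lines above carves a single __shared__ float2 array into an interleaved gradient/hessian histogram followed by a counter histogram. A standalone sketch of that carving, with acc_type fixed to float and NUM_BINS to 256 for concreteness (the real kernels take both from their compile-time configuration), makes the 2 KB + 1 KB = 3 KB split explicit:

    #include <cstdio>
    #include <cuda_runtime.h>

    // Standalone sketch of the shared-memory carving used by the histogram kernels,
    // with NUM_BINS = 256 and acc_type = float assumed for illustration.
    #define NUM_BINS 256
    typedef float acc_type;
    #define LOCAL_MEM_SIZE ((sizeof(unsigned int) + 2 * sizeof(acc_type)) * NUM_BINS)

    __global__ void layout_demo() {
      // float2 sizing keeps the whole block 8-byte aligned, as in the kernels above
      __shared__ float2 shared_array[LOCAL_MEM_SIZE / sizeof(float2)];
      // entry 2*b holds the gradient sum of bin b, entry 2*b+1 its hessian sum
      acc_type *gh_hist = reinterpret_cast<acc_type *>(shared_array);
      // per-bin sample counters start right after the 2 * NUM_BINS accumulators
      unsigned int *cnt_hist = reinterpret_cast<unsigned int *>(gh_hist + 2 * NUM_BINS);
      if (blockIdx.x == 0 && threadIdx.x == 0) {
        printf("shared bytes per block: %u\n", (unsigned int)LOCAL_MEM_SIZE);        // 3 KB
        printf("counter offset (bytes): %u\n",
               (unsigned int)((char *)cnt_hist - (char *)gh_hist));                  // 2 KB
      }
    }

    int main() {
      layout_demo<<<1, 1>>>();
      cudaDeviceSynchronize();
      return 0;
    }
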
+ uchar is_hessian_first = ltid & 1; + + ushort feature_id = group_id >> power_feature_workgroups; + + // each 2^POWER_FEATURE_WORKGROUPS workgroups process on one feature (compile-time constant) + // feature_size is the number of examples per feature + const uchar *feature_data = feature_data_base + feature_id * feature_size; + + // size of threads that process this feature4 + const uint subglobal_size = lsize * (1 << power_feature_workgroups); + + // equavalent thread ID in this subgroup for this feature4 + const uint subglobal_tid = gtid - feature_id * subglobal_size; + + + data_size_t ind; + data_size_t ind_next; + #ifdef IGNORE_INDICES + ind = subglobal_tid; + #else + ind = data_indices[subglobal_tid]; + #endif + + // extract feature mask, when a byte is set to 0, that feature is disabled + uchar feature_mask = feature_masks[feature_id]; + // exit if the feature is masked + if (!feature_mask) { + return; + } else { + feature_mask = feature_mask - 1; // LGBM_CUDA: feature_mask is used for get feature (1: 4bit feature, 0: 8bit feature) + } - #if CONST_HESSIAN == 1 - // make a final reduction - gh_hist[ltid * 2] += gh_hist[ltid * 2 + 1]; - gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // LGBM_CUDA: counter move to this position - __syncthreads(); - #endif + // STAGE 1: read feature data, and gradient and hessian + // first half of the threads read feature data from global memory + // We will prefetch data into the "next" variable at the beginning of each iteration + uchar feature; + uchar feature_next; + // uint8_t bin; + ushort bin; + + feature = feature_data[ind >> feature_mask]; + if (feature_mask) { + feature = (feature >> ((ind & 1) << 2)) & 0xf; + } + bin = feature; + acc_type grad_bin = 0.0f, hess_bin = 0.0f; + acc_type *addr_bin; + + // store gradient and hessian + score_t grad, hess; + score_t grad_next, hess_next; + // LGBM_CUDA v5.1 + grad = ordered_gradients[ind]; + #if CONST_HESSIAN == 0 + hess = ordered_hessians[ind]; + #endif + + // there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4 + for (uint i = subglobal_tid; i < num_data; i += subglobal_size) { + // prefetch the next iteration variables + // we don't need bondary check because we have made the buffer large + int i_next = i + subglobal_size; + #ifdef IGNORE_INDICES + // we need to check to bounds here + ind_next = i_next < num_data ? i_next : i; + #else + ind_next = data_indices[i_next]; + #endif + + // imbGBT v5.1 + grad_next = ordered_gradients[ind_next]; + #if CONST_HESSIAN == 0 + hess_next = ordered_hessians[ind_next]; + #endif + + // STAGE 2: accumulate gradient and hessian + if (bin != feature) { + addr_bin = gh_hist + bin * 2 + is_hessian_first; + #if CONST_HESSIAN == 0 + acc_type acc_bin = is_hessian_first ? hess_bin : grad_bin; + atomic_local_add_f(addr_bin, acc_bin); + + addr_bin = addr_bin + 1 - 2 * is_hessian_first; + acc_bin = is_hessian_first ? 
grad_bin : hess_bin; + atomic_local_add_f(addr_bin, acc_bin); + + #elif CONST_HESSIAN == 1 + atomic_local_add_f(addr_bin, grad_bin); + #endif + + bin = feature; + grad_bin = grad; + hess_bin = hess; + } else { + grad_bin += grad; + hess_bin += hess; + } + + // prefetch the next iteration variables + feature_next = feature_data[ind_next >> feature_mask]; + + // STAGE 3: accumulate counter + atomicAdd(cnt_hist + feature, 1); + + // STAGE 4: update next stat + grad = grad_next; + hess = hess_next; + // LGBM_CUDA: v4.2 + if (!feature_mask) { + feature = feature_next; + } else { + feature = (feature_next >> ((ind_next & 1) << 2)) & 0xf; + } + } + + + addr_bin = gh_hist + bin * 2 + is_hessian_first; + #if CONST_HESSIAN == 0 + acc_type acc_bin = is_hessian_first ? hess_bin : grad_bin; + atomic_local_add_f(addr_bin, acc_bin); + + addr_bin = addr_bin + 1 - 2 * is_hessian_first; + acc_bin = is_hessian_first ? grad_bin : hess_bin; + atomic_local_add_f(addr_bin, acc_bin); + + #elif CONST_HESSIAN == 1 + atomic_local_add_f(addr_bin, grad_bin); + #endif + __syncthreads(); + + #if CONST_HESSIAN == 1 + // make a final reduction + gh_hist[ltid * 2] += gh_hist[ltid * 2 + 1]; + gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // LGBM_CUDA: counter move to this position + __syncthreads(); + #endif #if POWER_FEATURE_WORKGROUPS != 0 - acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 3 * NUM_BINS; - // write gradients and hessians - acc_type *__restrict__ ptr_f = output; - for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { - // even threads read gradients, odd threads read hessians - // FIXME: 2-way bank conflict - acc_type value = gh_hist[i]; - ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; - } - // write counts - acc_int_type *__restrict__ ptr_i = (acc_int_type *)(output + 2 * NUM_BINS); - for (ushort i = ltid; i < NUM_BINS; i += lsize) { - // FIXME: 2-way bank conflict - uint value = cnt_hist[i]; - ptr_i[i] = value; - } - // FIXME: is this right - __syncthreads(); - __threadfence(); - // To avoid the cost of an extra reducting kernel, we have to deal with some - // gray area in OpenCL. We want the last work group that process this feature to - // make the final reduction, and other threads will just quit. - // This requires that the results written by other workgroups available to the - // last workgroup (memory consistency) - #if NVIDIA == 1 - // this is equavalent to CUDA __threadfence(); - // ensure the writes above goes to main memory and other workgroups can see it - asm volatile("{\n\tmembar.gl;\n\t}\n\t" :::"memory"); - #else - // FIXME: how to do the above on AMD GPUs?? - // GCN ISA says that the all writes will bypass L1 cache (write through), - // however when the last thread is reading sub-histogram data we have to - // make sure that no part of data is modified in local L1 cache of other workgroups. - // Otherwise reading can be a problem (atomic operations to get consistency). - // But in our case, the sub-histogram of this workgroup cannot be in the cache - // of another workgroup, so the following trick will work just fine. - #endif - // Now, we want one workgroup to do the final reduction. - // Other workgroups processing the same feature quit. - // The is done by using an global atomic counter. - // On AMD GPUs ideally this should be done in GDS, - // but currently there is no easy way to access it via OpenCL. 
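
The rewritten accumulation loop above keeps a running (grad_bin, hess_bin) pair in registers and only issues a shared-memory atomic when the prefetched sample lands in a different bin than the current one; it also unpacks two 4-bit bins per byte whenever feature_mask marks the feature group as 4-bit. The serial host-side sketch below condenses that control flow (plain float accumulators and illustrative names stand in for the kernel's acc_type/score_t machinery; it is not code from this patch):

    #include <cstdint>
    #include <vector>

    // Serial sketch of the "flush on bin change" accumulation: consecutive samples
    // that hit the same bin are summed in local variables and written to the
    // histogram only when the bin changes, which is what lets the kernel above get
    // away with far fewer shared-memory atomics.  decode_4bit mirrors the
    // feature_mask trick of packing two 4-bit bins into one byte.
    static inline uint8_t decode_4bit(const uint8_t *packed, int ind) {
      return (packed[ind >> 1] >> ((ind & 1) << 2)) & 0xf;
    }

    void build_histogram_sketch(const uint8_t *packed_bins, const float *grad,
                                const float *hess, int num_data,
                                std::vector<float> *gh_hist /* 2 * num_bins entries */) {
      if (num_data <= 0) return;
      int cur_bin = decode_4bit(packed_bins, 0);
      float grad_acc = 0.0f, hess_acc = 0.0f;
      for (int i = 0; i < num_data; ++i) {
        const int bin = decode_4bit(packed_bins, i);
        if (bin != cur_bin) {                        // flush the finished run
          (*gh_hist)[2 * cur_bin] += grad_acc;
          (*gh_hist)[2 * cur_bin + 1] += hess_acc;
          cur_bin = bin;
          grad_acc = 0.0f;
          hess_acc = 0.0f;
        }
        grad_acc += grad[i];
        hess_acc += hess[i];
      }
      (*gh_hist)[2 * cur_bin] += grad_acc;           // flush the last run
      (*gh_hist)[2 * cur_bin + 1] += hess_acc;
    }
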
- uint * counter_val = cnt_hist; - // backup the old value - uint old_val = *counter_val; - if (ltid == 0) { - // all workgroups processing the same feature add this counter - *counter_val = atomicAdd(const_cast(sync_counters + feature_id), 1); - } - // make sure everyone in this workgroup is here - __syncthreads(); - // everyone in this wrokgroup: if we are the last workgroup, then do reduction! - if (*counter_val == (1 << power_feature_workgroups) - 1) { - if (ltid == 0) { - sync_counters[feature_id] = 0; - } - //} - #else - } - // only 1 work group, no need to increase counter - // the reduction will become a simple copy - if (1) { - uint old_val; // dummy - #endif - // locate our feature's block in output memory - uint output_offset = (feature_id << power_feature_workgroups); - acc_type const * __restrict__ feature_subhists = - (acc_type *)output_buf + output_offset * 3 * NUM_BINS; - // skip reading the data already in local memory - //uint skip_id = feature_id ^ output_offset; - uint skip_id = group_id - output_offset; - // locate output histogram location for this feature4 - acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS; - - - within_kernel_reduction16x4(feature_subhists, skip_id, old_val, 1 << power_feature_workgroups, hist_buf, (acc_type *)shared_array, power_feature_workgroups); - } + acc_type *__restrict__ output = reinterpret_cast(output_buf) + group_id * 3 * NUM_BINS; + // write gradients and hessians + acc_type *__restrict__ ptr_f = output; + for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { + // even threads read gradients, odd threads read hessians + // FIXME: 2-way bank conflict + acc_type value = gh_hist[i]; + ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; + } + // write counts + acc_int_type *__restrict__ ptr_i = reinterpret_cast(output + 2 * NUM_BINS); + for (ushort i = ltid; i < NUM_BINS; i += lsize) { + // FIXME: 2-way bank conflict + uint value = cnt_hist[i]; + ptr_i[i] = value; + } + // FIXME: is this right + __syncthreads(); + __threadfence(); + // To avoid the cost of an extra reducting kernel, we have to deal with some + // gray area in OpenCL. We want the last work group that process this feature to + // make the final reduction, and other threads will just quit. + // This requires that the results written by other workgroups available to the + // last workgroup (memory consistency) + #if NVIDIA == 1 + // this is equavalent to CUDA __threadfence(); + // ensure the writes above goes to main memory and other workgroups can see it + asm volatile("{\n\tmembar.gl;\n\t}\n\t" :::"memory"); + #else + // FIXME: how to do the above on AMD GPUs?? + // GCN ISA says that the all writes will bypass L1 cache (write through), + // however when the last thread is reading sub-histogram data we have to + // make sure that no part of data is modified in local L1 cache of other workgroups. + // Otherwise reading can be a problem (atomic operations to get consistency). + // But in our case, the sub-histogram of this workgroup cannot be in the cache + // of another workgroup, so the following trick will work just fine. + #endif + // Now, we want one workgroup to do the final reduction. + // Other workgroups processing the same feature quit. + // The is done by using an global atomic counter. + // On AMD GPUs ideally this should be done in GDS, + // but currently there is no easy way to access it via OpenCL. 
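
The comment block above describes the termination trick used by all three kernels: each workgroup writes its sub-histogram to global memory, issues a fence, and atomically increments a per-feature counter; the workgroup that observes the value 2^power_feature_workgroups - 1 knows it is the last one, performs the final reduction, and resets the counter, while the others simply return. A minimal standalone CUDA sketch of that pattern (reducing a single float per block rather than a whole histogram; all names are illustrative, not taken from this patch):

    #include <cstdio>
    #include <cuda_runtime.h>

    // Each block publishes a partial result, then the last block to pass the
    // atomic counter combines them -- the "global atomic counter elects the
    // reducer" idea used by the kernels above, applied to one float per block.
    __global__ void last_block_reduces(float *partials, int *sync_counter,
                                       float *result, int num_blocks) {
      if (threadIdx.x == 0) {
        partials[blockIdx.x] = 1.0f;          // this block's partial result
        __threadfence();                      // make it visible to the other blocks
      }
      __shared__ int ticket;
      if (threadIdx.x == 0) {
        ticket = atomicAdd(sync_counter, 1);  // order in which blocks finished
      }
      __syncthreads();
      if (ticket == num_blocks - 1) {         // we are the last block: reduce
        if (threadIdx.x == 0) {
          float sum = 0.0f;
          for (int i = 0; i < num_blocks; ++i) sum += partials[i];
          *result = sum;
          *sync_counter = 0;                  // reset for the next launch
        }
      }
    }

    int main() {
      const int num_blocks = 8;
      float *partials, *result;
      int *counter;
      cudaMalloc(reinterpret_cast<void **>(&partials), num_blocks * sizeof(float));
      cudaMalloc(reinterpret_cast<void **>(&result), sizeof(float));
      cudaMalloc(reinterpret_cast<void **>(&counter), sizeof(int));
      cudaMemset(counter, 0, sizeof(int));
      last_block_reduces<<<num_blocks, 32>>>(partials, counter, result, num_blocks);
      float host_result = 0.0f;
      cudaMemcpy(&host_result, result, sizeof(float), cudaMemcpyDeviceToHost);
      printf("sum of partials = %f\n", host_result);  // expect 8.0
      cudaFree(partials); cudaFree(result); cudaFree(counter);
      return 0;
    }
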
+ uint * counter_val = cnt_hist; + // backup the old value + uint old_val = *counter_val; + if (ltid == 0) { + // all workgroups processing the same feature add this counter + *counter_val = atomicAdd(const_cast(sync_counters + feature_id), 1); + } + // make sure everyone in this workgroup is here + __syncthreads(); + // everyone in this wrokgroup: if we are the last workgroup, then do reduction! + if (*counter_val == (1 << power_feature_workgroups) - 1) { + if (ltid == 0) { + sync_counters[feature_id] = 0; + } + // } +#else + } + // only 1 work group, no need to increase counter + // the reduction will become a simple copy + if (1) { + uint old_val; // dummy +#endif + // locate our feature's block in output memory + uint output_offset = (feature_id << power_feature_workgroups); + acc_type const * __restrict__ feature_subhists = + reinterpret_cast(output_buf) + output_offset * 3 * NUM_BINS; + // skip reading the data already in local memory + // uint skip_id = feature_id ^ output_offset; + uint skip_id = group_id - output_offset; + // locate output histogram location for this feature4 + acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS; + + within_kernel_reduction16x4(feature_subhists, skip_id, old_val, 1 << power_feature_workgroups, hist_buf, reinterpret_cast(shared_array), power_feature_workgroups); + } } // end of histogram16 stuff @@ -385,13 +381,13 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #ifdef IGNORE_INDICES #define KERNEL_NAME histogram64_fulldata #else // IGNORE_INDICES -#define KERNEL_NAME histogram64 // seems like ENABLE_ALL_FEATURES is set to 1 in the header if its disabled -//#define KERNEL_NAME histogram64_allfeats -#endif // IGNORE_INDICES -#else // ENABLE_ALL_FEATURES +#define KERNEL_NAME histogram64 // seems like ENABLE_ALL_FEATURES is set to 1 in the header if its disabled +// #define KERNEL_NAME histogram64_allfeats +#endif // IGNORE_INDICES +#else // ENABLE_ALL_FEATURES #error "ENABLE_ALL_FEATURES should always be 1" #define KERNEL_NAME histogram64 -#endif // ENABLE_ALL_FEATURES +#endif // ENABLE_ALL_FEATURES #define NUM_BINS 64 #define LOCAL_MEM_SIZE ((sizeof(uint) + 2 * sizeof(acc_type)) * NUM_BINS) @@ -405,10 +401,10 @@ inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ acc_type* __restrict__ local_hist, const size_t power_feature_workgroups) { const ushort ltid = threadIdx.x; - // TODO: try to avoid bank conflict here + // TODO(anyone): try to avoid bank conflict here acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; - uint* __restrict__ local_cnt = (uint *)(local_hist + 2 * NUM_BINS); + uint* __restrict__ local_cnt = reinterpret_cast(local_hist + 2 * NUM_BINS); uint cont_bin; if (power_feature_workgroups != 0) { @@ -428,7 +424,7 @@ inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ } // skip the counters we already have - p += 3 * NUM_BINS; + p += 3 * NUM_BINS; for (i = i + 1; i < num_sub_hist; ++i) { grad_bin += *p; p += NUM_BINS; @@ -448,12 +444,12 @@ inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ } #if USE_CONSTANT_BUF == 1 -__kernel void KERNEL_NAME(__global const uchar* restrict feature_data_base, +__kernel void KERNEL_NAME(__global const uchar* restrict feature_data_base, __constant const uchar* restrict feature_masks __attribute__((max_constant_size(65536))), const data_size_t feature_size, - __constant const data_size_t* restrict data_indices 
__attribute__((max_constant_size(65536))), - const data_size_t num_data, - __constant const score_t* restrict ordered_gradients __attribute__((max_constant_size(65536))), + __constant const data_size_t* restrict data_indices __attribute__((max_constant_size(65536))), + const data_size_t num_data, + __constant const score_t* restrict ordered_gradients __attribute__((max_constant_size(65536))), #if CONST_HESSIAN == 0 __constant const score_t* restrict ordered_hessians __attribute__((max_constant_size(65536))), #else @@ -464,269 +460,264 @@ __kernel void KERNEL_NAME(__global const uchar* restrict feature_data_base, acc_type* __restrict__ hist_buf_base, const size_t power_feature_workgroups) { #else -__global__ void KERNEL_NAME(const uchar* feature_data_base, +__global__ void KERNEL_NAME(const uchar* feature_data_base, // FIXME: how to handle this __constant const uchar* __restrict__ feature_masks, const data_size_t feature_size, - const data_size_t* data_indices, - const data_size_t num_data, - const score_t* ordered_gradients, + const data_size_t* data_indices, + const data_size_t num_data, + const score_t* ordered_gradients, #if CONST_HESSIAN == 0 const score_t* ordered_hessians, #else const score_t const_hessian, #endif - char* __restrict__ output_buf, + char* __restrict__ output_buf, volatile int * sync_counters, acc_type* __restrict__ hist_buf_base, const size_t power_feature_workgroups) { #endif - // allocate the local memory array aligned with float2, to guarantee correct alignment on NVIDIA platforms - // otherwise a "Misaligned Address" exception may occur - __shared__ float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)]; - const uint gtid = blockIdx.x * blockDim.x + threadIdx.x; - const ushort ltid = threadIdx.x; - const ushort lsize = NUM_BINS; // get_local_size(0); - const ushort group_id = blockIdx.x; - - // local memory per workgroup is 3 KB - // clear local memory - uint *ptr = (uint *) shared_array; - for (int i = ltid; i < LOCAL_MEM_SIZE/sizeof(uint); i += lsize) { - ptr[i] = 0; - } - __syncthreads(); - // gradient/hessian histograms - // assume this starts at 32 * 4 = 128-byte boundary // LGBM_CUDA: What does it mean? boundary?? - // total size: 2 * 256 * size_of(float) = 2 KB - // organization: each feature/grad/hessian is at a different bank, - // as indepedent of the feature value as possible - acc_type *gh_hist = (acc_type *)shared_array; - - // counter histogram - // total size: 256 * size_of(uint) = 1 KB - uint *cnt_hist = (uint *)(gh_hist + 2 * NUM_BINS); - - // odd threads (1, 3, ...) compute histograms for hessians first - // even thread (0, 2, ...) compute histograms for gradients first - // etc. 
- uchar is_hessian_first = ltid & 1; - - ushort feature_id = group_id >> power_feature_workgroups; - - // each 2^POWER_FEATURE_WORKGROUPS workgroups process on one feature (compile-time constant) - // feature_size is the number of examples per feature - const uchar *feature_data = feature_data_base + feature_id * feature_size; - - // size of threads that process this feature4 - const uint subglobal_size = lsize * (1 << power_feature_workgroups); - - // equavalent thread ID in this subgroup for this feature4 - const uint subglobal_tid = gtid - feature_id * subglobal_size; - - - data_size_t ind; - data_size_t ind_next; - #ifdef IGNORE_INDICES - ind = subglobal_tid; - #else - ind = data_indices[subglobal_tid]; - #endif - - // extract feature mask, when a byte is set to 0, that feature is disabled - uchar feature_mask = feature_masks[feature_id]; - // exit if the feature is masked - if (!feature_mask) { - return; - } else { - feature_mask = feature_mask - 1; // LGBM_CUDA: feature_mask is used for get feature (1: 4bit feature, 0: 8bit feature) - } - - // STAGE 1: read feature data, and gradient and hessian - // first half of the threads read feature data from global memory - // We will prefetch data into the "next" variable at the beginning of each iteration - uchar feature; - uchar feature_next; - //uint8_t bin; - ushort bin; - - feature = feature_data[ind >> feature_mask]; - if (feature_mask) { - feature = (feature >> ((ind & 1) << 2)) & 0xf; - } - bin = feature; - acc_type grad_bin = 0.0f, hess_bin = 0.0f; - acc_type *addr_bin; - - // store gradient and hessian - score_t grad, hess; - score_t grad_next, hess_next; - // LGBM_CUDA v5.1 - grad = ordered_gradients[ind]; - #if CONST_HESSIAN == 0 - hess = ordered_hessians[ind]; - #endif - - - // there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4 - for (uint i = subglobal_tid; i < num_data; i += subglobal_size) { - // prefetch the next iteration variables - // we don't need bondary check because we have made the buffer large - int i_next = i + subglobal_size; - #ifdef IGNORE_INDICES - // we need to check to bounds here - ind_next = i_next < num_data ? i_next : i; - #else - ind_next = data_indices[i_next]; - #endif - - // imbGBT v5.1 - grad_next = ordered_gradients[ind_next]; - #if CONST_HESSIAN == 0 - hess_next = ordered_hessians[ind_next]; - #endif - - // STAGE 2: accumulate gradient and hessian - if (bin != feature) { - addr_bin = gh_hist + bin * 2 + is_hessian_first; - #if CONST_HESSIAN == 0 - acc_type acc_bin = is_hessian_first? hess_bin : grad_bin; - atomic_local_add_f(addr_bin, acc_bin); - - addr_bin = addr_bin + 1 - 2 * is_hessian_first; - acc_bin = is_hessian_first? grad_bin : hess_bin; - atomic_local_add_f(addr_bin, acc_bin); - - #elif CONST_HESSIAN == 1 - atomic_local_add_f(addr_bin, grad_bin); - #endif - - bin = feature; - grad_bin = grad; - hess_bin = hess; - } - else { - grad_bin += grad; - hess_bin += hess; - } - - // prefetch the next iteration variables - feature_next = feature_data[ind_next >> feature_mask]; - - // STAGE 3: accumulate counter - atomicAdd(cnt_hist + feature, 1); - - // STAGE 4: update next stat - grad = grad_next; - hess = hess_next; - // LGBM_CUDA: v4.2 - if (!feature_mask) { - feature = feature_next; - } else { - feature = (feature_next >> ((ind_next & 1) << 2)) & 0xf; - } - } - - - addr_bin = gh_hist + bin * 2 + is_hessian_first; - #if CONST_HESSIAN == 0 - acc_type acc_bin = is_hessian_first? 
hess_bin : grad_bin; - atomic_local_add_f(addr_bin, acc_bin); - - addr_bin = addr_bin + 1 - 2 * is_hessian_first; - acc_bin = is_hessian_first? grad_bin : hess_bin; - atomic_local_add_f(addr_bin, acc_bin); - - #elif CONST_HESSIAN == 1 - atomic_local_add_f(addr_bin, grad_bin); - #endif - __syncthreads(); + // allocate the local memory array aligned with float2, to guarantee correct alignment on NVIDIA platforms + // otherwise a "Misaligned Address" exception may occur + __shared__ float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)]; + const uint gtid = blockIdx.x * blockDim.x + threadIdx.x; + const ushort ltid = threadIdx.x; + const ushort lsize = NUM_BINS; // get_local_size(0); + const ushort group_id = blockIdx.x; + + // local memory per workgroup is 3 KB + // clear local memory + uint *ptr = reinterpret_cast(shared_array); + for (int i = ltid; i < LOCAL_MEM_SIZE/sizeof(uint); i += lsize) { + ptr[i] = 0; + } + __syncthreads(); + // gradient/hessian histograms + // assume this starts at 32 * 4 = 128-byte boundary // LGBM_CUDA: What does it mean? boundary?? + // total size: 2 * 256 * size_of(float) = 2 KB + // organization: each feature/grad/hessian is at a different bank, + // as indepedent of the feature value as possible + acc_type *gh_hist = reinterpret_cast(shared_array); + + // counter histogram + // total size: 256 * size_of(uint) = 1 KB + uint *cnt_hist = reinterpret_cast(gh_hist + 2 * NUM_BINS); + + // odd threads (1, 3, ...) compute histograms for hessians first + // even thread (0, 2, ...) compute histograms for gradients first + // etc. + uchar is_hessian_first = ltid & 1; + + ushort feature_id = group_id >> power_feature_workgroups; + + // each 2^POWER_FEATURE_WORKGROUPS workgroups process on one feature (compile-time constant) + // feature_size is the number of examples per feature + const uchar *feature_data = feature_data_base + feature_id * feature_size; + + // size of threads that process this feature4 + const uint subglobal_size = lsize * (1 << power_feature_workgroups); + + // equavalent thread ID in this subgroup for this feature4 + const uint subglobal_tid = gtid - feature_id * subglobal_size; + + data_size_t ind; + data_size_t ind_next; + #ifdef IGNORE_INDICES + ind = subglobal_tid; + #else + ind = data_indices[subglobal_tid]; + #endif + + // extract feature mask, when a byte is set to 0, that feature is disabled + uchar feature_mask = feature_masks[feature_id]; + // exit if the feature is masked + if (!feature_mask) { + return; + } else { + feature_mask = feature_mask - 1; // LGBM_CUDA: feature_mask is used for get feature (1: 4bit feature, 0: 8bit feature) + } - #if CONST_HESSIAN == 1 - // make a final reduction - gh_hist[ltid * 2] += gh_hist[ltid * 2 + 1]; - gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // LGBM_CUDA: counter move to this position - __syncthreads(); - #endif + // STAGE 1: read feature data, and gradient and hessian + // first half of the threads read feature data from global memory + // We will prefetch data into the "next" variable at the beginning of each iteration + uchar feature; + uchar feature_next; + // uint8_t bin; + ushort bin; + + feature = feature_data[ind >> feature_mask]; + if (feature_mask) { + feature = (feature >> ((ind & 1) << 2)) & 0xf; + } + bin = feature; + acc_type grad_bin = 0.0f, hess_bin = 0.0f; + acc_type *addr_bin; + + // store gradient and hessian + score_t grad, hess; + score_t grad_next, hess_next; + // LGBM_CUDA v5.1 + grad = ordered_gradients[ind]; + #if CONST_HESSIAN == 0 + hess = ordered_hessians[ind]; + 
#endif + + // there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4 + for (uint i = subglobal_tid; i < num_data; i += subglobal_size) { + // prefetch the next iteration variables + // we don't need bondary check because we have made the buffer large + int i_next = i + subglobal_size; + #ifdef IGNORE_INDICES + // we need to check to bounds here + ind_next = i_next < num_data ? i_next : i; + #else + ind_next = data_indices[i_next]; + #endif + + // imbGBT v5.1 + grad_next = ordered_gradients[ind_next]; + #if CONST_HESSIAN == 0 + hess_next = ordered_hessians[ind_next]; + #endif + + // STAGE 2: accumulate gradient and hessian + if (bin != feature) { + addr_bin = gh_hist + bin * 2 + is_hessian_first; + #if CONST_HESSIAN == 0 + acc_type acc_bin = is_hessian_first ? hess_bin : grad_bin; + atomic_local_add_f(addr_bin, acc_bin); + + addr_bin = addr_bin + 1 - 2 * is_hessian_first; + acc_bin = is_hessian_first ? grad_bin : hess_bin; + atomic_local_add_f(addr_bin, acc_bin); + + #elif CONST_HESSIAN == 1 + atomic_local_add_f(addr_bin, grad_bin); + #endif + + bin = feature; + grad_bin = grad; + hess_bin = hess; + } else { + grad_bin += grad; + hess_bin += hess; + } + + // prefetch the next iteration variables + feature_next = feature_data[ind_next >> feature_mask]; + + // STAGE 3: accumulate counter + atomicAdd(cnt_hist + feature, 1); + + // STAGE 4: update next stat + grad = grad_next; + hess = hess_next; + // LGBM_CUDA: v4.2 + if (!feature_mask) { + feature = feature_next; + } else { + feature = (feature_next >> ((ind_next & 1) << 2)) & 0xf; + } + } + + addr_bin = gh_hist + bin * 2 + is_hessian_first; + #if CONST_HESSIAN == 0 + acc_type acc_bin = is_hessian_first ? hess_bin : grad_bin; + atomic_local_add_f(addr_bin, acc_bin); + + addr_bin = addr_bin + 1 - 2 * is_hessian_first; + acc_bin = is_hessian_first ? grad_bin : hess_bin; + atomic_local_add_f(addr_bin, acc_bin); + + #elif CONST_HESSIAN == 1 + atomic_local_add_f(addr_bin, grad_bin); + #endif + __syncthreads(); + + #if CONST_HESSIAN == 1 + // make a final reduction + gh_hist[ltid * 2] += gh_hist[ltid * 2 + 1]; + gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // LGBM_CUDA: counter move to this position + __syncthreads(); + #endif #if POWER_FEATURE_WORKGROUPS != 0 - acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 3 * NUM_BINS; - // write gradients and hessians - acc_type *__restrict__ ptr_f = output; - for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { - // even threads read gradients, odd threads read hessians - // FIXME: 2-way bank conflict - acc_type value = gh_hist[i]; - ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; - } - // write counts - acc_int_type *__restrict__ ptr_i = (acc_int_type *)(output + 2 * NUM_BINS); - for (ushort i = ltid; i < NUM_BINS; i += lsize) { - // FIXME: 2-way bank conflict - uint value = cnt_hist[i]; - ptr_i[i] = value; - } - // FIXME: is this right - __syncthreads(); - __threadfence(); - // To avoid the cost of an extra reducting kernel, we have to deal with some - // gray area in OpenCL. We want the last work group that process this feature to - // make the final reduction, and other threads will just quit. 
- // This requires that the results written by other workgroups available to the - // last workgroup (memory consistency) - #if NVIDIA == 1 - // this is equavalent to CUDA __threadfence(); - // ensure the writes above goes to main memory and other workgroups can see it - asm volatile("{\n\tmembar.gl;\n\t}\n\t" :::"memory"); - #else - // FIXME: how to do the above on AMD GPUs?? - // GCN ISA says that the all writes will bypass L1 cache (write through), - // however when the last thread is reading sub-histogram data we have to - // make sure that no part of data is modified in local L1 cache of other workgroups. - // Otherwise reading can be a problem (atomic operations to get consistency). - // But in our case, the sub-histogram of this workgroup cannot be in the cache - // of another workgroup, so the following trick will work just fine. - #endif - // Now, we want one workgroup to do the final reduction. - // Other workgroups processing the same feature quit. - // The is done by using an global atomic counter. - // On AMD GPUs ideally this should be done in GDS, - // but currently there is no easy way to access it via OpenCL. - uint * counter_val = cnt_hist; - // backup the old value - uint old_val = *counter_val; - if (ltid == 0) { - // all workgroups processing the same feature add this counter - *counter_val = atomicAdd(const_cast(sync_counters + feature_id), 1); - } - // make sure everyone in this workgroup is here - __syncthreads(); - // everyone in this wrokgroup: if we are the last workgroup, then do reduction! - if (*counter_val == (1 << power_feature_workgroups) - 1) { - if (ltid == 0) { - sync_counters[feature_id] = 0; - } - //} - #else - } - // only 1 work group, no need to increase counter - // the reduction will become a simple copy - if (1) { - uint old_val; // dummy - #endif - // locate our feature's block in output memory - uint output_offset = (feature_id << power_feature_workgroups); - acc_type const * __restrict__ feature_subhists = - (acc_type *)output_buf + output_offset * 3 * NUM_BINS; - // skip reading the data already in local memory - //uint skip_id = feature_id ^ output_offset; - uint skip_id = group_id - output_offset; - // locate output histogram location for this feature4 - acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS; - - - within_kernel_reduction64x4(feature_subhists, skip_id, old_val, 1 << power_feature_workgroups, hist_buf, (acc_type *)shared_array, power_feature_workgroups); - } + acc_type *__restrict__ output = reinterpret_cast(output_buf) + group_id * 3 * NUM_BINS; + // write gradients and hessians + acc_type *__restrict__ ptr_f = output; + for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { + // even threads read gradients, odd threads read hessians + // FIXME: 2-way bank conflict + acc_type value = gh_hist[i]; + ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; + } + // write counts + acc_int_type *__restrict__ ptr_i = reinterpret_cast(output + 2 * NUM_BINS); + for (ushort i = ltid; i < NUM_BINS; i += lsize) { + // FIXME: 2-way bank conflict + uint value = cnt_hist[i]; + ptr_i[i] = value; + } + // FIXME: is this right + __syncthreads(); + __threadfence(); + // To avoid the cost of an extra reducting kernel, we have to deal with some + // gray area in OpenCL. We want the last work group that process this feature to + // make the final reduction, and other threads will just quit. 
+ // This requires that the results written by other workgroups available to the + // last workgroup (memory consistency) + #if NVIDIA == 1 + // this is equavalent to CUDA __threadfence(); + // ensure the writes above goes to main memory and other workgroups can see it + asm volatile("{\n\tmembar.gl;\n\t}\n\t" :::"memory"); + #else + // FIXME: how to do the above on AMD GPUs?? + // GCN ISA says that the all writes will bypass L1 cache (write through), + // however when the last thread is reading sub-histogram data we have to + // make sure that no part of data is modified in local L1 cache of other workgroups. + // Otherwise reading can be a problem (atomic operations to get consistency). + // But in our case, the sub-histogram of this workgroup cannot be in the cache + // of another workgroup, so the following trick will work just fine. + #endif + // Now, we want one workgroup to do the final reduction. + // Other workgroups processing the same feature quit. + // The is done by using an global atomic counter. + // On AMD GPUs ideally this should be done in GDS, + // but currently there is no easy way to access it via OpenCL. + uint * counter_val = cnt_hist; + // backup the old value + uint old_val = *counter_val; + if (ltid == 0) { + // all workgroups processing the same feature add this counter + *counter_val = atomicAdd(const_cast(sync_counters + feature_id), 1); + } + // make sure everyone in this workgroup is here + __syncthreads(); + // everyone in this wrokgroup: if we are the last workgroup, then do reduction! + if (*counter_val == (1 << power_feature_workgroups) - 1) { + if (ltid == 0) { + sync_counters[feature_id] = 0; + } + // } +#else + } + // only 1 work group, no need to increase counter + // the reduction will become a simple copy + if (1) { + uint old_val; // dummy +#endif + // locate our feature's block in output memory + uint output_offset = (feature_id << power_feature_workgroups); + acc_type const * __restrict__ feature_subhists = + reinterpret_cast(output_buf) + output_offset * 3 * NUM_BINS; + // skip reading the data already in local memory + // uint skip_id = feature_id ^ output_offset; + uint skip_id = group_id - output_offset; + // locate output histogram location for this feature4 + acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS; + + within_kernel_reduction64x4(feature_subhists, skip_id, old_val, 1 << power_feature_workgroups, hist_buf, reinterpret_cast(shared_array), power_feature_workgroups); + } } // end of histogram64 stuff @@ -739,13 +730,13 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #ifdef IGNORE_INDICES #define KERNEL_NAME histogram256_fulldata #else // IGNORE_INDICES -#define KERNEL_NAME histogram256 // seems like ENABLE_ALL_FEATURES is set to 1 in the header if its disabled -//#define KERNEL_NAME histogram256_allfeats -#endif // IGNORE_INDICES -#else // ENABLE_ALL_FEATURES +#define KERNEL_NAME histogram256 // seems like ENABLE_ALL_FEATURES is set to 1 in the header if its disabled +// #define KERNEL_NAME histogram256_allfeats +#endif // IGNORE_INDICES +#else // ENABLE_ALL_FEATURES #error "ENABLE_ALL_FEATURES should always be 1" #define KERNEL_NAME histogram256 -#endif // ENABLE_ALL_FEATURES +#endif // ENABLE_ALL_FEATURES #define NUM_BINS 256 #define LOCAL_MEM_SIZE ((sizeof(uint) + 2 * sizeof(acc_type)) * NUM_BINS) @@ -759,11 +750,10 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ acc_type* __restrict__ local_hist, const size_t power_feature_workgroups) { const ushort ltid 
= threadIdx.x; - - // TODO: try to avoid bank conflict here + // TODO(anyone): try to avoid bank conflict here acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; - uint* __restrict__ local_cnt = (uint *)(local_hist + 2 * NUM_BINS); + uint* __restrict__ local_cnt = reinterpret_cast(local_hist + 2 * NUM_BINS); uint cont_bin; if (power_feature_workgroups != 0) { @@ -783,7 +773,7 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ } // skip the counters we already have - p += 3 * NUM_BINS; + p += 3 * NUM_BINS; for (i = i + 1; i < num_sub_hist; ++i) { grad_bin += *p; p += NUM_BINS; @@ -803,12 +793,12 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ } #if USE_CONSTANT_BUF == 1 -__kernel void KERNEL_NAME(__global const uchar* restrict feature_data_base, +__kernel void KERNEL_NAME(__global const uchar* restrict feature_data_base, __constant const uchar* restrict feature_masks __attribute__((max_constant_size(65536))), const data_size_t feature_size, - __constant const data_size_t* restrict data_indices __attribute__((max_constant_size(65536))), - const data_size_t num_data, - __constant const score_t* restrict ordered_gradients __attribute__((max_constant_size(65536))), + __constant const data_size_t* restrict data_indices __attribute__((max_constant_size(65536))), + const data_size_t num_data, + __constant const score_t* restrict ordered_gradients __attribute__((max_constant_size(65536))), #if CONST_HESSIAN == 0 __constant const score_t* restrict ordered_hessians __attribute__((max_constant_size(65536))), #else @@ -819,264 +809,264 @@ __kernel void KERNEL_NAME(__global const uchar* restrict feature_data_base, acc_type* __restrict__ hist_buf_base, const size_t power_feature_workgroups) { #else -__global__ void KERNEL_NAME(const uchar* feature_data_base, +__global__ void KERNEL_NAME(const uchar* feature_data_base, // FIXME: how to handle this __constant const uchar* __restrict__ feature_masks, const data_size_t feature_size, - const data_size_t* data_indices, - const data_size_t num_data, - const score_t* ordered_gradients, + const data_size_t* data_indices, + const data_size_t num_data, + const score_t* ordered_gradients, #if CONST_HESSIAN == 0 const score_t* ordered_hessians, #else const score_t const_hessian, #endif - char* __restrict__ output_buf, + char* __restrict__ output_buf, volatile int * sync_counters, acc_type* __restrict__ hist_buf_base, const size_t power_feature_workgroups) { #endif - // allocate the local memory array aligned with float2, to guarantee correct alignment on NVIDIA platforms - // otherwise a "Misaligned Address" exception may occur - __shared__ float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)]; - const uint gtid = blockIdx.x * blockDim.x + threadIdx.x; - const ushort ltid = threadIdx.x; - const ushort lsize = NUM_BINS; // get_local_size(0); - const ushort group_id = blockIdx.x; - - // local memory per workgroup is 3 KB - // clear local memory - uint *ptr = (uint *) shared_array; - for (int i = ltid; i < LOCAL_MEM_SIZE/sizeof(uint); i += lsize) { - ptr[i] = 0; - } - __syncthreads(); - // gradient/hessian histograms - // assume this starts at 32 * 4 = 128-byte boundary // LGBM_CUDA: What does it mean? boundary?? 
- // total size: 2 * 256 * size_of(float) = 2 KB - // organization: each feature/grad/hessian is at a different bank, - // as indepedent of the feature value as possible - acc_type *gh_hist = (acc_type *)shared_array; - - // counter histogram - // total size: 256 * size_of(uint) = 1 KB - uint *cnt_hist = (uint *)(gh_hist + 2 * NUM_BINS); - - // odd threads (1, 3, ...) compute histograms for hessians first - // even thread (0, 2, ...) compute histograms for gradients first - // etc. - uchar is_hessian_first = ltid & 1; - - ushort feature_id = group_id >> power_feature_workgroups; - - // each 2^POWER_FEATURE_WORKGROUPS workgroups process on one feature (compile-time constant) - // feature_size is the number of examples per feature - const uchar *feature_data = feature_data_base + feature_id * feature_size; - - // size of threads that process this feature4 - const uint subglobal_size = lsize * (1 << power_feature_workgroups); - - // equavalent thread ID in this subgroup for this feature4 - const uint subglobal_tid = gtid - feature_id * subglobal_size; - - data_size_t ind; - data_size_t ind_next; - #ifdef IGNORE_INDICES - ind = subglobal_tid; - #else - ind = data_indices[subglobal_tid]; - #endif - - // extract feature mask, when a byte is set to 0, that feature is disabled - uchar feature_mask = feature_masks[feature_id]; - // exit if the feature is masked - if (!feature_mask) { - return; - } else { - feature_mask = feature_mask - 1; // LGBM_CUDA: feature_mask is used for get feature (1: 4bit feature, 0: 8bit feature) - } - - // STAGE 1: read feature data, and gradient and hessian - // first half of the threads read feature data from global memory - // We will prefetch data into the "next" variable at the beginning of each iteration - uchar feature; - uchar feature_next; - //uint8_t bin; - ushort bin; - - feature = feature_data[ind >> feature_mask]; - if (feature_mask) { - feature = (feature >> ((ind & 1) << 2)) & 0xf; - } - bin = feature; - acc_type grad_bin = 0.0f, hess_bin = 0.0f; - acc_type *addr_bin; - - // store gradient and hessian - score_t grad, hess; - score_t grad_next, hess_next; - // LGBM_CUDA v5.1 - grad = ordered_gradients[ind]; - #if CONST_HESSIAN == 0 - hess = ordered_hessians[ind]; - #endif - - // there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4 - for (uint i = subglobal_tid; i < num_data; i += subglobal_size) { - // prefetch the next iteration variables - // we don't need bondary check because we have made the buffer large - int i_next = i + subglobal_size; - #ifdef IGNORE_INDICES - // we need to check to bounds here - ind_next = i_next < num_data ? i_next : i; - #else - ind_next = data_indices[i_next]; - #endif - - // imbGBT v5.1 - grad_next = ordered_gradients[ind_next]; - #if CONST_HESSIAN == 0 - hess_next = ordered_hessians[ind_next]; - #endif - // STAGE 2: accumulate gradient and hessian - if (bin != feature) { - addr_bin = gh_hist + bin * 2 + is_hessian_first; - #if CONST_HESSIAN == 0 - acc_type acc_bin = is_hessian_first? hess_bin : grad_bin; - atomic_local_add_f(addr_bin, acc_bin); - - addr_bin = addr_bin + 1 - 2 * is_hessian_first; - acc_bin = is_hessian_first? 
grad_bin : hess_bin; - atomic_local_add_f(addr_bin, acc_bin); - - #elif CONST_HESSIAN == 1 - atomic_local_add_f(addr_bin, grad_bin); - #endif - - bin = feature; - grad_bin = grad; - hess_bin = hess; - } - else { - grad_bin += grad; - hess_bin += hess; - } - - // prefetch the next iteration variables - feature_next = feature_data[ind_next >> feature_mask]; - - // STAGE 3: accumulate counter - atomicAdd(cnt_hist + feature, 1); - - // STAGE 4: update next stat - grad = grad_next; - hess = hess_next; - // LGBM_CUDA: v4.2 - if (!feature_mask) { - feature = feature_next; - } else { - feature = (feature_next >> ((ind_next & 1) << 2)) & 0xf; - } - } - - addr_bin = gh_hist + bin * 2 + is_hessian_first; - #if CONST_HESSIAN == 0 - acc_type acc_bin = is_hessian_first? hess_bin : grad_bin; - atomic_local_add_f(addr_bin, acc_bin); - - addr_bin = addr_bin + 1 - 2 * is_hessian_first; - acc_bin = is_hessian_first? grad_bin : hess_bin; - - atomic_local_add_f(addr_bin, acc_bin); - - #elif CONST_HESSIAN == 1 - atomic_local_add_f(addr_bin, grad_bin); - #endif - __syncthreads(); + // allocate the local memory array aligned with float2, to guarantee correct alignment on NVIDIA platforms + // otherwise a "Misaligned Address" exception may occur + __shared__ float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)]; + const uint gtid = blockIdx.x * blockDim.x + threadIdx.x; + const ushort ltid = threadIdx.x; + const ushort lsize = NUM_BINS; // get_local_size(0); + const ushort group_id = blockIdx.x; + + // local memory per workgroup is 3 KB + // clear local memory + uint *ptr = reinterpret_cast(shared_array); + for (int i = ltid; i < LOCAL_MEM_SIZE/sizeof(uint); i += lsize) { + ptr[i] = 0; + } + __syncthreads(); + // gradient/hessian histograms + // assume this starts at 32 * 4 = 128-byte boundary // LGBM_CUDA: What does it mean? boundary?? + // total size: 2 * 256 * size_of(float) = 2 KB + // organization: each feature/grad/hessian is at a different bank, + // as indepedent of the feature value as possible + acc_type *gh_hist = reinterpret_cast(shared_array); + + // counter histogram + // total size: 256 * size_of(uint) = 1 KB + uint *cnt_hist = reinterpret_cast(gh_hist + 2 * NUM_BINS); + + // odd threads (1, 3, ...) compute histograms for hessians first + // even thread (0, 2, ...) compute histograms for gradients first + // etc. 
+ uchar is_hessian_first = ltid & 1; + + ushort feature_id = group_id >> power_feature_workgroups; + + // each 2^POWER_FEATURE_WORKGROUPS workgroups process on one feature (compile-time constant) + // feature_size is the number of examples per feature + const uchar *feature_data = feature_data_base + feature_id * feature_size; + + // size of threads that process this feature4 + const uint subglobal_size = lsize * (1 << power_feature_workgroups); + + // equavalent thread ID in this subgroup for this feature4 + const uint subglobal_tid = gtid - feature_id * subglobal_size; + + data_size_t ind; + data_size_t ind_next; + #ifdef IGNORE_INDICES + ind = subglobal_tid; + #else + ind = data_indices[subglobal_tid]; + #endif + + // extract feature mask, when a byte is set to 0, that feature is disabled + uchar feature_mask = feature_masks[feature_id]; + // exit if the feature is masked + if (!feature_mask) { + return; + } else { + feature_mask = feature_mask - 1; // LGBM_CUDA: feature_mask is used for get feature (1: 4bit feature, 0: 8bit feature) + } - #if CONST_HESSIAN == 1 - // make a final reduction - gh_hist[ltid * 2] += gh_hist[ltid * 2 + 1]; - gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // LGBM_CUDA: counter move to this position + // STAGE 1: read feature data, and gradient and hessian + // first half of the threads read feature data from global memory + // We will prefetch data into the "next" variable at the beginning of each iteration + uchar feature; + uchar feature_next; + // uint8_t bin; + ushort bin; + + feature = feature_data[ind >> feature_mask]; + if (feature_mask) { + feature = (feature >> ((ind & 1) << 2)) & 0xf; + } + bin = feature; + acc_type grad_bin = 0.0f, hess_bin = 0.0f; + acc_type *addr_bin; + + // store gradient and hessian + score_t grad, hess; + score_t grad_next, hess_next; + // LGBM_CUDA v5.1 + grad = ordered_gradients[ind]; + #if CONST_HESSIAN == 0 + hess = ordered_hessians[ind]; + #endif + + // there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4 + for (uint i = subglobal_tid; i < num_data; i += subglobal_size) { + // prefetch the next iteration variables + // we don't need bondary check because we have made the buffer large + int i_next = i + subglobal_size; + #ifdef IGNORE_INDICES + // we need to check to bounds here + ind_next = i_next < num_data ? i_next : i; + #else + ind_next = data_indices[i_next]; + #endif + + // imbGBT v5.1 + grad_next = ordered_gradients[ind_next]; + #if CONST_HESSIAN == 0 + hess_next = ordered_hessians[ind_next]; + #endif + // STAGE 2: accumulate gradient and hessian + if (bin != feature) { + addr_bin = gh_hist + bin * 2 + is_hessian_first; + #if CONST_HESSIAN == 0 + acc_type acc_bin = is_hessian_first ? hess_bin : grad_bin; + atomic_local_add_f(addr_bin, acc_bin); + + addr_bin = addr_bin + 1 - 2 * is_hessian_first; + acc_bin = is_hessian_first ? 
grad_bin : hess_bin; + atomic_local_add_f(addr_bin, acc_bin); + + #elif CONST_HESSIAN == 1 + atomic_local_add_f(addr_bin, grad_bin); + #endif + + bin = feature; + grad_bin = grad; + hess_bin = hess; + } else { + grad_bin += grad; + hess_bin += hess; + } + + // prefetch the next iteration variables + feature_next = feature_data[ind_next >> feature_mask]; + + // STAGE 3: accumulate counter + atomicAdd(cnt_hist + feature, 1); + + // STAGE 4: update next stat + grad = grad_next; + hess = hess_next; + // LGBM_CUDA: v4.2 + if (!feature_mask) { + feature = feature_next; + } else { + feature = (feature_next >> ((ind_next & 1) << 2)) & 0xf; + } + } + + addr_bin = gh_hist + bin * 2 + is_hessian_first; + #if CONST_HESSIAN == 0 + acc_type acc_bin = is_hessian_first ? hess_bin : grad_bin; + atomic_local_add_f(addr_bin, acc_bin); + + addr_bin = addr_bin + 1 - 2 * is_hessian_first; + acc_bin = is_hessian_first ? grad_bin : hess_bin; + + atomic_local_add_f(addr_bin, acc_bin); + + #elif CONST_HESSIAN == 1 + atomic_local_add_f(addr_bin, grad_bin); + #endif __syncthreads(); - #endif + + #if CONST_HESSIAN == 1 + // make a final reduction + gh_hist[ltid * 2] += gh_hist[ltid * 2 + 1]; + gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // LGBM_CUDA: counter move to this position + __syncthreads(); + #endif #if POWER_FEATURE_WORKGROUPS != 0 - acc_type *__restrict__ output = ((acc_type *)output_buf) + group_id * 3 * NUM_BINS; - // write gradients and hessians - acc_type *__restrict__ ptr_f = output; - for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { - // even threads read gradients, odd threads read hessians - // FIXME: 2-way bank conflict - acc_type value = gh_hist[i]; - ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; - } - // write counts - acc_int_type *__restrict__ ptr_i = (acc_int_type *)(output + 2 * NUM_BINS); - for (ushort i = ltid; i < NUM_BINS; i += lsize) { - // FIXME: 2-way bank conflict - uint value = cnt_hist[i]; - ptr_i[i] = value; - } - // FIXME: is this right - __syncthreads(); - __threadfence(); - // To avoid the cost of an extra reducting kernel, we have to deal with some - // gray area in OpenCL. We want the last work group that process this feature to - // make the final reduction, and other threads will just quit. - // This requires that the results written by other workgroups available to the - // last workgroup (memory consistency) - #if NVIDIA == 1 - // this is equavalent to CUDA __threadfence(); - // ensure the writes above goes to main memory and other workgroups can see it - asm volatile("{\n\tmembar.gl;\n\t}\n\t" :::"memory"); - #else - // FIXME: how to do the above on AMD GPUs?? - // GCN ISA says that the all writes will bypass L1 cache (write through), - // however when the last thread is reading sub-histogram data we have to - // make sure that no part of data is modified in local L1 cache of other workgroups. - // Otherwise reading can be a problem (atomic operations to get consistency). - // But in our case, the sub-histogram of this workgroup cannot be in the cache - // of another workgroup, so the following trick will work just fine. - #endif - // Now, we want one workgroup to do the final reduction. - // Other workgroups processing the same feature quit. - // The is done by using an global atomic counter. - // On AMD GPUs ideally this should be done in GDS, - // but currently there is no easy way to access it via OpenCL. 
- uint * counter_val = cnt_hist; - // backup the old value - uint old_val = *counter_val; - if (ltid == 0) { - // all workgroups processing the same feature add this counter - *counter_val = atomicAdd(const_cast(sync_counters + feature_id), 1); - } - // make sure everyone in this workgroup is here - __syncthreads(); - // everyone in this wrokgroup: if we are the last workgroup, then do reduction! - if (*counter_val == (1 << power_feature_workgroups) - 1) { - if (ltid == 0) { - sync_counters[feature_id] = 0; - } - //} - #else - } - // only 1 work group, no need to increase counter - // the reduction will become a simple copy - if (1) { - uint old_val; // dummy - #endif - // locate our feature's block in output memory - uint output_offset = (feature_id << power_feature_workgroups); - acc_type const * __restrict__ feature_subhists = - (acc_type *)output_buf + output_offset * 3 * NUM_BINS; - // skip reading the data already in local memory - //uint skip_id = feature_id ^ output_offset; - uint skip_id = group_id - output_offset; - // locate output histogram location for this feature4 - acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS; - within_kernel_reduction256x4(feature_subhists, skip_id, old_val, 1 << power_feature_workgroups, hist_buf, (acc_type *)shared_array, power_feature_workgroups); - } + acc_type *__restrict__ output = reinterpret_cast(output_buf) + group_id * 3 * NUM_BINS; + // write gradients and hessians + acc_type *__restrict__ ptr_f = output; + for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { + // even threads read gradients, odd threads read hessians + // FIXME: 2-way bank conflict + acc_type value = gh_hist[i]; + ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; + } + // write counts + acc_int_type *__restrict__ ptr_i = reinterpret_cast(output + 2 * NUM_BINS); + for (ushort i = ltid; i < NUM_BINS; i += lsize) { + // FIXME: 2-way bank conflict + uint value = cnt_hist[i]; + ptr_i[i] = value; + } + // FIXME: is this right + __syncthreads(); + __threadfence(); + // To avoid the cost of an extra reducting kernel, we have to deal with some + // gray area in OpenCL. We want the last work group that process this feature to + // make the final reduction, and other threads will just quit. + // This requires that the results written by other workgroups available to the + // last workgroup (memory consistency) + #if NVIDIA == 1 + // this is equavalent to CUDA __threadfence(); + // ensure the writes above goes to main memory and other workgroups can see it + asm volatile("{\n\tmembar.gl;\n\t}\n\t" :::"memory"); + #else + // FIXME: how to do the above on AMD GPUs?? + // GCN ISA says that the all writes will bypass L1 cache (write through), + // however when the last thread is reading sub-histogram data we have to + // make sure that no part of data is modified in local L1 cache of other workgroups. + // Otherwise reading can be a problem (atomic operations to get consistency). + // But in our case, the sub-histogram of this workgroup cannot be in the cache + // of another workgroup, so the following trick will work just fine. + #endif + // Now, we want one workgroup to do the final reduction. + // Other workgroups processing the same feature quit. + // The is done by using an global atomic counter. + // On AMD GPUs ideally this should be done in GDS, + // but currently there is no easy way to access it via OpenCL. 
+ uint * counter_val = cnt_hist; + // backup the old value + uint old_val = *counter_val; + if (ltid == 0) { + // all workgroups processing the same feature add this counter + *counter_val = atomicAdd(const_cast(sync_counters + feature_id), 1); + } + // make sure everyone in this workgroup is here + __syncthreads(); + // everyone in this wrokgroup: if we are the last workgroup, then do reduction! + if (*counter_val == (1 << power_feature_workgroups) - 1) { + if (ltid == 0) { + sync_counters[feature_id] = 0; + } + // } +#else + } + // only 1 work group, no need to increase counter + // the reduction will become a simple copy + if (1) { + uint old_val; // dummy +#endif + // locate our feature's block in output memory + uint output_offset = (feature_id << power_feature_workgroups); + acc_type const * __restrict__ feature_subhists = + reinterpret_cast(output_buf) + output_offset * 3 * NUM_BINS; + // skip reading the data already in local memory + // uint skip_id = feature_id ^ output_offset; + uint skip_id = group_id - output_offset; + // locate output histogram location for this feature4 + acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS; + + within_kernel_reduction256x4(feature_subhists, skip_id, old_val, 1 << power_feature_workgroups, hist_buf, reinterpret_cast(shared_array), power_feature_workgroups); + } } // end of histogram256 stuff From 312733d4b7b65578e8c8ee57e852496cfe1b73b8 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Wed, 17 Jun 2020 19:32:23 +0000 Subject: [PATCH 075/119] Minor cleanup so less differences in code. --- include/LightGBM/dataset.h | 1 - src/boosting/gbdt.cpp | 2 -- src/treelearner/data_partition.hpp | 2 -- src/treelearner/serial_tree_learner.cpp | 4 ++-- 4 files changed, 2 insertions(+), 7 deletions(-) diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index e4c5dc56511..0fd0dfc6d15 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -441,7 +441,6 @@ class Dataset { return ret; } - /* LGBM_CUDA void ReSize(data_size_t num_data); */ // LGBM_CUDA ReSize() returns true if resized bool ReSize(data_size_t num_data); diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index b4c14a40e78..940fef25f87 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -273,8 +273,6 @@ void GBDT::Bagging(int iter) { bag_data_cnt_, false); tree_learner_->SetBaggingData(tmp_subset_.get(), bag_data_indices_.data(), bag_data_cnt_); - - tree_learner_->ResetTrainingData(tmp_subset_.get(), is_constant_hessian_); } } } diff --git a/src/treelearner/data_partition.hpp b/src/treelearner/data_partition.hpp index 01c5d2606e7..7a6ac031e62 100644 --- a/src/treelearner/data_partition.hpp +++ b/src/treelearner/data_partition.hpp @@ -164,8 +164,6 @@ class DataPartition { /*! 
\brief used data count, used for bagging */ data_size_t used_data_count_; ParallelPartitionRunner runner_; - // LGBM_CUDA - // bool is_cuda_; }; } // namespace LightGBM diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 96882732a92..b7f0d47982c 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -181,7 +181,7 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians int init_splits = 0; bool aborted_last_force_split = false; if (!forced_split_json.is_null()) { - init_splits = ForceSplits(tree.get(), forced_split_json, &left_leaf, + init_splits = ForceSplits(tree_prt, forced_split_json, &left_leaf, &right_leaf, &cur_depth, &aborted_last_force_split); } @@ -456,7 +456,7 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, const Json& forced_split_json Json right; bool left_smaller = true; std::unordered_map forceSplitMap; - q.push(std::make_pair(forced_split_json, *left_leaf)); + q.push(std::make_pair(left, *left_leaf)); while (!q.empty()) { // before processing next node from queue, store info for current left/right leaf // store "best split" for left and right, even if they might be overwritten by forced split From 943603acde14975a2b6db36dcd93d67c3a2206ed Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Tue, 23 Jun 2020 20:03:00 +0000 Subject: [PATCH 076/119] Revert is_use_subset changes --- include/LightGBM/tree_learner.h | 3 +- src/boosting/gbdt.cpp | 180 ++---------------- src/boosting/goss.hpp | 22 +-- src/treelearner/cuda_tree_learner.cpp | 16 +- src/treelearner/cuda_tree_learner.h | 17 +- .../data_parallel_tree_learner.cpp | 4 +- .../feature_parallel_tree_learner.cpp | 6 +- src/treelearner/gpu_tree_learner.cpp | 4 +- src/treelearner/gpu_tree_learner.h | 2 +- .../kernels/histogram_16_64_256.cu | 14 -- src/treelearner/parallel_tree_learner.h | 6 +- src/treelearner/serial_tree_learner.cpp | 4 +- src/treelearner/serial_tree_learner.h | 3 +- .../voting_parallel_tree_learner.cpp | 4 +- 14 files changed, 36 insertions(+), 249 deletions(-) diff --git a/include/LightGBM/tree_learner.h b/include/LightGBM/tree_learner.h index 2ea30ac63b2..2493122e3cb 100644 --- a/include/LightGBM/tree_learner.h +++ b/include/LightGBM/tree_learner.h @@ -34,8 +34,7 @@ class TreeLearner { * \param train_data The used training data * \param is_constant_hessian True if all hessians share the same value */ - // LGBM_CUDA is_use_subset_ for CUDA - virtual void Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) = 0; + virtual void Init(const Dataset* train_data, bool is_constant_hessian) = 0; virtual void ResetIsConstantHessian(bool is_constant_hessian) = 0; diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 940fef25f87..39bbccabff3 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -88,6 +88,12 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective is_constant_hessian_ = GetIsConstHessian(objective_function); + tree_learner_ = std::unique_ptr(TreeLearner::CreateTreeLearner(config_->tree_learner, config_->device_type, config_.get())); + + // init tree learner + tree_learner_->Init(train_data_, is_constant_hessian_); + tree_learner_->SetForcedSplit(&forced_splits_json_); + // push training metrics training_metrics_.clear(); for (const auto& metric : training_metrics) { @@ -113,30 +119,9 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective feature_infos_ = 
train_data_->feature_infos(); monotone_constraints_ = config->monotone_constraints; - // LGBM_CUDA - // Two key changes: position of the initializer is moved from the original code, and init() uses is_use_subset_ flag - tree_learner_ = std::unique_ptr(TreeLearner::CreateTreeLearner(config_->tree_learner, config_->device_type, config_.get())); - // if need bagging, create buffer - // LGBM_CUDA: for CUDA implementation, this function sets up the is_use_subset_ flag as TRUE and tmp_subset_ is allocated. ResetBaggingConfig(config_.get(), true); - // init tree learner - // LGBM_CUDA do not copy feature is is_use_subset for initialization - // LGBM_CUDA initialize training data info with bagging data size (tmp_subset_) - - if (config_->device_type == std::string("cuda")) { - if (is_use_subset_) { - tree_learner_->Init(tmp_subset_.get(), is_constant_hessian_, is_use_subset_); - } else { - tree_learner_->Init(train_data_, is_constant_hessian_, is_use_subset_); - } - } else { - tree_learner_->Init(train_data_, is_constant_hessian_, is_use_subset_); - } - - tree_learner_->SetForcedSplit(&forced_splits_json_); - class_need_train_ = std::vector(num_tree_per_iteration_, true); if (objective_function_ != nullptr && objective_function_->SkipEmptyClass()) { CHECK_EQ(num_tree_per_iteration_, num_class_); @@ -259,18 +244,11 @@ void GBDT::Bagging(int iter) { // set bagging data to tree learner if (!is_use_subset_) { tree_learner_->SetBaggingData(nullptr, bag_data_indices_.data(), bag_data_cnt_); - } else { // LGBM_CUDA - // NEW get subset - bool resized = tmp_subset_->ReSize(bag_data_cnt_); - - if (resized && (config_->device_type == std::string("cuda"))) { - size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; - tmp_gradients_.resize(total_size); - tmp_hessians_.resize(total_size); - } - + } else { + // get subset + tmp_subset_->ReSize(bag_data_cnt_); tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(), - bag_data_cnt_, false); + bag_data_cnt_, false); tree_learner_->SetBaggingData(tmp_subset_.get(), bag_data_indices_.data(), bag_data_cnt_); } @@ -280,18 +258,13 @@ void GBDT::Bagging(int iter) { void GBDT::Train(int snapshot_freq, const std::string& model_output_path) { Common::FunctionTimer fun_timer("GBDT::Train", global_timer); bool is_finished = false; - - // LGBM_CUDA auto start_time = std::chrono::steady_clock::now(); - for (int iter = 0; iter < config_->num_iterations && !is_finished; ++iter) { is_finished = TrainOneIter(nullptr, nullptr); if (!is_finished) { is_finished = EvalAndCheckEarlyStopping(); } - auto end_time = std::chrono::steady_clock::now(); - // output used time per iteration Log::Info("%f seconds elapsed, finished iteration %d", std::chrono::duration(end_time - start_time) * 1e-3, iter + 1); @@ -374,134 +347,7 @@ double GBDT::BoostFromAverage(int class_id, bool update_scorer) { return 0.0f; } -// LGBM_CUDA -bool GBDT::TrainOneIterCUDA(const score_t* gradients, const score_t* hessians) { - // LGBM_CUDA invoke bagging during the first iteration - if (config_->device_type == std::string("cuda") && (iter_ == 0)) { - // auto start_time = std::chrono::steady_clock::now(); - - Bagging(iter_); - } - - std::vector init_scores(num_tree_per_iteration_, 0.0); - - // boosting first - if (gradients == nullptr || hessians == nullptr) { - for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { - init_scores[cur_tree_id] = BoostFromAverage(cur_tree_id, true); - } - - // LGBM_CUDA - // auto start_time = std::chrono::steady_clock::now(); - - 
Boosting(); - - gradients = gradients_.data(); - hessians = hessians_.data(); - } - - // LGBM_CUDA bagging logic - // Bagging(iter_); - - bool should_continue = false; - for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { - // LGBM_CUDA -// auto start_time = std::chrono::steady_clock::now(); - - const size_t offset = static_cast(cur_tree_id) * num_data_; - std::unique_ptr new_tree(new Tree(2)); - - if (class_need_train_[cur_tree_id] && train_data_->num_features() > 0) { - auto grad = gradients + offset; - auto hess = hessians + offset; - - // LGBM_CUDA - if (((tmp_gradients_.data() == 0) || (tmp_hessians_.data() == 0)) && (config_->device_type == std::string("cuda"))) { - size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; - tmp_gradients_.resize(total_size); - tmp_hessians_.resize(total_size); - } - - auto tmp_grad = tmp_gradients_.data(); - auto tmp_hess = tmp_hessians_.data(); - - // need to copy gradients for bagging subset. - if (is_use_subset_ && bag_data_cnt_ < num_data_) { - #pragma omp parallel for schedule(static) // LGBM_CUDA - for (int i = 0; i < bag_data_cnt_; ++i) { - tmp_grad[i] = grad[bag_data_indices_[i]]; // LGBM_CUDA - tmp_hess[i] = hess[bag_data_indices_[i]]; // LGBM_CUDA - } - } - - // LGBM_CUDA - new_tree.reset(tree_learner_->Train(tmp_grad, tmp_hess, is_constant_hessian_, forced_splits_json_)); - } - - if (new_tree->num_leaves() > 1) { - should_continue = true; - auto score_ptr = train_score_updater_->score() + offset; - auto residual_getter = [score_ptr](const label_t* label, int i) {return static_cast(label[i]) - score_ptr[i]; }; - tree_learner_->RenewTreeOutput(new_tree.get(), objective_function_, residual_getter, - num_data_, bag_data_indices_.data(), bag_data_cnt_); - // shrinkage by learning rate - new_tree->Shrinkage(shrinkage_rate_); - // update score - UpdateScore(new_tree.get(), cur_tree_id); - if (std::fabs(init_scores[cur_tree_id]) > kEpsilon) { - new_tree->AddBias(init_scores[cur_tree_id]); - } - } else { - // only add default score one-time - if (models_.size() < static_cast(num_tree_per_iteration_)) { - double output = 0.0; - if (!class_need_train_[cur_tree_id]) { - if (objective_function_ != nullptr) { - output = objective_function_->BoostFromScore(cur_tree_id); - } - } else { - output = init_scores[cur_tree_id]; - } - new_tree->AsConstantTree(output); - // updates scores - train_score_updater_->AddScore(output, cur_tree_id); - for (auto& score_updater : valid_score_updater_) { - score_updater->AddScore(output, cur_tree_id); - } - } - - // LGBM_CUDA: moved for overlapping data copy w/ other operations - int iter_next = iter_ + 1; - if (iter_next < config_->num_iterations) { - // auto start_time = std::chrono::steady_clock::now(); - - // bagging logic - Bagging(iter_next); - } - } - // add model - models_.push_back(std::move(new_tree)); - } - - if (!should_continue) { - Log::Warning("Stopped training because there are no more leaves that meet the split requirements"); - if (models_.size() > static_cast(num_tree_per_iteration_)) { - for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { - models_.pop_back(); - } - } - return true; - } - - ++iter_; - return false; -} - bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { - if (config_->device_type == std::string("cuda")) { // LGBM_CUDA - return TrainOneIterCUDA(gradients, hessians); - } - Common::FunctionTimer fun_timer("GBDT::TrainOneIter", global_timer); std::vector init_scores(num_tree_per_iteration_, 
0.0); // boosting first @@ -929,12 +775,10 @@ void GBDT::ResetBaggingConfig(const Config* config, bool is_change_dataset) { bagging_rands_.emplace_back(config_->bagging_seed + i); } - double average_bag_rate = - (static_cast(bag_data_cnt_) / num_data_) / config->bagging_freq; is_use_subset_ = false; const int group_threshold_usesubset = 100; - if (average_bag_rate <= 0.5 - && (train_data_->num_feature_groups() < group_threshold_usesubset)) { + double average_bag_rate = (static_cast(bag_data_cnt_) / num_data_) / config->bagging_freq; + if (average_bag_rate <= 0.5 && (train_data_->num_feature_groups() < group_threshold_usesubset)) { if (tmp_subset_ == nullptr || is_change_dataset) { tmp_subset_.reset(new Dataset(bag_data_cnt_)); tmp_subset_->CopyFeatureMapperFrom(train_data_); diff --git a/src/boosting/goss.hpp b/src/boosting/goss.hpp index 2af6dee14f6..cd512e243d8 100644 --- a/src/boosting/goss.hpp +++ b/src/boosting/goss.hpp @@ -131,27 +131,7 @@ class GOSS: public GBDT { bag_data_cnt_ = num_data_; // not subsample for first iterations if (iter < static_cast(1.0f / config_->learning_rate)) { return; } - auto left_cnt = bagging_runner_.Run( - num_data_, - [=](int, data_size_t cur_start, data_size_t cur_cnt, data_size_t* left, - data_size_t*) { - data_size_t cur_left_count = 0; - cur_left_count = BaggingHelper(cur_start, cur_cnt, left); - return cur_left_count; - }, - bag_data_indices_.data()); - bag_data_cnt_ = left_cnt; - // set bagging data to tree learner - if (!is_use_subset_) { - tree_learner_->SetBaggingData(nullptr, bag_data_indices_.data(), bag_data_cnt_); - } else { - // get subset - tmp_subset_->ReSize(bag_data_cnt_); - tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(), - bag_data_cnt_, false); - tree_learner_->SetBaggingData(tmp_subset_.get(), bag_data_indices_.data(), - bag_data_cnt_); - } + GBDT::Bagging(iter); } protected: diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 8d59e98f67f..5a9b9b73c4f 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -70,16 +70,13 @@ CUDATreeLearner::~CUDATreeLearner() { } -void CUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) { +void CUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) { // initialize SerialTreeLearner - SerialTreeLearner::Init(train_data, is_constant_hessian, is_use_subset); + SerialTreeLearner::Init(train_data, is_constant_hessian); // some additional variables needed for GPU trainer num_feature_groups_ = train_data_->num_feature_groups(); - // LGBM_CUDA: use subset of training data for bagging - is_use_subset_ = is_use_subset; - // Initialize GPU buffers and kernels & LGBM_CUDA: get device info InitGPU(config_->num_gpu); // LGBM_CUDA } @@ -580,16 +577,11 @@ void CUDATreeLearner::InitGPU(int num_gpu) { AllocateGPUMemory(); - // LGBM_CUDA: copy dense feature data from cpu to gpu only when we use entire training data for training - - if (!is_use_subset_) { - Log::Debug("copyDenseFeature at the initialization\n"); - copyDenseFeature(); // LGBM_CUDA - } + copyDenseFeature(); // LGBM_CUDA } Tree* CUDATreeLearner::Train(const score_t* gradients, const score_t *hessians, - bool is_constant_hessian, const Json& forced_split_json) { + bool is_constant_hessian, Json& forced_split_json) { // check if we need to recompile the GPU kernel (is_constant_hessian changed) // this should rarely occur diff --git a/src/treelearner/cuda_tree_learner.h 
b/src/treelearner/cuda_tree_learner.h index cd7413d3a43..6a2a0e06e52 100644 --- a/src/treelearner/cuda_tree_learner.h +++ b/src/treelearner/cuda_tree_learner.h @@ -39,19 +39,15 @@ namespace LightGBM { * \brief CUDA-based parallel learning algorithm. */ class CUDATreeLearner: public SerialTreeLearner { - public: +public: explicit CUDATreeLearner(const Config* tree_config); ~CUDATreeLearner(); - // LGBM_CUDA: is_use_subset is used by CUDA only - void Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) override; + void Init(const Dataset* train_data, bool is_constant_hessian) override; void ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) override; Tree* Train(const score_t* gradients, const score_t *hessians, - bool is_constant_hessian, const Json& forced_split_json) override; - + bool is_constant_hessian, Json& forced_split_json); void SetBaggingData(const Dataset* subset, const data_size_t* used_indices, data_size_t num_data) override { SerialTreeLearner::SetBaggingData(subset, used_indices, num_data); - // determine if we are using bagging before we construct the data partition - // thus we can start data movement to GPU earlier if (subset == nullptr && used_indices != nullptr) { if (num_data != num_data_) { use_bagging_ = true; @@ -147,10 +143,6 @@ class CUDATreeLearner: public SerialTreeLearner { td->histograms_wait_obj = &(histograms_wait_obj_[device_id]); } - // LGBM_CUDA: thread work - // typedef void * (*THREADFUNCPTR)(void *); - // void* launch_gpu_kernel(void *td); - /*! * \brief Wait for GPU kernel execution and read histogram * \param histograms Destination of histogram results from GPU. @@ -276,9 +268,6 @@ class CUDATreeLearner: public SerialTreeLearner { // cudaEvent_t features_future_; std::vector features_future_; - // LGBM_CUDA: use subset of training data for bagging - bool is_use_subset_; - // LGBM_CUDA: host-side buffer for converting feature data into featre4 data // std::vector host_vecs_; int nthreads_; // number of Feature4* vector on host4_vecs_ diff --git a/src/treelearner/data_parallel_tree_learner.cpp b/src/treelearner/data_parallel_tree_learner.cpp index 31425c77cd3..70e6d98354f 100644 --- a/src/treelearner/data_parallel_tree_learner.cpp +++ b/src/treelearner/data_parallel_tree_learner.cpp @@ -20,9 +20,9 @@ DataParallelTreeLearner::~DataParallelTreeLearner() { } template -void DataParallelTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) { // LGBM_CUDA +void DataParallelTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) { // initialize SerialTreeLearner - TREELEARNER_T::Init(train_data, is_constant_hessian, is_use_subset); // LGBM_CUDA + TREELEARNER_T::Init(train_data, is_constant_hessian); // Get local rank and global machine size rank_ = Network::rank(); num_machines_ = Network::num_machines(); diff --git a/src/treelearner/feature_parallel_tree_learner.cpp b/src/treelearner/feature_parallel_tree_learner.cpp index 3dde7f0f39b..69809e6069c 100644 --- a/src/treelearner/feature_parallel_tree_learner.cpp +++ b/src/treelearner/feature_parallel_tree_learner.cpp @@ -19,9 +19,9 @@ template FeatureParallelTreeLearner::~FeatureParallelTreeLearner() { } -template // LGBM_CUDA -void FeatureParallelTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) { - TREELEARNER_T::Init(train_data, is_constant_hessian, is_use_subset); // LGBM_CUDA +template +void FeatureParallelTreeLearner::Init(const Dataset* 
train_data, bool is_constant_hessian) { + TREELEARNER_T::Init(train_data, is_constant_hessian); rank_ = Network::rank(); num_machines_ = Network::num_machines(); diff --git a/src/treelearner/gpu_tree_learner.cpp b/src/treelearner/gpu_tree_learner.cpp index 7fb2a340a65..689314fd07a 100644 --- a/src/treelearner/gpu_tree_learner.cpp +++ b/src/treelearner/gpu_tree_learner.cpp @@ -36,9 +36,9 @@ GPUTreeLearner::~GPUTreeLearner() { } } -void GPUTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) { +void GPUTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) { // initialize SerialTreeLearner - SerialTreeLearner::Init(train_data, is_constant_hessian, is_use_subset); + SerialTreeLearner::Init(train_data, is_constant_hessian); // some additional variables needed for GPU trainer num_feature_groups_ = train_data_->num_feature_groups(); // Initialize GPU buffers and kernels diff --git a/src/treelearner/gpu_tree_learner.h b/src/treelearner/gpu_tree_learner.h index 2ed29bcd1f7..8568b7de014 100644 --- a/src/treelearner/gpu_tree_learner.h +++ b/src/treelearner/gpu_tree_learner.h @@ -45,7 +45,7 @@ class GPUTreeLearner: public SerialTreeLearner { public: explicit GPUTreeLearner(const Config* tree_config); ~GPUTreeLearner(); - void Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) override; + void Init(const Dataset* train_data, bool is_constant_hessian) override; void ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) override; void ResetIsConstantHessian(bool is_constant_hessian) override; Tree* Train(const score_t* gradients, const score_t *hessians, diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index 64b2405a592..23e9b150a1f 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -83,13 +83,8 @@ inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ } __syncthreads(); - output_buf[ltid * 2 + 0] = grad_bin; -#if CONST_HESSIAN == 0 output_buf[ltid * 2 + 1] = hess_bin; -#else - output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); -#endif } #if USE_CONSTANT_BUF == 1 @@ -434,13 +429,8 @@ inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ } __syncthreads(); - output_buf[ltid * 2 + 0] = grad_bin; -#if CONST_HESSIAN == 0 output_buf[ltid * 2 + 1] = hess_bin; -#else - output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); -#endif } #if USE_CONSTANT_BUF == 1 @@ -785,11 +775,7 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ __syncthreads(); output_buf[ltid * 2 + 0] = grad_bin; -#if CONST_HESSIAN == 0 output_buf[ltid * 2 + 1] = hess_bin; -#else - output_buf[ltid * 2 + 1] = as_acc_type((acc_int_type)cont_bin); -#endif } #if USE_CONSTANT_BUF == 1 diff --git a/src/treelearner/parallel_tree_learner.h b/src/treelearner/parallel_tree_learner.h index 222955a3c94..4bb62d203a9 100644 --- a/src/treelearner/parallel_tree_learner.h +++ b/src/treelearner/parallel_tree_learner.h @@ -28,7 +28,7 @@ class FeatureParallelTreeLearner: public TREELEARNER_T { public: explicit FeatureParallelTreeLearner(const Config* config); ~FeatureParallelTreeLearner(); - void Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) override; // LGBM_CUDA + void Init(const Dataset* train_data, bool is_constant_hessian) override; protected: void BeforeTrain() override; @@ 
-55,7 +55,7 @@ class DataParallelTreeLearner: public TREELEARNER_T { public: explicit DataParallelTreeLearner(const Config* config); ~DataParallelTreeLearner(); - void Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) override; // LGBM_CUDA + void Init(const Dataset* train_data, bool is_constant_hessian) override; void ResetConfig(const Config* config) override; protected: @@ -109,7 +109,7 @@ class VotingParallelTreeLearner: public TREELEARNER_T { public: explicit VotingParallelTreeLearner(const Config* config); ~VotingParallelTreeLearner() { } - void Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) override; // LGBM_CUDA + void Init(const Dataset* train_data, bool is_constant_hessian) override; void ResetConfig(const Config* config) override; protected: diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index b7f0d47982c..d7e09546d55 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -25,9 +25,7 @@ SerialTreeLearner::SerialTreeLearner(const Config* config) SerialTreeLearner::~SerialTreeLearner() { } -// LGBM_CUDA -void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) { - (void)is_use_subset; // UNUSED +void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) { train_data_ = train_data; num_data_ = train_data_->num_data(); num_features_ = train_data_->num_features(); diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 668b54592e7..20f87bbf549 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -53,8 +53,7 @@ class SerialTreeLearner: public TreeLearner { ~SerialTreeLearner(); - // LGBM_CUDA is_use_subset is used by CUDA only - void Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) override; + void Init(const Dataset* train_data, bool is_constant_hessian) override; void ResetTrainingData(const Dataset* train_data, bool is_constant_hessian) override { diff --git a/src/treelearner/voting_parallel_tree_learner.cpp b/src/treelearner/voting_parallel_tree_learner.cpp index 4b120975c26..265e94757aa 100644 --- a/src/treelearner/voting_parallel_tree_learner.cpp +++ b/src/treelearner/voting_parallel_tree_learner.cpp @@ -19,8 +19,8 @@ VotingParallelTreeLearner::VotingParallelTreeLearner(const Config } template -void VotingParallelTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian, bool is_use_subset) { // LGBM_CUDA - TREELEARNER_T::Init(train_data, is_constant_hessian, is_use_subset); // LGBM_CUDA +void VotingParallelTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) { + TREELEARNER_T::Init(train_data, is_constant_hessian); rank_ = Network::rank(); num_machines_ = Network::num_machines(); From 1842c826a804ca6f778deb50b1a1cef06e5ed27c Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Wed, 24 Jun 2020 20:01:27 +0000 Subject: [PATCH 077/119] Another rebase from master to fix recent conflicts. 
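This rebase resolves a conflict with an upstream interface change: FindBestSplits now receives the tree being grown, so the CUDA learner's override has to pick up the new signature and keep delegating to SerialTreeLearner, as the hunk below shows. A minimal standalone sketch of that override-and-delegate shape, using hypothetical class names rather than the repository's actual headers:

    #include <cstdio>

    // Hypothetical stand-ins for the tree / learner hierarchy.
    struct Tree {
      int num_leaves;
    };

    class BaseLearner {
     public:
      virtual ~BaseLearner() {}
      // The split search now takes the partially grown tree.
      virtual void FindBestSplits(const Tree* tree) {
        std::printf("base split search, %d leaves\n", tree->num_leaves);
      }
    };

    class DeviceLearner : public BaseLearner {
     public:
      // Keep the signature in sync with the base and delegate to it,
      // adding device-side work around the call.
      void FindBestSplits(const Tree* tree) override {
        BaseLearner::FindBestSplits(tree);
        std::printf("device-side post-processing\n");
      }
    };

    int main() {
      Tree t;
      t.num_leaves = 1;
      DeviceLearner learner;
      learner.FindBestSplits(&t);
      return 0;
    }

If the derived class kept the old parameterless signature it would no longer override anything, and calls through a base pointer would run the base implementation unchanged; the override keyword turns that silent mismatch into a compile error, which is why signature drift like this surfaces during a rebase.
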
--- src/treelearner/cuda_tree_learner.cpp | 4 ++-- src/treelearner/cuda_tree_learner.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 5a9b9b73c4f..cf55e35ca32 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -1016,8 +1016,8 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ } } -void CUDATreeLearner::FindBestSplits() { - SerialTreeLearner::FindBestSplits(); +void CUDATreeLearner::FindBestSplits(const Tree* tree) { + SerialTreeLearner::FindBestSplits(tree); #if GPU_DEBUG >= 3 for (int feature_index = 0; feature_index < num_features_; ++feature_index) { diff --git a/src/treelearner/cuda_tree_learner.h b/src/treelearner/cuda_tree_learner.h index 6a2a0e06e52..2d357228848 100644 --- a/src/treelearner/cuda_tree_learner.h +++ b/src/treelearner/cuda_tree_learner.h @@ -60,7 +60,7 @@ class CUDATreeLearner: public SerialTreeLearner { protected: void BeforeTrain() override; bool BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) override; - void FindBestSplits() override; + void FindBestSplits(const Tree* tree) override; void Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) override; void ConstructHistograms(const std::vector& is_feature_used, bool use_subtract) override; From 7a796975f5788c73405f2db29bd1869fb1a56700 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Wed, 24 Jun 2020 20:11:15 +0000 Subject: [PATCH 078/119] More lint. --- src/treelearner/cuda_tree_learner.cpp | 4 ++-- src/treelearner/cuda_tree_learner.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index cf55e35ca32..93b1e9e98ac 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -577,11 +577,11 @@ void CUDATreeLearner::InitGPU(int num_gpu) { AllocateGPUMemory(); - copyDenseFeature(); // LGBM_CUDA + copyDenseFeature(); // LGBM_CUDA } Tree* CUDATreeLearner::Train(const score_t* gradients, const score_t *hessians, - bool is_constant_hessian, Json& forced_split_json) { + bool is_constant_hessian, const Json& forced_split_json) { // check if we need to recompile the GPU kernel (is_constant_hessian changed) // this should rarely occur diff --git a/src/treelearner/cuda_tree_learner.h b/src/treelearner/cuda_tree_learner.h index 2d357228848..5e3d9e650e4 100644 --- a/src/treelearner/cuda_tree_learner.h +++ b/src/treelearner/cuda_tree_learner.h @@ -39,13 +39,13 @@ namespace LightGBM { * \brief CUDA-based parallel learning algorithm. 
*/ class CUDATreeLearner: public SerialTreeLearner { -public: + public: explicit CUDATreeLearner(const Config* tree_config); ~CUDATreeLearner(); void Init(const Dataset* train_data, bool is_constant_hessian) override; void ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) override; Tree* Train(const score_t* gradients, const score_t *hessians, - bool is_constant_hessian, Json& forced_split_json); + bool is_constant_hessian, const Json& forced_split_json); void SetBaggingData(const Dataset* subset, const data_size_t* used_indices, data_size_t num_data) override { SerialTreeLearner::SetBaggingData(subset, used_indices, num_data); if (subset == nullptr && used_indices != nullptr) { From f37ab3b174c77ac57b096bfeb203c12f049a1445 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Thu, 2 Jul 2020 14:20:51 +0000 Subject: [PATCH 079/119] Simple code cleanup - add & remove blank lines, revert unneccessary format changes, remove added dead code. --- include/LightGBM/bin.h | 1 - src/boosting/gbdt.cpp | 6 ++-- src/io/config.cpp | 1 - src/io/config_auto.cpp | 5 ---- src/io/dataset.cpp | 40 ++++++++----------------- src/treelearner/serial_tree_learner.cpp | 15 ++++------ 6 files changed, 21 insertions(+), 47 deletions(-) diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index 96ae6a8d641..e541e7039e9 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -308,7 +308,6 @@ class Bin { * \param ordered_hessians Pointer to hessians, the data_indices[i]-th data's hessian is ordered_hessians[i] * \param out Output Result */ - virtual void ConstructHistogram( const data_size_t* data_indices, data_size_t start, data_size_t end, const score_t* ordered_gradients, const score_t* ordered_hessians, diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 39bbccabff3..c351240a266 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -775,10 +775,12 @@ void GBDT::ResetBaggingConfig(const Config* config, bool is_change_dataset) { bagging_rands_.emplace_back(config_->bagging_seed + i); } + double average_bag_rate = + (static_cast(bag_data_cnt_) / num_data_) / config->bagging_freq; is_use_subset_ = false; const int group_threshold_usesubset = 100; - double average_bag_rate = (static_cast(bag_data_cnt_) / num_data_) / config->bagging_freq; - if (average_bag_rate <= 0.5 && (train_data_->num_feature_groups() < group_threshold_usesubset)) { + if (average_bag_rate <= 0.5 + && (train_data_->num_feature_groups() < group_threshold_usesubset)) { if (tmp_subset_ == nullptr || is_change_dataset) { tmp_subset_.reset(new Dataset(bag_data_cnt_)); tmp_subset_->CopyFeatureMapperFrom(train_data_); diff --git a/src/io/config.cpp b/src/io/config.cpp index ed643204c91..7ee8a74487f 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -321,7 +321,6 @@ void Config::CheckParamConflict() { num_leaves = static_cast(full_num_leaves); } } - // force col-wise for gpu if (device_type == std::string("gpu")) { force_col_wise = true; diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index ba9c07cb547..6cd26c58c16 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -485,11 +485,6 @@ void Config::GetMembersFromString(const std::unordered_map #include @@ -238,17 +237,12 @@ std::vector> FindGroups( return features_in_group; } -std::vector> FastFeatureBundling(const std::vector>& bin_mappers, - int** sample_indices, - double** sample_values, - const int* num_per_col, - int num_sample_col, - data_size_t total_sample_cnt, - const 
std::vector& used_features, - data_size_t num_data, - bool is_sparse, - std::vector* multi_val_group, - bool is_use_gpu) { +std::vector> FastFeatureBundling( + const std::vector>& bin_mappers, + int** sample_indices, double** sample_values, const int* num_per_col, + int num_sample_col, data_size_t total_sample_cnt, + const std::vector& used_features, data_size_t num_data, + bool is_use_gpu, bool is_sparse, std::vector* multi_val_group) { Common::FunctionTimer fun_timer("Dataset::FastFeatureBundling", global_timer); std::vector feature_non_zero_cnt; feature_non_zero_cnt.reserve(used_features.size()); @@ -355,17 +349,11 @@ void Dataset::Construct(std::vector>* bin_mappers, std::vector group_is_multi_val(used_features.size(), 0); if (io_config.enable_bundle && !used_features.empty()) { bool lgbm_is_gpu_used = io_config.device_type == std::string("gpu") || io_config.device_type == std::string("cuda"); // LGBM_CUDA - features_in_group = FastFeatureBundling(*bin_mappers, - sample_non_zero_indices, - sample_values, - num_per_col, - num_sample_col, - static_cast(total_sample_cnt), - used_features, - num_data_, - io_config.is_enable_sparse, - &group_is_multi_val, - lgbm_is_gpu_used); + features_in_group = FastFeatureBundling( + *bin_mappers, sample_non_zero_indices, sample_values, num_per_col, + num_sample_col, static_cast(total_sample_cnt), + used_features, num_data_, lgbm_is_gpu_used, + io_config.is_enable_sparse, &group_is_multi_val); } num_features_ = 0; @@ -804,7 +792,6 @@ void Dataset::CopySubrow(const Dataset* fullset, CHECK_EQ(num_used_indices, num_data_); OMP_INIT_EX(); #pragma omp parallel for schedule(static) - for (int group = 0; group < num_groups_; ++group) { OMP_LOOP_EX_BEGIN(); feature_groups_[group]->CopySubrow(fullset->feature_groups_[group].get(), @@ -1282,6 +1269,7 @@ void Dataset::ConstructHistogramsMultiVal(const data_size_t* data_indices, } OMP_THROW_EX(); global_timer.Stop("Dataset::sparse_bin_histogram"); + global_timer.Start("Dataset::sparse_bin_histogram_merge"); int n_bin_block = 1; int bin_block_size = num_bin; @@ -1311,12 +1299,10 @@ void Dataset::ConstructHistogramsInner( data_size_t num_data, const score_t* gradients, const score_t* hessians, score_t* ordered_gradients, score_t* ordered_hessians, TrainingShareStates* share_state, hist_t* hist_data) const { - if (!share_state->is_colwise) { return ConstructHistogramsMultiVal( data_indices, num_data, gradients, hessians, share_state, hist_data); } - std::vector used_dense_group; int multi_val_groud_id = -1; used_dense_group.reserve(num_groups_); @@ -1338,7 +1324,6 @@ void Dataset::ConstructHistogramsInner( } } } - int num_used_dense_group = static_cast(used_dense_group.size()); global_timer.Start("Dataset::dense_bin_histogram"); auto ptr_ordered_grad = gradients; @@ -1361,7 +1346,6 @@ void Dataset::ConstructHistogramsInner( ptr_ordered_grad = ordered_gradients; } } - OMP_INIT_EX(); #pragma omp parallel for schedule(static) num_threads(share_state->num_threads) for (int gi = 0; gi < num_used_dense_group; ++gi) { diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index d7e09546d55..3f4e192fb7c 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -185,7 +185,6 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians for (int split = init_splits; split < config_->num_leaves - 1; ++split) { // some initial works before finding best split - if (BeforeFindBestSplit(tree_prt, left_leaf, right_leaf)) { // find 
best threshold for every feature FindBestSplits(tree_prt); @@ -350,7 +349,8 @@ void SerialTreeLearner::ConstructHistograms( Common::FunctionTimer fun_timer("SerialTreeLearner::ConstructHistograms", global_timer); // construct smaller leaf - hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - kHistOffset; + hist_t* ptr_smaller_leaf_hist_data = + smaller_leaf_histogram_array_[0].RawData() - kHistOffset; train_data_->ConstructHistograms( is_feature_used, smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(), gradients_, hessians_, @@ -359,7 +359,8 @@ void SerialTreeLearner::ConstructHistograms( if (larger_leaf_histogram_array_ != nullptr && !use_subtract) { // construct larger leaf - hist_t* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - kHistOffset; + hist_t* ptr_larger_leaf_hist_data = + larger_leaf_histogram_array_[0].RawData() - kHistOffset; train_data_->ConstructHistograms( is_feature_used, larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(), gradients_, hessians_, @@ -388,7 +389,6 @@ void SerialTreeLearner::FindBestSplitsFromHistograms( continue; } const int tid = omp_get_thread_num(); - train_data_->FixHistogram( feature_index, smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians(), @@ -422,12 +422,12 @@ void SerialTreeLearner::FindBestSplitsFromHistograms( larger_node_used_features[feature_index], larger_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_.get(), &larger_best[tid]); + OMP_LOOP_EX_END(); } OMP_THROW_EX(); auto smaller_best_idx = ArrayArgs::ArgMax(smaller_best); int leaf = smaller_leaf_splits_->leaf_index(); - best_split_per_leaf_[leaf] = smaller_best[smaller_best_idx]; if (larger_leaf_splits_ != nullptr && @@ -557,9 +557,7 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, const Json& forced_split_json void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf, int* right_leaf, bool update_cnt) { Common::FunctionTimer fun_timer("SerialTreeLearner::SplitInner", global_timer); - SplitInfo& best_split_info = best_split_per_leaf_[best_leaf]; - const int inner_feature_index = train_data_->InnerFeatureIndex(best_split_info.feature); if (cegb_ != nullptr) { @@ -718,7 +716,6 @@ void SerialTreeLearner::ComputeBestSplitForFeature( FeatureHistogram* histogram_array_, int feature_index, int real_fidx, bool is_feature_used, int num_data, const LeafSplits* leaf_splits, SplitInfo* best_split) { - if (!is_feature_used) { return; } @@ -733,11 +730,9 @@ void SerialTreeLearner::ComputeBestSplitForFeature( } else { parent_output = leaf_splits->weight(); } - histogram_array_[feature_index].FindBestThreshold( leaf_splits->sum_gradients(), leaf_splits->sum_hessians(), num_data, constraints_->Get(leaf_splits->leaf_index()), parent_output, &new_split); - new_split.feature = real_fidx; if (cegb_ != nullptr) { new_split.gain -= From 9ff6a2b4e53b74a6c92b7abc2b4a1efe94640f0f Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 6 Jul 2020 13:36:31 +0000 Subject: [PATCH 080/119] Removed parameters added for CUDA and various bug fix. 
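This cleanup narrows the virtual Train interface back to gradients and hessians only: the constant-hessian flag now comes from share_state_ and the forced-split specification from the forced_split_json_ member, as the hunks below show. A minimal sketch of that shape, with hypothetical names (ShareState, Learner) standing in for the real LightGBM types:

    #include <cstdio>
    #include <vector>

    // Options that rarely change per call live in state owned by the
    // learner instead of widening the virtual Train() signature.
    struct ShareState {
      bool is_constant_hessian;
    };

    class Learner {
     public:
      explicit Learner(ShareState* state) : share_state_(state) {}

      // Narrow interface: per-iteration data only.
      void Train(const std::vector<float>& gradients,
                 const std::vector<float>& hessians) {
        if (share_state_->is_constant_hessian) {
          std::printf("constant hessian, using %zu gradients\n",
                      gradients.size());
        } else {
          std::printf("using %zu gradients and %zu hessians\n",
                      gradients.size(), hessians.size());
        }
      }

     private:
      ShareState* share_state_;
    };

    int main() {
      ShareState state;
      state.is_constant_hessian = true;
      Learner learner(&state);
      learner.Train({0.1f, -0.2f}, {1.0f, 1.0f});
      return 0;
    }

Keeping the signature narrow also lets every subclass (serial, GPU, CUDA, and the parallel variants) drop the extra parameters in one step instead of threading them through each override.
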
--- include/LightGBM/tree_learner.h | 3 +-- src/boosting/gbdt.cpp | 3 +-- src/boosting/gbdt.h | 10 +-------- src/boosting/goss.hpp | 22 +++++++++++++++++- src/boosting/rf.hpp | 2 +- src/treelearner/cuda_tree_learner.cpp | 30 +++++++++---------------- src/treelearner/cuda_tree_learner.h | 3 +-- src/treelearner/gpu_tree_learner.cpp | 5 ++--- src/treelearner/gpu_tree_learner.h | 3 +-- src/treelearner/serial_tree_learner.cpp | 18 +++++---------- src/treelearner/serial_tree_learner.h | 8 +++---- 11 files changed, 47 insertions(+), 60 deletions(-) diff --git a/include/LightGBM/tree_learner.h b/include/LightGBM/tree_learner.h index 2493122e3cb..2231f385c6e 100644 --- a/include/LightGBM/tree_learner.h +++ b/include/LightGBM/tree_learner.h @@ -56,7 +56,7 @@ class TreeLearner { * \param is_constant_hessian True if all hessians share the same value * \return A trained tree */ - virtual Tree* Train(const score_t* gradients, const score_t* hessians, bool is_constant_hessian, const Json& forced_split_json) = 0; + virtual Tree* Train(const score_t* gradients, const score_t* hessians) = 0; /*! * \brief use an existing tree to fit the new gradients and hessians. @@ -68,7 +68,6 @@ class TreeLearner { /*! * \brief Set bagging data - * \param subset subset of bagging * \param used_indices Used data indices * \param num_data Number of used data */ diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index c351240a266..8cec5796127 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -378,8 +378,7 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { grad = gradients_.data() + offset; hess = hessians_.data() + offset; } - // LGBM_CUDA - new_tree.reset(tree_learner_->Train(grad, hess, is_constant_hessian_, forced_splits_json_)); + new_tree.reset(tree_learner_->Train(grad, hess)); } if (new_tree->num_leaves() > 1) { diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index d22b6687766..1b82efc0cbf 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -8,7 +8,6 @@ #include #include #include -#include #include #include @@ -22,6 +21,7 @@ #include #include +#include #include "score_updater.hpp" #ifdef USE_CUDA @@ -148,14 +148,6 @@ class GBDT : public GBDTBase { */ bool TrainOneIter(const score_t* gradients, const score_t* hessians) override; - /*! - * \brief Training logic - * \param gradients nullptr for using default objective, otherwise use self-defined boosting - * \param hessians nullptr for using default objective, otherwise use self-defined boosting - * \return True if cannot train any more - */ - bool TrainOneIterCUDA(const score_t* gradients, const score_t* hessians); // LGBM_CUDA - /*! 
* \brief Rollback one iteration */ diff --git a/src/boosting/goss.hpp b/src/boosting/goss.hpp index cd512e243d8..2af6dee14f6 100644 --- a/src/boosting/goss.hpp +++ b/src/boosting/goss.hpp @@ -131,7 +131,27 @@ class GOSS: public GBDT { bag_data_cnt_ = num_data_; // not subsample for first iterations if (iter < static_cast(1.0f / config_->learning_rate)) { return; } - GBDT::Bagging(iter); + auto left_cnt = bagging_runner_.Run( + num_data_, + [=](int, data_size_t cur_start, data_size_t cur_cnt, data_size_t* left, + data_size_t*) { + data_size_t cur_left_count = 0; + cur_left_count = BaggingHelper(cur_start, cur_cnt, left); + return cur_left_count; + }, + bag_data_indices_.data()); + bag_data_cnt_ = left_cnt; + // set bagging data to tree learner + if (!is_use_subset_) { + tree_learner_->SetBaggingData(nullptr, bag_data_indices_.data(), bag_data_cnt_); + } else { + // get subset + tmp_subset_->ReSize(bag_data_cnt_); + tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(), + bag_data_cnt_, false); + tree_learner_->SetBaggingData(tmp_subset_.get(), bag_data_indices_.data(), + bag_data_cnt_); + } } protected: diff --git a/src/boosting/rf.hpp b/src/boosting/rf.hpp index e64bf6cb4d8..5c90202a515 100644 --- a/src/boosting/rf.hpp +++ b/src/boosting/rf.hpp @@ -125,7 +125,7 @@ class RF : public GBDT { hess = tmp_hess_.data(); } - new_tree.reset(tree_learner_->Train(grad, hess, is_constant_hessian_, forced_splits_json_)); + new_tree.reset(tree_learner_->Train(grad, hess)); } if (new_tree->num_leaves() > 1) { diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 93b1e9e98ac..97010367f81 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -252,14 +252,14 @@ void CUDATreeLearner::WaitAndGetHistograms(FeatureHistogram* leaf_histogram_arra // LGBM_CUDA } + HistType* histograms = reinterpret_cast(leaf_histogram_array[0].RawData() - kHistOffset); #pragma omp parallel for schedule(static) for (int i = 0; i < num_dense_feature_groups_; ++i) { if (!feature_masks_[i]) { continue; } int dense_group_index = dense_feature_group_map_[i]; - // auto old_histogram_array = histograms + train_data_->GroupBinBoundary(dense_group_index) * 2; - auto old_histogram_array = leaf_histogram_array[dense_group_index].RawData() - kHistOffset; + auto old_histogram_array = histograms + train_data_->GroupBinBoundary(dense_group_index) * 2; int bin_size = train_data_->FeatureGroupNumBin(dense_group_index); for (int j = 0; j < bin_size; ++j) { @@ -580,18 +580,8 @@ void CUDATreeLearner::InitGPU(int num_gpu) { copyDenseFeature(); // LGBM_CUDA } -Tree* CUDATreeLearner::Train(const score_t* gradients, const score_t *hessians, - bool is_constant_hessian, const Json& forced_split_json) { - // check if we need to recompile the GPU kernel (is_constant_hessian changed) - // this should rarely occur - - if (is_constant_hessian != is_constant_hessian_) { - Log::Debug("Recompiling GPU kernel because hessian is %sa constant now", is_constant_hessian ? 
"" : "not "); - is_constant_hessian_ = is_constant_hessian; - } - - Tree *ret = SerialTreeLearner::Train(gradients, hessians, is_constant_hessian, forced_split_json); - +Tree* CUDATreeLearner::Train(const score_t* gradients, const score_t *hessians) { + Tree *ret = SerialTreeLearner::Train(gradients, hessians); return ret; } @@ -666,7 +656,7 @@ void CUDATreeLearner::BeforeTrain() { Log::Debug("CudaTreeLearner::BeforeTrain() No baggings, dense_feature_groups_=%d", num_dense_feature_groups_); for (int device_id = 0; device_id < num_gpu_; ++device_id) { - if (!is_constant_hessian_) { + if (!(share_state_->is_constant_hessian)) { Log::Debug("CUDATreeLearner::BeforeTrain(): Starting hessians_ -> device_hessians_"); #if cudaMemcpy_DEBUG == 1 // LGBM_CUDA @@ -721,7 +711,7 @@ void CUDATreeLearner::BeforeTrain() { CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_data_indices_[device_id], indices, cnt * sizeof(*indices), cudaMemcpyHostToDevice, stream_[device_id])); CUDASUCCESS_OR_FATAL(cudaEventRecord(indices_future_[device_id], stream_[device_id])); - if (!is_constant_hessian_) { + if (!(share_state_->is_constant_hessian)) { CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_hessians_[device_id], const_cast(reinterpret_cast(&(hessians_[0]))), num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); CUDASUCCESS_OR_FATAL(cudaEventRecord(hessians_future_[device_id], stream_[device_id])); } @@ -920,7 +910,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ continue; } if (num_data == num_data_) { - if (is_constant_hessian_) { + if (share_state_->is_constant_hessian) { printf("ConstructHistogram(): num_data == num_data_ is_constant_hessian_\n"); train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( 0, @@ -936,7 +926,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ current_histogram); } } else { - if (is_constant_hessian_) { + if (share_state_->is_constant_hessian) { printf("ConstructHistogram(): is_constant_hessian_\n"); train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( smaller_leaf_splits_->data_indices(), @@ -956,11 +946,11 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ } int retval; if ((num_data != num_data_) && compare) { - retval = CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index, config_->gpu_use_dp, is_constant_hessian_); + retval = CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index, config_->gpu_use_dp, share_state_->is_constant_hessian); printf("CompareHistograms reports %d errors\n", retval); compare = false; } - retval = CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index, config_->gpu_use_dp, is_constant_hessian_); + retval = CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index, config_->gpu_use_dp, share_state_->is_constant_hessian); if (num_data == num_data_) { printf("CompareHistograms reports %d errors\n", retval); } else { diff --git a/src/treelearner/cuda_tree_learner.h b/src/treelearner/cuda_tree_learner.h index 5e3d9e650e4..abe1fd303c2 100644 --- a/src/treelearner/cuda_tree_learner.h +++ b/src/treelearner/cuda_tree_learner.h @@ -44,8 +44,7 @@ class CUDATreeLearner: public SerialTreeLearner { ~CUDATreeLearner(); void Init(const Dataset* train_data, bool is_constant_hessian) override; void ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) override; - Tree* 
Train(const score_t* gradients, const score_t *hessians, - bool is_constant_hessian, const Json& forced_split_json); + Tree* Train(const score_t* gradients, const score_t *hessians); void SetBaggingData(const Dataset* subset, const data_size_t* used_indices, data_size_t num_data) override { SerialTreeLearner::SetBaggingData(subset, used_indices, num_data); if (subset == nullptr && used_indices != nullptr) { diff --git a/src/treelearner/gpu_tree_learner.cpp b/src/treelearner/gpu_tree_learner.cpp index 689314fd07a..df90aafb945 100644 --- a/src/treelearner/gpu_tree_learner.cpp +++ b/src/treelearner/gpu_tree_learner.cpp @@ -734,9 +734,8 @@ void GPUTreeLearner::InitGPU(int platform_id, int device_id) { SetupKernelArguments(); } -Tree* GPUTreeLearner::Train(const score_t* gradients, const score_t *hessians, - bool is_constant_hessian, const Json& forced_split_json) { - return SerialTreeLearner::Train(gradients, hessians, is_constant_hessian, forced_split_json); +Tree* GPUTreeLearner::Train(const score_t* gradients, const score_t *hessians) { + return SerialTreeLearner::Train(gradients, hessians); } void GPUTreeLearner::ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) { diff --git a/src/treelearner/gpu_tree_learner.h b/src/treelearner/gpu_tree_learner.h index 8568b7de014..a909c57cbad 100644 --- a/src/treelearner/gpu_tree_learner.h +++ b/src/treelearner/gpu_tree_learner.h @@ -48,8 +48,7 @@ class GPUTreeLearner: public SerialTreeLearner { void Init(const Dataset* train_data, bool is_constant_hessian) override; void ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) override; void ResetIsConstantHessian(bool is_constant_hessian) override; - Tree* Train(const score_t* gradients, const score_t *hessians, - bool is_constant_hessian, const Json& forced_split_json) override; + Tree* Train(const score_t* gradients, const score_t *hessians) override; void SetBaggingData(const Dataset* subset, const data_size_t* used_indices, data_size_t num_data) override { SerialTreeLearner::SetBaggingData(subset, used_indices, num_data); diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 3f4e192fb7c..65f7fa3dd07 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -148,11 +148,10 @@ void SerialTreeLearner::ResetConfig(const Config* config) { constraints_.reset(LeafConstraintsBase::Create(config_, config_->num_leaves)); } -Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians, bool is_constant_hessian, const Json& forced_split_json) { +Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians) { Common::FunctionTimer fun_timer("SerialTreeLearner::Train", global_timer); gradients_ = gradients; hessians_ = hessians; - is_constant_hessian_ = is_constant_hessian; int num_threads = OMP_NUM_THREADS(); if (share_state_->num_threads != num_threads && share_state_->num_threads > 0) { Log::Warning( @@ -176,12 +175,7 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians // only root leaf can be splitted on first time int right_leaf = -1; - int init_splits = 0; - bool aborted_last_force_split = false; - if (!forced_split_json.is_null()) { - init_splits = ForceSplits(tree_prt, forced_split_json, &left_leaf, - &right_leaf, &cur_depth, &aborted_last_force_split); - } + int init_splits = ForceSplits(tree_prt, &left_leaf, &right_leaf, &cur_depth); for (int 
split = init_splits; split < config_->num_leaves - 1; ++split) { // some initial works before finding best split @@ -438,10 +432,8 @@ void SerialTreeLearner::FindBestSplitsFromHistograms( } } -int32_t SerialTreeLearner::ForceSplits(Tree* tree, const Json& forced_split_json, int* left_leaf, - int* right_leaf, int *cur_depth, - bool *aborted_last_force_split) { - (void)aborted_last_force_split; +int32_t SerialTreeLearner::ForceSplits(Tree* tree, int* left_leaf, + int* right_leaf, int *cur_depth) { bool abort_last_forced_split = false; if (forced_split_json_ == nullptr) { return 0; @@ -450,7 +442,7 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, const Json& forced_split_json // start at root leaf *left_leaf = 0; std::queue> q; - Json left = forced_split_json; + Json left = *forced_split_json_; Json right; bool left_smaller = true; std::unordered_map forceSplitMap; diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 20f87bbf549..946d052d86d 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -78,8 +78,7 @@ class SerialTreeLearner: public TreeLearner { } } - Tree* Train(const score_t* gradients, const score_t *hessians, bool is_constant_hessian, - const Json& forced_split_json) override; + Tree* Train(const score_t* gradients, const score_t *hessians) override; Tree* FitByExistingTree(const Tree* old_tree, const score_t* gradients, const score_t* hessians) const override; @@ -162,9 +161,8 @@ class SerialTreeLearner: public TreeLearner { bool update_cnt); /* Force splits with forced_split_json dict and then return num splits forced.*/ - virtual int32_t ForceSplits(Tree* tree, const Json& forced_split_json, int* left_leaf, - int* right_leaf, int* cur_depth, - bool *aborted_last_force_split); + virtual int32_t ForceSplits(Tree* tree, int* left_leaf, int* right_leaf, + int* cur_depth); /*! * \brief Get the number of data in a leaf From e0ad9d55139496de8ba880cf3642a4595d0fcf29 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 6 Jul 2020 14:10:54 +0000 Subject: [PATCH 081/119] Yet more lint and unneccessary changes. --- include/LightGBM/tree_learner.h | 2 +- src/boosting/gbdt.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/LightGBM/tree_learner.h b/include/LightGBM/tree_learner.h index 2231f385c6e..e0fb3489057 100644 --- a/include/LightGBM/tree_learner.h +++ b/include/LightGBM/tree_learner.h @@ -53,7 +53,6 @@ class TreeLearner { * \brief training tree model on dataset * \param gradients The first order gradients * \param hessians The second order gradients - * \param is_constant_hessian True if all hessians share the same value * \return A trained tree */ virtual Tree* Train(const score_t* gradients, const score_t* hessians) = 0; @@ -68,6 +67,7 @@ class TreeLearner { /*! * \brief Set bagging data + * \param subset subset of bagging * \param used_indices Used data indices * \param num_data Number of used data */ diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index 1b82efc0cbf..be0e677310b 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -21,7 +22,6 @@ #include #include -#include #include "score_updater.hpp" #ifdef USE_CUDA From 90709e6dacaa910063206190c95323e620740470 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 6 Jul 2020 14:26:28 +0000 Subject: [PATCH 082/119] Revert another change. 
--- src/treelearner/serial_tree_learner.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 946d052d86d..14a7b807d0c 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -161,8 +161,8 @@ class SerialTreeLearner: public TreeLearner { bool update_cnt); /* Force splits with forced_split_json dict and then return num splits forced.*/ - virtual int32_t ForceSplits(Tree* tree, int* left_leaf, int* right_leaf, - int* cur_depth); + int32_t ForceSplits(Tree* tree, int* left_leaf, int* right_leaf, + int* cur_depth); /*! * \brief Get the number of data in a leaf From 99e459b2d31bb46d7c5b81898b7266d0cedd9bcf Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 6 Jul 2020 17:29:55 +0000 Subject: [PATCH 083/119] Removal of unneccessary code. --- src/treelearner/cuda_tree_learner.cpp | 10 ++-------- src/treelearner/cuda_tree_learner.h | 2 +- src/treelearner/serial_tree_learner.cpp | 1 - src/treelearner/serial_tree_learner.h | 1 - 4 files changed, 3 insertions(+), 11 deletions(-) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 97010367f81..7f32c23c398 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -911,7 +911,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ } if (num_data == num_data_) { if (share_state_->is_constant_hessian) { - printf("ConstructHistogram(): num_data == num_data_ is_constant_hessian_\n"); + printf("ConstructHistogram(): num_data == num_data_ is_constant_hessian\n"); train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( 0, num_data, @@ -927,7 +927,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ } } else { if (share_state_->is_constant_hessian) { - printf("ConstructHistogram(): is_constant_hessian_\n"); + printf("ConstructHistogram(): is_constant_hessian\n"); train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( smaller_leaf_splits_->data_indices(), 0, @@ -978,12 +978,6 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ // We set data_indices to null to avoid rebuilding ordered gradients/hessians if (num_sparse_features > 0) { - // train_data_->ConstructHistograms(is_sparse_feature_used, - // nullptr, larger_leaf_splits_->num_data_in_leaf(), - // larger_leaf_splits_->leaf_index(), - // ordered_bins_, gradients_, hessians_, - // ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_, - // ptr_larger_leaf_hist_data); train_data_->ConstructHistograms(is_sparse_feature_used, larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(), gradients_, hessians_, diff --git a/src/treelearner/cuda_tree_learner.h b/src/treelearner/cuda_tree_learner.h index abe1fd303c2..46e31985a5d 100644 --- a/src/treelearner/cuda_tree_learner.h +++ b/src/treelearner/cuda_tree_learner.h @@ -117,7 +117,7 @@ class CUDATreeLearner: public SerialTreeLearner { td->leaf_num_data = leaf_num_data; td->num_data = num_data_; td->use_all_features = use_all_features; - td->is_constant_hessian = is_constant_hessian_; + td->is_constant_hessian = share_state_->is_constant_hessian; td->num_workgroups = num_workgroups; td->stream = stream_[device_id]; td->device_features = device_features_[device_id]; diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 65f7fa3dd07..d076e4afd2f 
100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -29,7 +29,6 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian train_data_ = train_data; num_data_ = train_data_->num_data(); num_features_ = train_data_->num_features(); - is_constant_hessian_ = is_constant_hessian; int max_cache_size = 0; // Get the max size of pool if (config_->histogram_pool_size <= 0) { diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 14a7b807d0c..79882ded79e 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -225,7 +225,6 @@ class SerialTreeLearner: public TreeLearner { const Json* forced_split_json_; std::unique_ptr share_state_; std::unique_ptr cegb_; - bool is_constant_hessian_; }; inline data_size_t SerialTreeLearner::GetGlobalDataCountInLeaf(int leaf_idx) const { From f40d77b7fea05d428d5f9aafe269d53ebf9548ca Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Tue, 7 Jul 2020 13:29:57 +0000 Subject: [PATCH 084/119] temporary appveyor.yml for building and testing --- appveyor.yml | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 appveyor.yml diff --git a/appveyor.yml b/appveyor.yml new file mode 100644 index 00000000000..b4c0131a9af --- /dev/null +++ b/appveyor.yml @@ -0,0 +1,50 @@ +version: 2.3.2.{build} + +image: Visual Studio 2015 +platform: x64 +configuration: # a trick to construct a build matrix with multiple Python versions + - 3.7 + +# only build pull requests and +# commits to 'cuda' +branches: + only: + - cuda + +environment: + matrix: + - COMPILER: MSVC + TASK: python + - COMPILER: MINGW + TASK: python + +clone_depth: 5 + +init: + - ps: iex ((new-object net.webclient).DownloadString('https://raw.githubusercontent.com/appveyor/ci/master/scripts/enable-rdp.ps1')) + +install: + - git submodule update --init --recursive # get `compute` folder + - set PATH=%PATH:C:\Program Files\Git\usr\bin;=% # delete sh.exe from PATH (mingw32-make fix) + - set PATH=C:\mingw-w64\x86_64-8.1.0-posix-seh-rt_v6-rev0\mingw64\bin;%PATH% + - set PYTHON_VERSION=%CONFIGURATION% + - set CONDA_ENV="test-env" + - ps: >- + switch ($env:PYTHON_VERSION) { + "2.7" {$env:MINICONDA = "C:\Miniconda-x64"} + "3.5" {$env:MINICONDA = "C:\Miniconda35-x64"} + "3.6" {$env:MINICONDA = "C:\Miniconda36-x64"} + "3.7" {$env:MINICONDA = "C:\Miniconda37-x64"} + default {$env:MINICONDA = "C:\Miniconda37-x64"} + } + $env:PATH="$env:MINICONDA;$env:MINICONDA\Scripts;$env:PATH" + - ps: $env:LGB_VER = (Get-Content $env:APPVEYOR_BUILD_FOLDER\VERSION.txt).trim() + +build: false + +test_script: + - conda init powershell + - powershell.exe -ExecutionPolicy Bypass -File %APPVEYOR_BUILD_FOLDER%\.ci\test_windows.ps1 + +on_finish: + - ps: $blockRdp = $true; iex ((new-object net.webclient).DownloadString('https://raw.githubusercontent.com/appveyor/ci/master/scripts/enable-rdp.ps1')) From d900b6484159f315b2fb86feb2f71e9280fe95a6 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Tue, 7 Jul 2020 16:54:03 +0000 Subject: [PATCH 085/119] Remove return value in ReSize --- include/LightGBM/dataset.h | 4 +--- src/io/dataset.cpp | 3 +-- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index 0fd0dfc6d15..d86c0e3d7cb 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -440,9 +440,7 @@ class Dataset { } return ret; } - - // LGBM_CUDA ReSize() returns true if 
resized - bool ReSize(data_size_t num_data); + void ReSize(data_size_t num_data); void CopySubrow(const Dataset* fullset, const data_size_t* used_indices, data_size_t num_used_indices, bool need_meta_data); diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 054c17e0a01..1b94e232802 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -771,8 +771,7 @@ void Dataset::CreateValid(const Dataset* dataset) { forced_bin_bounds_ = dataset->forced_bin_bounds_; } -// LGBM_CUDA Resize() returns boolean -bool Dataset::ReSize(data_size_t num_data) { +void Dataset::ReSize(data_size_t num_data) { if (num_data_ != num_data) { num_data_ = num_data; OMP_INIT_EX(); From 361720b30cf6a31466632a73e43f69be5329a915 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Tue, 7 Jul 2020 17:58:47 +0000 Subject: [PATCH 086/119] Removal of unused variables. --- src/boosting/gbdt.cpp | 3 --- src/boosting/gbdt.h | 4 ---- 2 files changed, 7 deletions(-) diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 8cec5796127..9473d2c0ebd 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -803,9 +803,6 @@ void GBDT::ResetBaggingConfig(const Config* config, bool is_change_dataset) { if (tmp_subset_ == nullptr) { tmp_subset_.reset(new Dataset(bag_data_cnt_)); tmp_subset_->CopyFeatureMapperFrom(train_data_); - size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; - tmp_gradients_.resize(total_size); - tmp_hessians_.resize(total_size); is_use_subset_ = false; bag_data_indices_.clear(); } diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index be0e677310b..583a680fe51 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -471,17 +471,13 @@ class GBDT : public GBDTBase { #ifdef USE_CUDA /*! \brief First order derivative of training data */ std::vector> gradients_; // LGBM_CUDA - std::vector> tmp_gradients_; // LGBM_CUDA /*! \brief Second order derivative of training data */ std::vector> hessians_; // LGBM_CUDA - std::vector> tmp_hessians_; // LGBM_CUDA #else /*! \brief First order derivative of training data */ std::vector> gradients_; - std::vector> tmp_gradients_; /*! \brief Second order derivative of training data */ std::vector> hessians_; - std::vector> tmp_hessians_; #endif /*! \brief Store the indices of in-bag data */ From a8b42459978d29d6edb639a69eb0865dae428c43 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 13 Jul 2020 17:36:33 +0000 Subject: [PATCH 087/119] Code cleanup from reviewers suggestions. 
--- CMakeLists.txt | 2 +- include/LightGBM/application.h | 1 - include/LightGBM/bin.h | 1 - include/LightGBM/config.h | 4 +--- include/LightGBM/cuda/cuda_utils.h | 2 -- include/LightGBM/cuda/vector_cudahost.h | 4 ++-- include/LightGBM/dataset.h | 2 -- include/LightGBM/feature_group.h | 2 -- python-package/setup.py | 15 ++------------- src/application/application.cpp | 6 +----- src/boosting/gbdt.cpp | 4 +--- src/boosting/gbdt.h | 9 +++------ src/c_api.cpp | 6 +----- src/io/config.cpp | 4 +--- src/io/config_auto.cpp | 7 ++----- src/io/dataset.cpp | 8 ++------ src/io/dense_bin.hpp | 11 +++-------- src/io/sparse_bin.hpp | 1 - src/main.cpp | 7 ------- src/treelearner/data_parallel_tree_learner.cpp | 2 +- src/treelearner/feature_parallel_tree_learner.cpp | 2 +- src/treelearner/tree_learner.cpp | 4 ++-- src/treelearner/voting_parallel_tree_learner.cpp | 2 +- 23 files changed, 25 insertions(+), 81 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3e237da69c9..aae29463f94 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,7 +22,7 @@ OPTION(USE_SWIG "Enable SWIG to generate Java API" OFF) OPTION(USE_HDFS "Enable HDFS support (EXPERIMENTAL)" OFF) OPTION(USE_R35 "Set to ON if your R version is not earlier than 3.5" OFF) OPTION(USE_TIMETAG "Set to ON to output time costs" OFF) -OPTION(USE_CUDA "Enable CUDA-accelerated training" OFF) +OPTION(USE_CUDA "Enable CUDA-accelerated training (EXPERIMENTAL)" OFF) OPTION(USE_DEBUG "Set to ON for Debug mode" OFF) OPTION(BUILD_FOR_R "Set to ON if building lib_lightgbm for use with the R package" OFF) diff --git a/include/LightGBM/application.h b/include/LightGBM/application.h index 7ce8956a555..3fda4a1c32e 100644 --- a/include/LightGBM/application.h +++ b/include/LightGBM/application.h @@ -36,7 +36,6 @@ class Application { /*! \brief To call this function to run application*/ inline void Run(); - // LGBM_CUDA /*! \brief call to get configuration */ Config GetConfig() {return config_ ;} diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index e541e7039e9..fcf8f1add47 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -288,7 +288,6 @@ class Bin { /*! \brief Number of all data */ virtual data_size_t num_data() const = 0; - // LGBM_CUDA /*! 
\brief Get data pointer */ virtual void* get_data() = 0; diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 162c7583dc7..36219675da5 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -954,11 +954,9 @@ struct Config { // desc = set this to ``true`` to use double precision math on GPU (by default single precision is used) bool gpu_use_dp = false; -#ifdef USE_CUDA - // desc = number of gpus (CUDA implementation only) LGBM_CUDA + // desc = number of gpus (CUDA implementation only) // desc = default value is 1 int num_gpu = 1; -#endif #pragma endregion diff --git a/include/LightGBM/cuda/cuda_utils.h b/include/LightGBM/cuda/cuda_utils.h index 2fb45384f0c..af787315559 100644 --- a/include/LightGBM/cuda/cuda_utils.h +++ b/include/LightGBM/cuda/cuda_utils.h @@ -5,8 +5,6 @@ #ifndef LGBM_CUDA_UTILS_H #define LGBM_CUDA_UTILS_H -// LGBM_CUDA - #ifdef USE_CUDA #include diff --git a/include/LightGBM/cuda/vector_cudahost.h b/include/LightGBM/cuda/vector_cudahost.h index 41a27c349bd..d73fabe25fa 100644 --- a/include/LightGBM/cuda/vector_cudahost.h +++ b/include/LightGBM/cuda/vector_cudahost.h @@ -5,12 +5,12 @@ #ifndef LGBM_CUDA_VECTOR_CH_H #define LGBM_CUDA_VECTOR_CH_H +#ifdef USE_CUDA #include #include +#endif #include -// LGBM_CUDA - namespace LightGBM { #define lgbm_device_cpu 0 diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index d86c0e3d7cb..0f683b9817c 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -589,12 +589,10 @@ class Dataset { return feature_groups_[i]->is_multi_val_; } - // LGBM_CUDA inline size_t FeatureGroupSizesInByte(int group) const { return feature_groups_[group]->FeatureGroupSizesInByte(); } - // LGBM_CUDA inline void* FeatureGroupData(int group) const { return feature_groups_[group]->FeatureGroupData(); } diff --git a/include/LightGBM/feature_group.h b/include/LightGBM/feature_group.h index d949beec20e..2e0db94f19c 100644 --- a/include/LightGBM/feature_group.h +++ b/include/LightGBM/feature_group.h @@ -228,12 +228,10 @@ class FeatureGroup { return bin_data_->GetIterator(min_bin, max_bin, most_freq_bin); } - // LGBM_CUDA inline size_t FeatureGroupSizesInByte() { return bin_data_->SizesInByte(); } - // LGBM_CUDA inline void* FeatureGroupData() { return bin_data_->get_data(); } diff --git a/python-package/setup.py b/python-package/setup.py index 1e0500f180c..36abaec4a96 100644 --- a/python-package/setup.py +++ b/python-package/setup.py @@ -90,7 +90,6 @@ def compile_cpp(use_mingw=False, use_gpu=False, use_cuda=False, use_mpi=False, use_hdfs=False, boost_root=None, boost_dir=None, boost_include_dir=None, boost_librarydir=None, opencl_include_dir=None, opencl_library=None, - openmp_include_dir=None, openmp_library=None, nomp=False, bit32=False): if os.path.exists(os.path.join(CURRENT_DIR, "build_cpp")): @@ -117,10 +116,6 @@ def compile_cpp(use_mingw=False, use_gpu=False, use_cuda=False, use_mpi=False, cmake_cmd.append("-DOpenCL_LIBRARY={0}".format(opencl_library)) elif use_cuda: cmake_cmd.append("-DUSE_CUDA=ON") - if openmp_include_dir: - cmake_cmd.append("-DOpenMP_INCLUDE_DIR={0}".format(openmp_include_dir)) - if openmp_library: - cmake_cmd.append("-DOpenMP_LIBRARY={0}".format(openmp_library)) if use_mpi: cmake_cmd.append("-DUSE_MPI=ON") if nomp: @@ -205,9 +200,7 @@ class CustomInstall(install): ('boost-include-dir=', None, 'Directory containing Boost headers'), ('boost-librarydir=', None, 'Preferred Boost library directory'), ('opencl-include-dir=', None, 'OpenCL include directory'), - 
('opencl-library=', None, 'Path to OpenCL library'), - ('openmp-include-dir=', None, 'OpenMP include directory'), - ('openmp-library=', None, 'Path to OpenMP library') + ('opencl-library=', None, 'Path to OpenCL library') ] def initialize_options(self): @@ -221,12 +214,9 @@ def initialize_options(self): self.boost_librarydir = None self.opencl_include_dir = None self.opencl_library = None - self.openmp_include_dir = None - self.openmp_library = None self.mpi = 0 self.hdfs = 0 - # self.precompile = 0 # TODO: revert this - self.precompile = 1 + self.precompile = 0 self.nomp = 0 self.bit32 = 0 @@ -245,7 +235,6 @@ def run(self): use_hdfs=self.hdfs, boost_root=self.boost_root, boost_dir=self.boost_dir, boost_include_dir=self.boost_include_dir, boost_librarydir=self.boost_librarydir, opencl_include_dir=self.opencl_include_dir, opencl_library=self.opencl_library, - openmp_include_dir=self.openmp_include_dir, openmp_library=self.openmp_library, nomp=self.nomp, bit32=self.bit32) install.run(self) if os.path.isfile(LOG_PATH): diff --git a/src/application/application.cpp b/src/application/application.cpp index 5c61b323654..e88f5c86188 100644 --- a/src/application/application.cpp +++ b/src/application/application.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -25,10 +26,6 @@ #include "predictor.hpp" -#ifdef USE_CUDA -#include -#endif - namespace LightGBM { Common::Timer global_timer; @@ -43,7 +40,6 @@ Application::Application(int argc, char** argv) { Log::Fatal("No training/prediction data, application quit"); } -// LGBM_CUDA #ifdef USE_CUDA if (config_.device_type == std::string("cuda")) { LightGBM::LGBM_config_::current_device = lgbm_device_cuda; diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 9473d2c0ebd..467ab922e35 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -63,10 +63,8 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective es_first_metric_only_ = config_->first_metric_only; shrinkage_rate_ = config_->learning_rate; -// LGBM_CUDA #ifdef USE_CUDA if (config_->device_type == std::string("cuda")) { - // LGBM_config_::current_device = lgbm_device_cuda; moved to application.cpp LGBM_config_::current_learner = use_cuda_learner; } #endif @@ -799,7 +797,7 @@ void GBDT::ResetBaggingConfig(const Config* config, bool is_change_dataset) { } } else { bag_data_cnt_ = num_data_; - if (config_->device_type == std::string("cuda")) { // LGBM_CUDA + if (config_->device_type == std::string("cuda")) { if (tmp_subset_ == nullptr) { tmp_subset_.reset(new Dataset(bag_data_cnt_)); tmp_subset_->CopyFeatureMapperFrom(train_data_); diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index 583a680fe51..9567786769a 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -24,10 +25,6 @@ #include "score_updater.hpp" -#ifdef USE_CUDA -#include // LGBM_CUDA -#endif - namespace LightGBM { using json11::Json; @@ -470,9 +467,9 @@ class GBDT : public GBDTBase { #ifdef USE_CUDA /*! \brief First order derivative of training data */ - std::vector> gradients_; // LGBM_CUDA + std::vector> gradients_; /*! \brief Second order derivative of training data */ - std::vector> hessians_; // LGBM_CUDA + std::vector> hessians_; #else /*! 
\brief First order derivative of training data */ std::vector> gradients_; diff --git a/src/c_api.cpp b/src/c_api.cpp index 6cdebc34aed..050e605b268 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -28,10 +29,6 @@ #include "application/predictor.hpp" -#ifdef USE_CUDA -#include -#endif - namespace LightGBM { inline int LGBM_APIHandleException(const std::exception& ex) { @@ -43,7 +40,6 @@ inline int LGBM_APIHandleException(const std::string& ex) { return -1; } -// LGBM_CUDA inline void AdditionalConfig(Config *config) { #ifdef USE_CUDA if (config->device_type == std::string("cuda")) { diff --git a/src/io/config.cpp b/src/io/config.cpp index 7ee8a74487f..b354bf10f03 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -126,7 +126,7 @@ void GetDeviceType(const std::unordered_map& params, s *device_type = "cpu"; } else if (value == std::string("gpu")) { *device_type = "gpu"; - } else if (value == std::string("cuda")) { // LGBM_CUDA + } else if (value == std::string("cuda")) { *device_type = "cuda"; } else { Log::Fatal("Unknown device type %s", value.c_str()); @@ -327,13 +327,11 @@ void Config::CheckParamConflict() { force_row_wise = false; } -#ifdef USE_CUDA // force col-wise for CUDA if (device_type == std::string("cuda")) { force_col_wise = true; force_row_wise = false; } -#endif // min_data_in_leaf must be at least 2 if path smoothing is active. This is because when the split is calculated // the count is calculated using the proportion of hessian in the leaf which is rounded up to nearest int, so it can diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index 6cd26c58c16..d0bb97e1942 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -294,9 +294,7 @@ const std::unordered_set& Config::parameter_set() { "gpu_platform_id", "gpu_device_id", "gpu_use_dp", -#ifdef USE_CUDA - "num_gpu", /* LGBM_CUDA */ -#endif + "num_gpu", }); return params; } @@ -610,11 +608,10 @@ void Config::GetMembersFromString(const std::unordered_map #include #include +#include #include #include @@ -16,10 +17,6 @@ #include #include -#ifdef USE_CUDA -#include -#endif - namespace LightGBM { const char* Dataset::binary_file_token = @@ -339,7 +336,6 @@ void Dataset::Construct(std::vector>* bin_mappers, } auto features_in_group = NoGroup(used_features); -// LGBM_CUDA #ifdef USE_CUDA if (io_config.device_type == std::string("cuda")) { LightGBM::LGBM_config_::current_device = lgbm_device_cuda; @@ -348,7 +344,7 @@ void Dataset::Construct(std::vector>* bin_mappers, std::vector group_is_multi_val(used_features.size(), 0); if (io_config.enable_bundle && !used_features.empty()) { - bool lgbm_is_gpu_used = io_config.device_type == std::string("gpu") || io_config.device_type == std::string("cuda"); // LGBM_CUDA + bool lgbm_is_gpu_used = io_config.device_type == std::string("gpu") || io_config.device_type == std::string("cuda"); features_in_group = FastFeatureBundling( *bin_mappers, sample_non_zero_indices, sample_values, num_per_col, num_sample_col, static_cast(total_sample_cnt), diff --git a/src/io/dense_bin.hpp b/src/io/dense_bin.hpp index f0405bc318e..48f0c4dc587 100644 --- a/src/io/dense_bin.hpp +++ b/src/io/dense_bin.hpp @@ -7,17 +7,13 @@ #define LIGHTGBM_IO_DENSE_BIN_HPP_ #include +#include +#include #include #include #include -#ifdef USE_CUDA -#include // LGBM_CUDA -#endif - -#include // LGBM_CUDA - namespace LightGBM { template @@ -368,7 +364,6 @@ class DenseBin : public Bin { data_size_t num_data() const override { return 
num_data_; } - // LGBM_CUDA void* get_data() override { return data_.data(); } void FinishLoad() override { @@ -466,7 +461,7 @@ class DenseBin : public Bin { private: data_size_t num_data_; #ifdef USE_CUDA - std::vector> data_; // LGBM_CUDA + std::vector> data_; #else std::vector> data_; #endif diff --git a/src/io/sparse_bin.hpp b/src/io/sparse_bin.hpp index c56cd6da99d..7476f9a0c24 100644 --- a/src/io/sparse_bin.hpp +++ b/src/io/sparse_bin.hpp @@ -408,7 +408,6 @@ class SparseBin : public Bin { data_size_t num_data() const override { return num_data_; } - // LGBM_CUDA void* get_data() override { return nullptr; } void FinishLoad() override { diff --git a/src/main.cpp b/src/main.cpp index ef277ac0c1f..8034da82681 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -11,10 +11,6 @@ int main(int argc, char** argv) { bool success = false; try { - // LGBM_CUDA - std::chrono::duration main_time; - auto start_main_time = std::chrono::steady_clock::now(); - LightGBM::Application app(argc, argv); app.Run(); @@ -22,9 +18,6 @@ int main(int argc, char** argv) { LightGBM::Linkers::MpiFinalizeIfIsParallel(); #endif - // LGBM_CUDA - main_time = std::chrono::steady_clock::now() - start_main_time; - LightGBM::Log::Info("main::main time: %f sec", main_time * 1e-3); success = true; } catch (const std::exception& ex) { diff --git a/src/treelearner/data_parallel_tree_learner.cpp b/src/treelearner/data_parallel_tree_learner.cpp index 70e6d98354f..30d8df84acf 100644 --- a/src/treelearner/data_parallel_tree_learner.cpp +++ b/src/treelearner/data_parallel_tree_learner.cpp @@ -256,7 +256,7 @@ void DataParallelTreeLearner::Split(Tree* tree, int best_Leaf, in } // instantiate template classes, otherwise linker cannot find the code -template class DataParallelTreeLearner; // LGBM_CUDA +template class DataParallelTreeLearner; template class DataParallelTreeLearner; template class DataParallelTreeLearner; diff --git a/src/treelearner/feature_parallel_tree_learner.cpp b/src/treelearner/feature_parallel_tree_learner.cpp index 69809e6069c..f4edfe03dc1 100644 --- a/src/treelearner/feature_parallel_tree_learner.cpp +++ b/src/treelearner/feature_parallel_tree_learner.cpp @@ -77,7 +77,7 @@ void FeatureParallelTreeLearner::FindBestSplitsFromHistograms( } // instantiate template classes, otherwise linker cannot find the code -template class FeatureParallelTreeLearner; // LGBM_CUDA +template class FeatureParallelTreeLearner; template class FeatureParallelTreeLearner; template class FeatureParallelTreeLearner; } // namespace LightGBM diff --git a/src/treelearner/tree_learner.cpp b/src/treelearner/tree_learner.cpp index d47b469f950..63ca1b2de83 100644 --- a/src/treelearner/tree_learner.cpp +++ b/src/treelearner/tree_learner.cpp @@ -5,7 +5,7 @@ #include #include "gpu_tree_learner.h" -#include "cuda_tree_learner.h" // LGBM_CUDA +#include "cuda_tree_learner.h" #include "parallel_tree_learner.h" #include "serial_tree_learner.h" @@ -32,7 +32,7 @@ TreeLearner* TreeLearner::CreateTreeLearner(const std::string& learner_type, con } else if (learner_type == std::string("voting")) { return new VotingParallelTreeLearner(config); } - } else if (device_type == std::string("cuda")) { // LGBM_CUDA + } else if (device_type == std::string("cuda")) { if (learner_type == std::string("serial")) { return new CUDATreeLearner(config); } else if (learner_type == std::string("feature")) { diff --git a/src/treelearner/voting_parallel_tree_learner.cpp b/src/treelearner/voting_parallel_tree_learner.cpp index 265e94757aa..51ee2096380 100644 --- 
a/src/treelearner/voting_parallel_tree_learner.cpp +++ b/src/treelearner/voting_parallel_tree_learner.cpp @@ -454,7 +454,7 @@ void VotingParallelTreeLearner::Split(Tree* tree, int best_Leaf, } // instantiate template classes, otherwise linker cannot find the code -template class VotingParallelTreeLearner; // LGBM_CUDA +template class VotingParallelTreeLearner; template class VotingParallelTreeLearner; template class VotingParallelTreeLearner; } // namespace LightGBM From ac5f7b8b8fde7f113d7d021d6e15e22e7fc0dca4 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 13 Jul 2020 19:38:22 +0000 Subject: [PATCH 088/119] Removal of FIXME comments and unused defines. --- src/treelearner/cuda_tree_learner.cpp | 4 +- src/treelearner/cuda_tree_learner.h | 1 - .../kernels/histogram_16_64_256.cu | 42 ------------------- .../kernels/histogram_16_64_256.hu | 20 --------- 4 files changed, 2 insertions(+), 65 deletions(-) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 7f32c23c398..0538b849e34 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -357,7 +357,7 @@ void CUDATreeLearner::AllocateGPUMemory() { if (num_gpu_feature_groups) { CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); - // allocate memory for all features (FIXME: 4 GB barrier on some devices, need to split to multiple buffers) + // allocate memory for all features if ( device_features_[device_id] != NULL ) { CUDASUCCESS_OR_FATAL(cudaFree(device_features_[device_id])); } @@ -814,7 +814,7 @@ bool CUDATreeLearner::ConstructGPUHistogramsAsync( // if not all feature groups are used, we need to transfer the feature mask to GPU // otherwise, we will use a specialized GPU kernel with all feature groups enabled - // LGBM_CUDA FIXME: No waiting mark for feature mask + // LGBM_CUDA // LGBM_CUDA We now copy even if all features are used. diff --git a/src/treelearner/cuda_tree_learner.h b/src/treelearner/cuda_tree_learner.h index 46e31985a5d..034a31f6d3a 100644 --- a/src/treelearner/cuda_tree_learner.h +++ b/src/treelearner/cuda_tree_learner.h @@ -243,7 +243,6 @@ class CUDATreeLearner: public SerialTreeLearner { // char *device_subhistograms_; std::vector device_subhistograms_; /*! \brief Host memory object for histogram output (GPU will write to Host memory directly) */ - // FIXME: is this cuda mapped // void *device_histogram_outputs_; std::vector device_histogram_outputs_; /*! 
\brief Host memory pointer for histogram outputs */ diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index 23e9b150a1f..994a3c94170 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -105,7 +105,6 @@ __kernel void KERNEL_NAME(__global const uchar* restrict feature_data_base, const size_t power_feature_workgroups) { #else __global__ void KERNEL_NAME(const uchar* feature_data_base, - // FIXME: how to handle this __constant const uchar* __restrict__ feature_masks, const data_size_t feature_size, const data_size_t* data_indices, @@ -293,18 +292,15 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type *__restrict__ ptr_f = output; for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { // even threads read gradients, odd threads read hessians - // FIXME: 2-way bank conflict acc_type value = gh_hist[i]; ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; } // write counts acc_int_type *__restrict__ ptr_i = reinterpret_cast(output + 2 * NUM_BINS); for (ushort i = ltid; i < NUM_BINS; i += lsize) { - // FIXME: 2-way bank conflict uint value = cnt_hist[i]; ptr_i[i] = value; } - // FIXME: is this right __syncthreads(); __threadfence(); // To avoid the cost of an extra reducting kernel, we have to deal with some @@ -312,19 +308,9 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // make the final reduction, and other threads will just quit. // This requires that the results written by other workgroups available to the // last workgroup (memory consistency) - #if NVIDIA == 1 // this is equavalent to CUDA __threadfence(); // ensure the writes above goes to main memory and other workgroups can see it asm volatile("{\n\tmembar.gl;\n\t}\n\t" :::"memory"); - #else - // FIXME: how to do the above on AMD GPUs?? - // GCN ISA says that the all writes will bypass L1 cache (write through), - // however when the last thread is reading sub-histogram data we have to - // make sure that no part of data is modified in local L1 cache of other workgroups. - // Otherwise reading can be a problem (atomic operations to get consistency). - // But in our case, the sub-histogram of this workgroup cannot be in the cache - // of another workgroup, so the following trick will work just fine. - #endif // Now, we want one workgroup to do the final reduction. // Other workgroups processing the same feature quit. // The is done by using an global atomic counter. 
@@ -451,7 +437,6 @@ __kernel void KERNEL_NAME(__global const uchar* restrict feature_data_base, const size_t power_feature_workgroups) { #else __global__ void KERNEL_NAME(const uchar* feature_data_base, - // FIXME: how to handle this __constant const uchar* __restrict__ feature_masks, const data_size_t feature_size, const data_size_t* data_indices, @@ -637,18 +622,15 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type *__restrict__ ptr_f = output; for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { // even threads read gradients, odd threads read hessians - // FIXME: 2-way bank conflict acc_type value = gh_hist[i]; ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; } // write counts acc_int_type *__restrict__ ptr_i = reinterpret_cast(output + 2 * NUM_BINS); for (ushort i = ltid; i < NUM_BINS; i += lsize) { - // FIXME: 2-way bank conflict uint value = cnt_hist[i]; ptr_i[i] = value; } - // FIXME: is this right __syncthreads(); __threadfence(); // To avoid the cost of an extra reducting kernel, we have to deal with some @@ -656,19 +638,9 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // make the final reduction, and other threads will just quit. // This requires that the results written by other workgroups available to the // last workgroup (memory consistency) - #if NVIDIA == 1 // this is equavalent to CUDA __threadfence(); // ensure the writes above goes to main memory and other workgroups can see it asm volatile("{\n\tmembar.gl;\n\t}\n\t" :::"memory"); - #else - // FIXME: how to do the above on AMD GPUs?? - // GCN ISA says that the all writes will bypass L1 cache (write through), - // however when the last thread is reading sub-histogram data we have to - // make sure that no part of data is modified in local L1 cache of other workgroups. - // Otherwise reading can be a problem (atomic operations to get consistency). - // But in our case, the sub-histogram of this workgroup cannot be in the cache - // of another workgroup, so the following trick will work just fine. - #endif // Now, we want one workgroup to do the final reduction. // Other workgroups processing the same feature quit. // The is done by using an global atomic counter. @@ -796,7 +768,6 @@ __kernel void KERNEL_NAME(__global const uchar* restrict feature_data_base, const size_t power_feature_workgroups) { #else __global__ void KERNEL_NAME(const uchar* feature_data_base, - // FIXME: how to handle this __constant const uchar* __restrict__ feature_masks, const data_size_t feature_size, const data_size_t* data_indices, @@ -982,18 +953,15 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type *__restrict__ ptr_f = output; for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { // even threads read gradients, odd threads read hessians - // FIXME: 2-way bank conflict acc_type value = gh_hist[i]; ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; } // write counts acc_int_type *__restrict__ ptr_i = reinterpret_cast(output + 2 * NUM_BINS); for (ushort i = ltid; i < NUM_BINS; i += lsize) { - // FIXME: 2-way bank conflict uint value = cnt_hist[i]; ptr_i[i] = value; } - // FIXME: is this right __syncthreads(); __threadfence(); // To avoid the cost of an extra reducting kernel, we have to deal with some @@ -1001,19 +969,9 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // make the final reduction, and other threads will just quit. 
// This requires that the results written by other workgroups available to the // last workgroup (memory consistency) - #if NVIDIA == 1 // this is equavalent to CUDA __threadfence(); // ensure the writes above goes to main memory and other workgroups can see it asm volatile("{\n\tmembar.gl;\n\t}\n\t" :::"memory"); - #else - // FIXME: how to do the above on AMD GPUs?? - // GCN ISA says that the all writes will bypass L1 cache (write through), - // however when the last thread is reading sub-histogram data we have to - // make sure that no part of data is modified in local L1 cache of other workgroups. - // Otherwise reading can be a problem (atomic operations to get consistency). - // But in our case, the sub-histogram of this workgroup cannot be in the cache - // of another workgroup, so the following trick will work just fine. - #endif // Now, we want one workgroup to do the final reduction. // Other workgroups processing the same feature quit. // The is done by using an global atomic counter. diff --git a/src/treelearner/kernels/histogram_16_64_256.hu b/src/treelearner/kernels/histogram_16_64_256.hu index 4dfcb9f7895..86400ae84e6 100644 --- a/src/treelearner/kernels/histogram_16_64_256.hu +++ b/src/treelearner/kernels/histogram_16_64_256.hu @@ -77,34 +77,14 @@ typedef uint acc_int_type; #define as_acc_int_type as_uint #endif -// unroll the atomic operation for a few times. Takes more code space, -// but compiler can generate better code for faster atomics. -#define UNROLL_ATOMIC 1 - -// Options passed by compiler at run time: -// IGNORE_INDICES will be set when the kernel does not -//#define IGNORE_INDICES -//#define POWER_FEATURE_WORKGROUPS 10 - -// detect Nvidia platforms -#ifdef cl_nv_pragma_unroll -#define NVIDIA 1 -#endif - // use all features and do not use feature mask #ifndef ENABLE_ALL_FEATURES #define ENABLE_ALL_FEATURES 1 #endif -// use binary patching for AMD GCN 1.2 or newer -#ifndef AMD_USE_DS_ADD_F32 -#define AMD_USE_DS_ADD_F32 0 -#endif - typedef uint data_size_t; typedef float score_t; - // define all of the different kernels #define DECLARE_CONST_BUF(name) \ From 63d75e978963d308b59eac0f2e7c02bbb256e891 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 13 Jul 2020 20:36:19 +0000 Subject: [PATCH 089/119] More reviewers comments cleanup. --- include/LightGBM/cuda/cuda_utils.h | 4 +- include/LightGBM/cuda/vector_cudahost.h | 4 +- src/treelearner/cuda_tree_learner.cpp | 115 ++++-------------- src/treelearner/cuda_tree_learner.h | 10 +- .../kernels/histogram_16_64_256.hu | 4 +- src/treelearner/serial_tree_learner.h | 8 +- 6 files changed, 35 insertions(+), 110 deletions(-) diff --git a/include/LightGBM/cuda/cuda_utils.h b/include/LightGBM/cuda/cuda_utils.h index af787315559..3c0264cb396 100644 --- a/include/LightGBM/cuda/cuda_utils.h +++ b/include/LightGBM/cuda/cuda_utils.h @@ -2,8 +2,8 @@ * Copyright (c) 2020 IBM Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ -#ifndef LGBM_CUDA_UTILS_H -#define LGBM_CUDA_UTILS_H +#ifndef LIGHTGBM_CUDA_UTILS_H_ +#define LIGHTGBM_CUDA_UTILS_H_ #ifdef USE_CUDA diff --git a/include/LightGBM/cuda/vector_cudahost.h b/include/LightGBM/cuda/vector_cudahost.h index d73fabe25fa..a5d97370261 100644 --- a/include/LightGBM/cuda/vector_cudahost.h +++ b/include/LightGBM/cuda/vector_cudahost.h @@ -2,8 +2,8 @@ * Copyright (c) 2020 IBM Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. 
*/ -#ifndef LGBM_CUDA_VECTOR_CH_H -#define LGBM_CUDA_VECTOR_CH_H +#ifndef LIGHTGBM_CUDA_VECTOR_CUDAHOST_H_ +#define LIGHTGBM_CUDA_VECTOR_CUDAHOST_H_ #ifdef USE_CUDA #include diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 0538b849e34..4183019ed85 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -48,11 +48,6 @@ static void *launch_cuda_histogram(void *thread_data) { return NULL; } -/* -static void *wait_event(void *wait_obj) { - CUDASUCCESS_OR_FATAL(cudaEventSynchronize(*(cudaEvent_t *)wait_obj)); -}*/ - namespace LightGBM { CUDATreeLearner::CUDATreeLearner(const Config* config) @@ -60,9 +55,9 @@ CUDATreeLearner::CUDATreeLearner(const Config* config) use_bagging_ = false; nthreads_ = 0; if (config->gpu_use_dp && USE_DP_FLOAT) { - Log::Info("LightGBM-CUDA using CUDA trainer with DP float!!"); + Log::Info("LightGBM using CUDA trainer with DP float!!"); } else { - Log::Info("LightGBM-CUDA using CUDA trainer with SP float!!"); + Log::Info("LightGBM using CUDA trainer with SP float!!"); } } @@ -78,7 +73,7 @@ void CUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) num_feature_groups_ = train_data_->num_feature_groups(); // Initialize GPU buffers and kernels & LGBM_CUDA: get device info - InitGPU(config_->num_gpu); // LGBM_CUDA + InitGPU(config_->num_gpu); } // some functions used for debugging the GPU histogram construction @@ -192,7 +187,6 @@ void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featu // set work group size based on feature size // each 2^exp_workgroups_per_feature workgroups work on a feature4 tuple - int exp_workgroups_per_feature = GetNumWorkgroupsPerFeature(leaf_num_data); std::vector num_gpu_workgroups; ThreadData *thread_data = reinterpret_cast(malloc(sizeof(ThreadData) * num_gpu_)); @@ -245,11 +239,8 @@ void CUDATreeLearner::WaitAndGetHistograms(FeatureHistogram* leaf_histogram_arra #pragma omp parallel for schedule(static, num_gpu_) for (int device_id = 0; device_id < num_gpu_; ++device_id) { - // auto start_time = std::chrono::steady_clock::now(); - // when the output is ready, the computation is done CUDASUCCESS_OR_FATAL(cudaEventSynchronize(histograms_wait_obj_[device_id])); - // LGBM_CUDA } HistType* histograms = reinterpret_cast(leaf_histogram_array[0].RawData() - kHistOffset); @@ -269,7 +260,6 @@ void CUDATreeLearner::WaitAndGetHistograms(FeatureHistogram* leaf_histogram_arra } } -// LGBM_CUDA void CUDATreeLearner::CountDenseFeatureGroups() { num_dense_feature_groups_ = 0; @@ -283,7 +273,6 @@ void CUDATreeLearner::CountDenseFeatureGroups() { } } -// LGBM_CUDA void CUDATreeLearner::prevAllocateGPUMemory() { // how many feature-group tuples we have // leave some safe margin for prefetching @@ -292,7 +281,6 @@ void CUDATreeLearner::prevAllocateGPUMemory() { allocated_num_data_ = std::max(num_data_ + 256 * (1 << kMaxLogWorkgroupsPerFeature), allocated_num_data_); // clear sparse/dense maps - dense_feature_group_map_.clear(); sparse_feature_group_map_.clear(); @@ -315,18 +303,6 @@ void CUDATreeLearner::prevAllocateGPUMemory() { offset += num_gpu_feature_groups_.at(i); } -#if 0 - // allocate feature mask, for disabling some feature-groups' histogram calculation - if (feature_masks_.data() != NULL) { - cudaPointerAttributes attributes; - cudaPointerGetAttributes(&attributes, feature_masks_.data()); - - if ((attributes.type == cudaMemoryTypeHost) && (attributes.devicePointer != NULL)) { - 
CUDASUCCESS_OR_FATAL(cudaHostUnregister(feature_masks_.data())); - } - } -#endif - feature_masks_.resize(num_dense_feature_groups_); Log::Debug("Resized feature masks"); @@ -337,12 +313,8 @@ void CUDATreeLearner::prevAllocateGPUMemory() { // histogram bin entry size depends on the precision (single/double) hist_bin_entry_sz_ = 2 * (config_->gpu_use_dp ? sizeof(hist_t) : sizeof(gpu_hist_t)); // two elements in this "size" - // host_size histogram outputs - // host_histogram_outputs_ = malloc(num_dense_feature_groups_ * device_bin_size_ * hist_bin_entry_sz_); - CUDASUCCESS_OR_FATAL(cudaHostAlloc(reinterpret_cast(&host_histogram_outputs_), (size_t)(num_dense_feature_groups_ * device_bin_size_ * hist_bin_entry_sz_), cudaHostAllocPortable)); - // LGBM_CUDA nthreads_ = std::min(omp_get_max_threads(), num_dense_feature_groups_ / dword_features_); nthreads_ = std::max(nthreads_, 1); } @@ -367,7 +339,6 @@ void CUDATreeLearner::AllocateGPUMemory() { // allocate space for gradients and hessians on device // we will copy gradients and hessians in after ordered_gradients_ and ordered_hessians_ are constructed - if (device_gradients_[device_id] != NULL) { CUDASUCCESS_OR_FATAL(cudaFree(device_gradients_[device_id])); } @@ -386,10 +357,9 @@ void CUDATreeLearner::AllocateGPUMemory() { CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_feature_masks_[device_id]), (size_t) num_gpu_feature_groups)); // copy indices to the device - - if (device_data_indices_[device_id] != NULL) { - CUDASUCCESS_OR_FATAL(cudaFree(device_data_indices_[device_id])); - } + if (device_data_indices_[device_id] != NULL) { + CUDASUCCESS_OR_FATAL(cudaFree(device_data_indices_[device_id])); + } CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_data_indices_[device_id]), (size_t) allocated_num_data_ * sizeof(data_size_t))); CUDASUCCESS_OR_FATAL(cudaMemsetAsync(device_data_indices_[device_id], 0, allocated_num_data_ * sizeof(data_size_t), stream_[device_id])); @@ -398,7 +368,6 @@ void CUDATreeLearner::AllocateGPUMemory() { // create output buffer, each feature has a histogram with device_bin_size_ bins, // each work group generates a sub-histogram of dword_features_ features. 
- if (!device_subhistograms_[device_id]) { // only initialize once here, as this will not need to change when ResetTrainingData() is called CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_subhistograms_[device_id]), (size_t) preallocd_max_num_wg_[device_id] * dword_features_ * device_bin_size_ * (3 * hist_bin_entry_sz_ / 2))); @@ -424,14 +393,12 @@ void CUDATreeLearner::ResetGPUMemory() { sparse_feature_group_map_.clear(); } -// LGBM_CUDA void CUDATreeLearner::copyDenseFeature() { if (num_feature_groups_ == 0) { LGBM_config_::current_learner = use_cpu_learner; return; } -// auto start_time = std::chrono::steady_clock::now(); Log::Debug("Started copying dense features from CPU to GPU"); // find the dense feature-groups and group then into Feature4 data structure (several feature-groups packed into 4 bytes) size_t copied_feature = 0; @@ -465,9 +432,6 @@ void CUDATreeLearner::copyDenseFeature() { sparse_feature_group_map_.push_back(i); } } - - // data transfer time // LGBM_CUDA: async copy, so it is not the real data transfer time - // std::chrono::duration end_time = std::chrono::steady_clock::now() - start_time; } @@ -495,18 +459,18 @@ void CUDATreeLearner::InitGPU(int num_gpu) { #endif if (max_num_bin_ <= 16) { - device_bin_size_ = 16; // LGBM_CUDA + device_bin_size_ = 16; histogram_size_ = 16; - dword_features_ = 1; // LGBM_CUDA + dword_features_ = 1; } else if (max_num_bin_ <= 64) { - device_bin_size_ = 64; // LGBM_CUDA + device_bin_size_ = 64; histogram_size_ = 64; - dword_features_ = 1; // LGBM_CUDA + dword_features_ = 1; } else if (max_num_bin_ <= 256) { Log::Debug("device_bin_size_ = 256"); device_bin_size_ = 256; histogram_size_ = 256; - dword_features_ = 1; // LGBM_CUDA + dword_features_ = 1; } else { Log::Fatal("bin size %d cannot run on GPU", max_num_bin_); } @@ -555,11 +519,6 @@ void CUDATreeLearner::InitGPU(int num_gpu) { kernel_wait_obj_.resize(num_gpu_); histograms_wait_obj_.resize(num_gpu_); - // for debuging - kernel_time_.resize(num_gpu_, 0); - kernel_input_wait_time_.resize(num_gpu_, std::chrono::milliseconds(0)); - // kernel_output_wait_time_.resize(num_gpu_, std::chrono::milliseconds(0)); - for (int i = 0; i < num_gpu_; ++i) { CUDASUCCESS_OR_FATAL(cudaSetDevice(i)); CUDASUCCESS_OR_FATAL(cudaStreamCreate(&(stream_[i]))); @@ -577,7 +536,7 @@ void CUDATreeLearner::InitGPU(int num_gpu) { AllocateGPUMemory(); - copyDenseFeature(); // LGBM_CUDA + copyDenseFeature(); } Tree* CUDATreeLearner::Train(const score_t* gradients, const score_t *hessians) { @@ -591,20 +550,18 @@ void CUDATreeLearner::ResetTrainingDataInner(const Dataset* train_data, bool is_ SerialTreeLearner::ResetTrainingDataInner(train_data, is_constant_hessian, reset_multi_val_bin); - #if ResetTrainingData_DEBUG == 1 // LGBM_CUDA + #if ResetTrainingData_DEBUG == 1 serial_time = std::chrono::steady_clock::now() - start_serial_time; #endif num_feature_groups_ = train_data_->num_feature_groups(); // GPU memory has to been reallocated because data may have been changed - - #if ResetTrainingData_DEBUG == 1 // LGBM_CUDA + #if ResetTrainingData_DEBUG == 1 auto start_alloc_gpu_time = std::chrono::steady_clock::now(); #endif // LGBM_CUDA: AllocateGPUMemory only when the number of data increased - int old_num_feature_groups = num_dense_feature_groups_; CountDenseFeatureGroups(); if ((old_allocated_num_data < (num_data_ + 256 * (1 << kMaxLogWorkgroupsPerFeature))) || (old_num_feature_groups < num_dense_feature_groups_)) { @@ -616,17 +573,16 @@ void CUDATreeLearner::ResetTrainingDataInner(const Dataset* train_data, bool is_ 
copyDenseFeature(); - #if ResetTrainingData_DEBUG == 1 // LGBM_CUDA + #if ResetTrainingData_DEBUG == 1 alloc_gpu_time = std::chrono::steady_clock::now() - start_alloc_gpu_time; #endif // setup GPU kernel arguments after we allocating all the buffers - - #if ResetTrainingData_DEBUG == 1 // LGBM_CUDA + #if ResetTrainingData_DEBUG == 1 auto start_set_arg_time = std::chrono::steady_clock::now(); #endif - #if ResetTrainingData_DEBUG == 1 // LGBM_CUDA + #if ResetTrainingData_DEBUG == 1 set_arg_time = std::chrono::steady_clock::now() - start_set_arg_time; reset_training_data_time = std::chrono::steady_clock::now() - start_reset_training_data_time; Log::Info("reset_training_data_time: %f secs.", reset_training_data_time.count() * 1e-3); @@ -637,7 +593,7 @@ void CUDATreeLearner::ResetTrainingDataInner(const Dataset* train_data, bool is_ } void CUDATreeLearner::BeforeTrain() { - #if cudaMemcpy_DEBUG == 1 // LGBM_CUDA + #if cudaMemcpy_DEBUG == 1 std::chrono::duration device_hessians_time = std::chrono::milliseconds(0); std::chrono::duration device_gradients_time = std::chrono::milliseconds(0); #endif @@ -650,7 +606,6 @@ void CUDATreeLearner::BeforeTrain() { // Copy initial full hessians and gradients to GPU. // We start copying as early as possible, instead of at ConstructHistogram(). - if ((hessians_ != NULL) && (gradients_ != NULL)) { if (!use_bagging_ && num_dense_feature_groups_) { Log::Debug("CudaTreeLearner::BeforeTrain() No baggings, dense_feature_groups_=%d", num_dense_feature_groups_); @@ -659,32 +614,29 @@ void CUDATreeLearner::BeforeTrain() { if (!(share_state_->is_constant_hessian)) { Log::Debug("CUDATreeLearner::BeforeTrain(): Starting hessians_ -> device_hessians_"); - #if cudaMemcpy_DEBUG == 1 // LGBM_CUDA + #if cudaMemcpy_DEBUG == 1 auto start_device_hessians_time = std::chrono::steady_clock::now(); #endif - // const data_size_t* indices = data_partition_->indices(); - // data_size_t cnt = data_partition_->leaf_count(0); - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_hessians_[device_id], hessians_, num_data_*sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); CUDASUCCESS_OR_FATAL(cudaEventRecord(hessians_future_[device_id], stream_[device_id])); - #if cudaMemcpy_DEBUG == 1 // LGBM_CUDA + #if cudaMemcpy_DEBUG == 1 device_hessians_time = std::chrono::steady_clock::now() - start_device_hessians_time; #endif Log::Debug("queued copy of device_hessians_"); } - #if cudaMemcpy_DEBUG == 1 // LGBM_CUDA + #if cudaMemcpy_DEBUG == 1 auto start_device_gradients_time = std::chrono::steady_clock::now(); #endif CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_gradients_[device_id], gradients_, num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); CUDASUCCESS_OR_FATAL(cudaEventRecord(gradients_future_[device_id], stream_[device_id])); - #if cudaMemcpy_DEBUG == 1 // LGBM_CUDA + #if cudaMemcpy_DEBUG == 1 device_gradients_time = std::chrono::steady_clock::now() - start_device_gradients_time; #endif @@ -693,16 +645,11 @@ void CUDATreeLearner::BeforeTrain() { } } -#if 0 - SerialTreeLearner::BeforeTrain(); -#endif - // use bagging if ((hessians_ != NULL) && (gradients_ != NULL)) { if (data_partition_->leaf_count(0) != num_data_ && num_dense_feature_groups_) { // On GPU, we start copying indices, gradients and hessians now, instead at ConstructHistogram() // copy used gradients and hessians to ordered buffer - const data_size_t* indices = data_partition_->indices(); data_size_t cnt = data_partition_->leaf_count(0); @@ -747,10 +694,6 @@ bool CUDATreeLearner::BeforeFindBestSplit(const 
Tree* tree, int left_leaf, int r data_size_t begin = data_partition_->leaf_begin(smaller_leaf); data_size_t end = begin + data_partition_->leaf_count(smaller_leaf); - // copy indices to the GPU: - #if GPU_DEBUG >= 2 - #endif - for (int device_id = 0; device_id < num_gpu_; ++device_id) { CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_data_indices_[device_id], &indices[begin], (end-begin) * sizeof(data_size_t), cudaMemcpyHostToDevice, stream_[device_id])); CUDASUCCESS_OR_FATAL(cudaEventRecord(indices_future_[device_id], stream_[device_id])); @@ -789,8 +732,8 @@ bool CUDATreeLearner::ConstructGPUHistogramsAsync( #pragma omp parallel for schedule(static, 1024) if (num_features_ >= 2048) for (int i = 0; i < num_features_; ++i) { if (is_feature_used[i]) { - int feature_group = train_data_->Feature2Group(i); // LGBM_CUDA - is_feature_group_used[feature_group] = (train_data_->FeatureGroupNumBin(feature_group) <= 16) ? 2 : 1; // LGBM_CUDA + int feature_group = train_data_->Feature2Group(i); + is_feature_group_used[feature_group] = (train_data_->FeatureGroupNumBin(feature_group) <= 16) ? 2 : 1; } } @@ -814,10 +757,8 @@ bool CUDATreeLearner::ConstructGPUHistogramsAsync( // if not all feature groups are used, we need to transfer the feature mask to GPU // otherwise, we will use a specialized GPU kernel with all feature groups enabled - // LGBM_CUDA // LGBM_CUDA We now copy even if all features are used. - #pragma omp parallel for schedule(static, num_gpu_) for (int device_id = 0; device_id < num_gpu_; ++device_id) { int offset = offset_gpu_feature_groups_[device_id]; @@ -825,16 +766,12 @@ bool CUDATreeLearner::ConstructGPUHistogramsAsync( } // All data have been prepared, now run the GPU kernel - GPUHistogram(num_data, use_all_features); return true; } void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_used, bool use_subtract) { - // LGBM_CUDA - // auto start_time = std::chrono::steady_clock::now(); - std::vector is_sparse_feature_used(num_features_, 0); std::vector is_dense_feature_used(num_features_, 0); int num_dense_features = 0, num_sparse_features = 0; @@ -958,12 +895,10 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ } std::copy(gpu_histogram, gpu_histogram + size * 2, current_histogram); delete [] gpu_histogram; - // break; // LGBM_CUDA: see only first feature info } printf("End Comparing Histogram between GPU and CPU\n"); fflush(stderr); fflush(stdout); -// #endif #endif if (larger_leaf_histogram_array_ != nullptr && !use_subtract) { @@ -976,7 +911,6 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ // then construct sparse features on CPU // We set data_indices to null to avoid rebuilding ordered gradients/hessians - if (num_sparse_features > 0) { train_data_->ConstructHistograms(is_sparse_feature_used, larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(), @@ -987,7 +921,6 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ } // wait for GPU to finish, only if GPU is actually used - if (is_gpu_used) { if (config_->gpu_use_dp) { // use double precision diff --git a/src/treelearner/cuda_tree_learner.h b/src/treelearner/cuda_tree_learner.h index 034a31f6d3a..7385d2af5dc 100644 --- a/src/treelearner/cuda_tree_learner.h +++ b/src/treelearner/cuda_tree_learner.h @@ -3,8 +3,8 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. 
*/ #pragma once -#ifndef LGBM_TREELEARNER_CUDA_TREE_LEARNER_H_ -#define LGBM_TREELEARNET_CUDA_TREE_LEARNER_H_ +#ifndef LIGHTGBM_TREELEARNER_CUDA_TREE_LEARNER_H_ +#define LIGHTGBM_TREELEARNET_CUDA_TREE_LEARNER_H_ #include #include @@ -64,10 +64,6 @@ class CUDATreeLearner: public SerialTreeLearner { void ConstructHistograms(const std::vector& is_feature_used, bool use_subtract) override; private: - /*! \brief 4-byte feature tuple used by GPU kernels */ - // struct Feature4 { - // uint8_t s[4]; - // }; typedef float gpu_hist_t; /*! @@ -297,4 +293,4 @@ class CUDATreeLearner: public SerialTreeLearner { } // namespace LightGBM #endif // USE_CUDA -#endif // LGBM_TREELEARNER_CUDA_TREE_LEARNER_H_ +#endif // LIGHTGBM_TREELEARNER_CUDA_TREE_LEARNER_H_ diff --git a/src/treelearner/kernels/histogram_16_64_256.hu b/src/treelearner/kernels/histogram_16_64_256.hu index 86400ae84e6..a1c2744c624 100644 --- a/src/treelearner/kernels/histogram_16_64_256.hu +++ b/src/treelearner/kernels/histogram_16_64_256.hu @@ -12,8 +12,8 @@ * disclosure restricted by GSA ADP Schedule Contract with IBM Corp. */ -#ifndef _HISTOGRAM_16_64_256_KERNEL_ -#define _HISTOGRAM_16_64_256_KERNEL_ +#ifndef LIGHTGBM_TREELEARNER_KERNELS_HISTOGRAM_16_64_256_HU_ +#define LIGHTGBM_TREELEARNER_KERNELS_HISTOGRAM_16_64_256_HU_ //#pragma once diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 79882ded79e..fc1de33e365 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -26,11 +27,6 @@ #include "monotone_constraints.hpp" #include "split_info.hpp" -// LGBM_CUDA -#ifdef USE_CUDA -#include -#endif - #ifdef USE_GPU // Use 4KBytes aligned allocator for ordered gradients and ordered hessians when GPU is enabled. // This is necessary to pin the two arrays in memory and make transferring faster. @@ -206,7 +202,7 @@ class SerialTreeLearner: public TreeLearner { std::vector> ordered_gradients_; /*! \brief hessians of current iteration, ordered for cache optimized, aligned to 4K page */ std::vector> ordered_hessians_; -#elif USE_CUDA // LGBM_CUDA +#elif USE_CUDA /*! \brief gradients of current iteration, ordered for cache optimized */ std::vector> ordered_gradients_; /*! \brief hessians of current iteration, ordered for cache optimized */ From 6fee44a9dfa63f24ca2c7db0fa967e0217f6951f Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 13 Jul 2020 20:39:11 +0000 Subject: [PATCH 090/119] More reviewers comments cleanup. --- src/treelearner/cuda_kernel_launcher.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/treelearner/cuda_kernel_launcher.h b/src/treelearner/cuda_kernel_launcher.h index 57c5f1bfc26..6b6b7cb4b01 100644 --- a/src/treelearner/cuda_kernel_launcher.h +++ b/src/treelearner/cuda_kernel_launcher.h @@ -2,11 +2,10 @@ * Copyright (c) 2020 IBM Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ -#ifndef LGBM_KERNEL_LAUNCHER -#define LGBM_KERNEL_LAUNCHER +#ifndef LIGHTGBM_TREELEARNER_CUDA_KERNEL_LAUNCHER_H_ +#define LIGHTGBM_TREELEARNER_CUDA_KERNEL_LAUNCHER_H_ #ifdef USE_CUDA -// what should I include?? 
#include #include "kernels/histogram_16_64_256.hu" // kernel, acc_type, data_size_t, uchar, score_t @@ -67,4 +66,4 @@ void cuda_histogram( #endif // USE_CUDA -#endif // LGBM_KERNEL_LAUNCHER +#endif // LIGHTGBM_TREELEARNER_CUDA_KERNEL_LAUNCHER_H_ From cc41446a8eccdf496a41904a2ea11839f6e10452 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Tue, 14 Jul 2020 13:19:14 +0000 Subject: [PATCH 091/119] More reviewers comments cleanup. --- src/treelearner/cuda_tree_learner.cpp | 7 ------- src/treelearner/cuda_tree_learner.h | 2 -- src/treelearner/kernels/histogram_16_64_256.cu | 13 ++----------- src/treelearner/kernels/histogram_16_64_256.hu | 13 ++----------- 4 files changed, 4 insertions(+), 31 deletions(-) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 4183019ed85..7a0a5d59093 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -163,7 +163,6 @@ int CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int d int CUDATreeLearner::GetNumWorkgroupsPerFeature(data_size_t leaf_num_data) { // we roughly want 256 workgroups per device, and we have num_dense_feature4_ feature tuples. // also guarantee that there are at least 2K examples per workgroup - double x = 256.0 / num_dense_feature_groups_; int exp_workgroups_per_feature = static_cast(ceil(log2(x))); @@ -186,7 +185,6 @@ void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featu // decide the best number of workgroups working on one feature4 tuple // set work group size based on feature size // each 2^exp_workgroups_per_feature workgroups work on a feature4 tuple - int exp_workgroups_per_feature = GetNumWorkgroupsPerFeature(leaf_num_data); std::vector num_gpu_workgroups; ThreadData *thread_data = reinterpret_cast(malloc(sizeof(ThreadData) * num_gpu_)); @@ -213,7 +211,6 @@ void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featu } /* Wait for the threads to finish */ - for (int device_id = 0; device_id < num_gpu_; ++device_id) { if (pthread_join(*(cpu_threads_[device_id]), NULL)) { fprintf(stderr, "Error in joining threads. 
Exiting\n"); @@ -439,7 +436,6 @@ void CUDATreeLearner::copyDenseFeature() { // LGBM_CUDA: InitGPU w/ num_gpu void CUDATreeLearner::InitGPU(int num_gpu) { // Get the max bin size, used for selecting best GPU kernel - max_num_bin_ = 0; #if GPU_DEBUG >= 1 @@ -676,7 +672,6 @@ bool CUDATreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int r data_size_t num_data_in_left_child = GetGlobalDataCountInLeaf(left_leaf); data_size_t num_data_in_right_child = GetGlobalDataCountInLeaf(right_leaf); - // only have root if (right_leaf < 0) { smaller_leaf = -1; @@ -742,7 +737,6 @@ bool CUDATreeLearner::ConstructGPUHistogramsAsync( #pragma omp parallel for schedule(static, 1024) reduction(+:used_dense_feature_groups) if (num_dense_feature_groups_ >= 2048) for (int i = 0; i < num_dense_feature_groups_; ++i) { if (is_feature_group_used[dense_feature_group_map_[i]]) { - // feature_masks_[i] = 1; feature_masks_[i] = is_feature_group_used[dense_feature_group_map_[i]]; ++used_dense_feature_groups; } else { @@ -903,7 +897,6 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ if (larger_leaf_histogram_array_ != nullptr && !use_subtract) { // construct larger leaf - hist_t* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - kHistOffset; is_gpu_used = ConstructGPUHistogramsAsync(is_feature_used, diff --git a/src/treelearner/cuda_tree_learner.h b/src/treelearner/cuda_tree_learner.h index 7385d2af5dc..ca063765b19 100644 --- a/src/treelearner/cuda_tree_learner.h +++ b/src/treelearner/cuda_tree_learner.h @@ -175,8 +175,6 @@ class CUDATreeLearner: public SerialTreeLearner { /*! \brief True if bagging is used */ bool use_bagging_; - /*! \brief GPU device object */ - // int* dev_; /*! \brief GPU command queue object */ std::vector stream_; diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index 994a3c94170..5a7fe5245b6 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -1,15 +1,6 @@ /*! - * ibmGBT: IBM CUDA Accelerated LightGBM - * - * IBM Confidential - * Copyright (c) 2019 IBM Corporation. All rights reserved. - * - * The source code for this program is not published or otherwise - * divested of its trade secrets, irrespective of what has been - * deposited with the U.S. Copyright Office. - * - * US Government Users Restricted Rights - Use, duplication or - * disclosure restricted by GSA ADP Schedule Contract with IBM Corp. + * Copyright (c) 2020 IBM Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. */ #include "histogram_16_64_256.hu" diff --git a/src/treelearner/kernels/histogram_16_64_256.hu b/src/treelearner/kernels/histogram_16_64_256.hu index a1c2744c624..e228d3b0068 100644 --- a/src/treelearner/kernels/histogram_16_64_256.hu +++ b/src/treelearner/kernels/histogram_16_64_256.hu @@ -1,15 +1,6 @@ /*! - * ibmGBT: IBM CUDA Accelerated LightGBM - * - * IBM Confidential - * Copyright (c) 2019 IBM Corporation. All rights reserved. - * - * The source code for this program is not published or otherwise - * divested of its trade secrets, irrespective of what has been - * deposited with the U.S. Copyright Office. - * - * US Government Users Restricted Rights - Use, duplication or - * disclosure restricted by GSA ADP Schedule Contract with IBM Corp. + * Copyright (c) 2020 IBM Corporation. All rights reserved. + * Licensed under the MIT License. 
See LICENSE file in the project root for license information. */ #ifndef LIGHTGBM_TREELEARNER_KERNELS_HISTOGRAM_16_64_256_HU_ From bab89cfbde0ffcf13992874f79705d141efe30df Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Tue, 14 Jul 2020 17:30:34 +0000 Subject: [PATCH 092/119] Fix config variables. --- include/LightGBM/config.h | 1 + src/io/config.cpp | 5 +++++ src/io/config_auto.cpp | 4 +--- src/treelearner/cuda_tree_learner.h | 1 - 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 36219675da5..fff98c437f1 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -954,6 +954,7 @@ struct Config { // desc = set this to ``true`` to use double precision math on GPU (by default single precision is used) bool gpu_use_dp = false; + // check = >0 // desc = number of gpus (CUDA implementation only) // desc = default value is 1 int num_gpu = 1; diff --git a/src/io/config.cpp b/src/io/config.cpp index b354bf10f03..4c65d158800 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -333,6 +333,11 @@ void Config::CheckParamConflict() { force_row_wise = false; } + // force gpu_use_dp for CUDA + if (device_type == std::string("cuda")) { + gpu_use_dp = true; + } + // min_data_in_leaf must be at least 2 if path smoothing is active. This is because when the split is calculated // the count is calculated using the proportion of hessian in the leaf which is rounded up to nearest int, so it can // be 1 when there is actually no data in the leaf. In rare cases this can cause a bug because with path smoothing the diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index d0bb97e1942..b35e8da49ab 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -606,9 +606,6 @@ void Config::GetMembersFromString(const std::unordered_map preallocd_max_num_wg_; /*! \brief True if bagging is used */ From ea96902b8dccbca957c8b53a9cdead1a1c6e5ae0 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Fri, 17 Jul 2020 14:53:30 +0000 Subject: [PATCH 093/119] Attempt to fix check-docs failure --- include/LightGBM/config.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index fff98c437f1..c24e9613a95 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -956,7 +956,6 @@ struct Config { // check = >0 // desc = number of gpus (CUDA implementation only) - // desc = default value is 1 int num_gpu = 1; #pragma endregion From 12a9fe50c0156e37fee7a9bb6f23a6883b12649e Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Fri, 17 Jul 2020 15:39:03 +0000 Subject: [PATCH 094/119] Update Paramster.rst for num_gpu --- docs/Parameters.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 01362fb9af3..5bd392d82f8 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -1104,6 +1104,10 @@ GPU Parameters - set this to ``true`` to use double precision math on GPU (by default single precision is used) +- ``num_gpu`` :raw-html:`🔗︎`, default = ``1``, type = int, constraints: ``num_gpu > 0`` + + - number of gpus (CUDA implementation only) + .. 
end params list Others From d712538d59880502922a939c6cf9187134b8e018 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 20 Jul 2020 12:00:20 +0000 Subject: [PATCH 095/119] Removing test appveyor.yml --- appveyor.yml | 50 -------------------------------------------------- 1 file changed, 50 deletions(-) delete mode 100644 appveyor.yml diff --git a/appveyor.yml b/appveyor.yml deleted file mode 100644 index b4c0131a9af..00000000000 --- a/appveyor.yml +++ /dev/null @@ -1,50 +0,0 @@ -version: 2.3.2.{build} - -image: Visual Studio 2015 -platform: x64 -configuration: # a trick to construct a build matrix with multiple Python versions - - 3.7 - -# only build pull requests and -# commits to 'cuda' -branches: - only: - - cuda - -environment: - matrix: - - COMPILER: MSVC - TASK: python - - COMPILER: MINGW - TASK: python - -clone_depth: 5 - -init: - - ps: iex ((new-object net.webclient).DownloadString('https://raw.githubusercontent.com/appveyor/ci/master/scripts/enable-rdp.ps1')) - -install: - - git submodule update --init --recursive # get `compute` folder - - set PATH=%PATH:C:\Program Files\Git\usr\bin;=% # delete sh.exe from PATH (mingw32-make fix) - - set PATH=C:\mingw-w64\x86_64-8.1.0-posix-seh-rt_v6-rev0\mingw64\bin;%PATH% - - set PYTHON_VERSION=%CONFIGURATION% - - set CONDA_ENV="test-env" - - ps: >- - switch ($env:PYTHON_VERSION) { - "2.7" {$env:MINICONDA = "C:\Miniconda-x64"} - "3.5" {$env:MINICONDA = "C:\Miniconda35-x64"} - "3.6" {$env:MINICONDA = "C:\Miniconda36-x64"} - "3.7" {$env:MINICONDA = "C:\Miniconda37-x64"} - default {$env:MINICONDA = "C:\Miniconda37-x64"} - } - $env:PATH="$env:MINICONDA;$env:MINICONDA\Scripts;$env:PATH" - - ps: $env:LGB_VER = (Get-Content $env:APPVEYOR_BUILD_FOLDER\VERSION.txt).trim() - -build: false - -test_script: - - conda init powershell - - powershell.exe -ExecutionPolicy Bypass -File %APPVEYOR_BUILD_FOLDER%\.ci\test_windows.ps1 - -on_finish: - - ps: $blockRdp = $true; iex ((new-object net.webclient).DownloadString('https://raw.githubusercontent.com/appveyor/ci/master/scripts/enable-rdp.ps1')) From 26c4dce683c773abc8749b2978cbc3c9d1f03e3c Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 20 Jul 2020 22:10:36 +0000 Subject: [PATCH 096/119] =?UTF-8?q?Add=20=C2=83CUDA=5FRESOLVE=5FDEVICE=5FS?= =?UTF-8?q?YMBOLS=20to=20libraries=20to=20fix=20linking=20issue.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index aae29463f94..f5ca69b4c9c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -381,10 +381,12 @@ if(USE_GPU) endif(USE_GPU) if(USE_CUDA) + set_property(TARGET lightgbm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) TARGET_LINK_LIBRARIES( lightgbm ${histograms} ) + set_property(TARGET _lightgbm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) TARGET_LINK_LIBRARIES( _lightgbm ${histograms} From 70b4bbb9c561077ac3f0fb6d6a09ac22350ff0c3 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Tue, 21 Jul 2020 16:29:44 +0000 Subject: [PATCH 097/119] Fixed handling of data elements less than 2K. 
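
With the < 2048-row CPU fallback gone, small datasets take the same CUDA path as large ones and rely on the existing per-leaf workgroup sizing to keep launches valid. A rough sketch of that sizing rule, following the comments in GetNumWorkgroupsPerFeature(); the 256 and ~2K constants come from those comments, while the helper signature and the exact clamping below are illustrative assumptions, not the real implementation:

    #include <algorithm>
    #include <cmath>

    // log2 of the number of workgroups assigned to one dense feature group
    // (assumes num_dense_feature_groups > 0, which the caller guarantees)
    int ExpWorkgroupsPerFeature(int num_dense_feature_groups, int leaf_num_data) {
      // aim for roughly 256 workgroups per device, spread over the feature groups
      int exp_wg = static_cast<int>(std::ceil(std::log2(256.0 / num_dense_feature_groups)));
      // but keep roughly 2K rows per workgroup, so small leaves use fewer workgroups
      exp_wg = std::min(exp_wg, static_cast<int>(std::ceil(std::log2(leaf_num_data / 2048.0))));
      return std::max(exp_wg, 0);  // tiny leaves collapse to one workgroup per feature
    }
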
--- src/application/application.cpp | 3 --- src/c_api.cpp | 10 ---------- src/treelearner/cuda_tree_learner.cpp | 10 +++++----- 3 files changed, 5 insertions(+), 18 deletions(-) diff --git a/src/application/application.cpp b/src/application/application.cpp index e88f5c86188..2d3facdb978 100644 --- a/src/application/application.cpp +++ b/src/application/application.cpp @@ -43,10 +43,7 @@ Application::Application(int argc, char** argv) { #ifdef USE_CUDA if (config_.device_type == std::string("cuda")) { LightGBM::LGBM_config_::current_device = lgbm_device_cuda; - config_.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ - if (config_.bagging_fraction == 1.0) { config_.bagging_fraction = 0.8; } - if (config_.bagging_freq == 0) { config_.bagging_freq = 1; } } #endif } diff --git a/src/c_api.cpp b/src/c_api.cpp index 050e605b268..caa9ed577fd 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -44,10 +44,7 @@ inline void AdditionalConfig(Config *config) { #ifdef USE_CUDA if (config->device_type == std::string("cuda")) { LightGBM::LGBM_config_::current_device = lgbm_device_cuda; - config->is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ - if (config->bagging_fraction == 1.0) { config->bagging_fraction = 0.8; } - if (config->bagging_freq == 0) { config->bagging_freq = 1; } } #else (void)(config); // UNUSED @@ -126,13 +123,6 @@ class Booster { omp_set_num_threads(config_.num_threads); } -#ifdef USE_CUDA - // Only use CUDA when the data is large (2048 == 256 bins each with at least 8 elements) - if (train_data->num_data() < 2048) { - config_.device_type = std::string("cpu"); - } -#endif - AdditionalConfig(&config_); // create boosting diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 7a0a5d59093..9bc140f1021 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -104,7 +104,7 @@ int CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int d printf("Comparing Histograms, feature_id = %d, size = %d\n", feature_id, static_cast(size)); if (dp_flag) { // double precision double af, bf; - int64 ai, bi; + int64_t ai, bi; for (i = 0; i < static_cast(size); ++i) { af = GET_GRAD(h1, i); bf = GET_GRAD(h2, i); @@ -113,10 +113,10 @@ int CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int d ++retval; } if (const_flag) { - ai = GET_HESS((reinterpret_cast(h1), i); - bi = GET_HESS((reinterpret_cast(h2), i); + ai = GET_HESS((reinterpret_cast(h1)), i); + bi = GET_HESS((reinterpret_cast(h2)), i); if (ai != bi) { - printf("i = %5d, h1.hess %lld, h2.hess %lld\n", i, ai, bi); + printf("i = %5d, h1.hess %lld, h2.hess %lld\n", i, (long long int) ai, (long long int) bi); ++retval; } } else { @@ -743,7 +743,7 @@ bool CUDATreeLearner::ConstructGPUHistogramsAsync( feature_masks_[i] = 0; } } - bool use_all_features = used_dense_feature_groups == num_dense_feature_groups_; + bool use_all_features = ((used_dense_feature_groups == num_dense_feature_groups_) && (data_indices != nullptr)); // if no feature group is used, just return and do not use GPU if (used_dense_feature_groups == 0) { return false; From e7f45f5ce0582454a2ed58a0dfcc35dc652a97fd Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Tue, 21 Jul 2020 17:15:27 +0000 Subject: [PATCH 098/119] More reviewers comments cleanup. 
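
The deleted comment blocks (and the leftover membar.gl inline asm) date from the OpenCL version; in CUDA the cross-workgroup hand-off they describe is simply __threadfence() plus a global atomic counter, which the kernels already use. For reference, a self-contained sketch of that last-block-reduces idiom; the kernel, buffer and counter names below are invented for the example and do not appear in this patch:

    __global__ void partial_then_final(const float* partials, float* out,
                                       unsigned int* counter /* zero-initialized */) {
      // ...assume each block has already written its partial result to partials[blockIdx.x]...
      __threadfence();                  // make this block's global writes visible to other blocks
      __shared__ bool am_last;
      if (threadIdx.x == 0) {
        // atomicInc returns the previous value, so the block that sees gridDim.x - 1
        // is the last one to reach this point
        am_last = (atomicInc(counter, gridDim.x) == gridDim.x - 1);
      }
      __syncthreads();
      if (am_last && threadIdx.x == 0) {
        float sum = 0.0f;
        for (unsigned int b = 0; b < gridDim.x; ++b) sum += partials[b];
        *out = sum;                     // only the last block performs the final reduction
        *counter = 0;                   // reset so the kernel can be launched again
      }
    }
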
--- .../kernels/histogram_16_64_256.cu | 39 ------------------- 1 file changed, 39 deletions(-) diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index 5a7fe5245b6..e6fceeb8cd4 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -294,19 +294,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, } __syncthreads(); __threadfence(); - // To avoid the cost of an extra reducting kernel, we have to deal with some - // gray area in OpenCL. We want the last work group that process this feature to - // make the final reduction, and other threads will just quit. - // This requires that the results written by other workgroups available to the - // last workgroup (memory consistency) - // this is equavalent to CUDA __threadfence(); - // ensure the writes above goes to main memory and other workgroups can see it - asm volatile("{\n\tmembar.gl;\n\t}\n\t" :::"memory"); - // Now, we want one workgroup to do the final reduction. - // Other workgroups processing the same feature quit. - // The is done by using an global atomic counter. - // On AMD GPUs ideally this should be done in GDS, - // but currently there is no easy way to access it via OpenCL. uint * counter_val = cnt_hist; // backup the old value uint old_val = *counter_val; @@ -624,19 +611,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, } __syncthreads(); __threadfence(); - // To avoid the cost of an extra reducting kernel, we have to deal with some - // gray area in OpenCL. We want the last work group that process this feature to - // make the final reduction, and other threads will just quit. - // This requires that the results written by other workgroups available to the - // last workgroup (memory consistency) - // this is equavalent to CUDA __threadfence(); - // ensure the writes above goes to main memory and other workgroups can see it - asm volatile("{\n\tmembar.gl;\n\t}\n\t" :::"memory"); - // Now, we want one workgroup to do the final reduction. - // Other workgroups processing the same feature quit. - // The is done by using an global atomic counter. - // On AMD GPUs ideally this should be done in GDS, - // but currently there is no easy way to access it via OpenCL. uint * counter_val = cnt_hist; // backup the old value uint old_val = *counter_val; @@ -955,19 +929,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, } __syncthreads(); __threadfence(); - // To avoid the cost of an extra reducting kernel, we have to deal with some - // gray area in OpenCL. We want the last work group that process this feature to - // make the final reduction, and other threads will just quit. - // This requires that the results written by other workgroups available to the - // last workgroup (memory consistency) - // this is equavalent to CUDA __threadfence(); - // ensure the writes above goes to main memory and other workgroups can see it - asm volatile("{\n\tmembar.gl;\n\t}\n\t" :::"memory"); - // Now, we want one workgroup to do the final reduction. - // Other workgroups processing the same feature quit. - // The is done by using an global atomic counter. - // On AMD GPUs ideally this should be done in GDS, - // but currently there is no easy way to access it via OpenCL. 
uint * counter_val = cnt_hist; // backup the old value uint old_val = *counter_val; From 282731cbc7497da848ba3cc70197a08c482abbf8 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Wed, 22 Jul 2020 12:47:20 +0000 Subject: [PATCH 099/119] Removal of TODO and fix printing of int64_t --- src/treelearner/cuda_tree_learner.cpp | 3 ++- src/treelearner/kernels/histogram_16_64_256.cu | 3 --- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 9bc140f1021..f4387f7c6a8 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -15,6 +15,7 @@ #include #include +#include #include "../io/dense_bin.hpp" @@ -116,7 +117,7 @@ int CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int d ai = GET_HESS((reinterpret_cast(h1)), i); bi = GET_HESS((reinterpret_cast(h2)), i); if (ai != bi) { - printf("i = %5d, h1.hess %lld, h2.hess %lld\n", i, (long long int) ai, (long long int) bi); + printf("i = %5d, h1.hess %" PRId64 ", h2.hess %" PRId64 "\n", i, ai, bi); ++retval; } } else { diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index e6fceeb8cd4..09d563cbaf4 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -41,7 +41,6 @@ inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ acc_type* __restrict__ local_hist, const size_t power_feature_workgroups) { const ushort ltid = threadIdx.x; - // TODO(anyone): try to avoid bank conflict here acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; uint* __restrict__ local_cnt = reinterpret_cast(local_hist + 2 * NUM_BINS); @@ -360,7 +359,6 @@ inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ acc_type* __restrict__ local_hist, const size_t power_feature_workgroups) { const ushort ltid = threadIdx.x; - // TODO(anyone): try to avoid bank conflict here acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; uint* __restrict__ local_cnt = reinterpret_cast(local_hist + 2 * NUM_BINS); @@ -677,7 +675,6 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ acc_type* __restrict__ local_hist, const size_t power_feature_workgroups) { const ushort ltid = threadIdx.x; - // TODO(anyone): try to avoid bank conflict here acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; uint* __restrict__ local_cnt = reinterpret_cast(local_hist + 2 * NUM_BINS); From 6103a87817cb4e671a22feb6c8eb315ae8a2df76 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Wed, 22 Jul 2020 16:07:46 +0000 Subject: [PATCH 100/119] Add cuda change for CI testing and remove cuda from device_type in python. 
--- .ci/test.sh | 10 +++ include/LightGBM/c_api.h | 6 -- python-package/lightgbm/__init__.py | 3 +- python-package/lightgbm/basic.py | 5 -- src/c_api.cpp | 10 --- tests/python_package_test/test_basic.py | 2 - tests/python_package_test/test_consistency.py | 4 - tests/python_package_test/test_engine.py | 85 ------------------- tests/python_package_test/test_plotting.py | 2 - tests/python_package_test/test_sklearn.py | 6 -- 10 files changed, 11 insertions(+), 122 deletions(-) diff --git a/.ci/test.sh b/.ci/test.sh index 7c68ca733fe..c12e019ccb7 100755 --- a/.ci/test.sh +++ b/.ci/test.sh @@ -132,6 +132,16 @@ if [[ $TASK == "gpu" ]]; then exit 0 fi cmake -DUSE_GPU=ON -DOpenCL_INCLUDE_DIR=$AMDAPPSDK_PATH/include/ .. +elif [[ $TASK == "cuda" ]]; then + sed -i'.bak' 's/std::string device_type = "cpu";/std::string device_type = "cuda";/' $BUILD_DIRECTORY/include/LightGBM/config.h + grep -q 'std::string device_type = "cuda"' $BUILD_DIRECTORY/include/LightGBM/config.h || exit -1 # make sure that changes were really done + if [[ $METHOD == "pip" ]]; then + cd $BUILD_DIRECTORY/python-package && python setup.py sdist || exit -1 + pip install --user $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER.tar.gz -v --install-option=--cuda || exit -1 + pytest $BUILD_DIRECTORY/tests/python_package_test || exit -1 + exit 0 + fi + cmake -DUSE_CUDA=ON .. elif [[ $TASK == "mpi" ]]; then if [[ $METHOD == "pip" ]]; then cd $BUILD_DIRECTORY/python-package && python setup.py sdist || exit -1 diff --git a/include/LightGBM/c_api.h b/include/LightGBM/c_api.h index 3fbccdac075..9d7c6e61dd2 100644 --- a/include/LightGBM/c_api.h +++ b/include/LightGBM/c_api.h @@ -1076,12 +1076,6 @@ LIGHTGBM_C_EXPORT int LGBM_NetworkInitWithFunctions(int num_machines, #define THREAD_LOCAL thread_local /*!< \brief Thread local specifier. */ #endif -/*! - * * \brief Returns device type. - * * \return 0 = CPU, 1 = GPU / OCL, 2 = CUDA - * */ -LIGHTGBM_C_EXPORT int LGBM_GetDeviceType(); - /*! * \brief Handle of error message. * \return Error message diff --git a/python-package/lightgbm/__init__.py b/python-package/lightgbm/__init__.py index 44a56ae03f5..390a6994a7a 100644 --- a/python-package/lightgbm/__init__.py +++ b/python-package/lightgbm/__init__.py @@ -5,7 +5,7 @@ """ from __future__ import absolute_import -from .basic import Booster, Dataset, get_device_type +from .basic import Booster, Dataset from .callback import (early_stopping, print_evaluation, record_evaluation, reset_parameter) from .engine import cv, train @@ -30,7 +30,6 @@ __version__ = version_file.read().strip() __all__ = ['Dataset', 'Booster', - 'get_device_type', 'train', 'cv', 'LGBMModel', 'LGBMRegressor', 'LGBMClassifier', 'LGBMRanker', 'print_evaluation', 'record_evaluation', 'reset_parameter', 'early_stopping', diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 9dace6b768c..01a5f31e51b 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -432,11 +432,6 @@ def _load_pandas_categorical(file_name=None, model_str=None): return None -def get_device_type(): - """Get device type.""" - return _LIB.LGBM_GetDeviceType() - - class _InnerPredictor(object): """_InnerPredictor of LightGBM. 
diff --git a/src/c_api.cpp b/src/c_api.cpp index caa9ed577fd..84d0a25ab08 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -627,16 +627,6 @@ const char* LGBM_GetLastError() { return LastErrorMsg(); } -int LGBM_GetDeviceType() { -#ifdef USE_GPU - return 1; -#elif USE_CUDA - return 2; -#else - return 0; // CPU -#endif -} - int LGBM_RegisterLogCallback(void (*callback)(const char*)) { API_BEGIN(); Log::ResetCallBack(callback); diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index d984c25f65f..85e9e728d70 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -29,8 +29,6 @@ def test(self): "max_bin": 255, "gpu_use_dp": True } - if lgb.get_device_type() == 2: - params["device"] = "cuda" bst = lgb.Booster(params, train_data) bst.add_valid(valid_data, "valid_1") diff --git a/tests/python_package_test/test_consistency.py b/tests/python_package_test/test_consistency.py index f6e955ee48d..63a5834cf61 100644 --- a/tests/python_package_test/test_consistency.py +++ b/tests/python_package_test/test_consistency.py @@ -68,8 +68,6 @@ class TestEngine(unittest.TestCase): def test_binary(self): fd = FileLoader('../../examples/binary_classification', 'binary') - if lgb.get_device_type() == 2: - fd.params["device"] = "cuda" X_train, y_train, _ = fd.load_dataset('.train') X_test, _, X_test_fn = fd.load_dataset('.test') weight_train = fd.load_field('.train.weight') @@ -93,8 +91,6 @@ def test_multiclass(self): def test_regression(self): fd = FileLoader('../../examples/regression', 'regression') - if lgb.get_device_type() == 2: - fd.params["device"] = "cuda" X_train, y_train, _ = fd.load_dataset('.train') X_test, _, X_test_fn = fd.load_dataset('.test') init_score_train = fd.load_field('.train.init') diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index b5de6a9a4c7..286bd2e2a8d 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -61,8 +61,6 @@ def test_binary(self): 'verbose': -1, 'num_iteration': 50 # test num_iteration in dict here } - if lgb.get_device_type() == 2: - params["device"] = "cuda" lgb_train = lgb.Dataset(X_train, y_train) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) evals_result = {} @@ -89,8 +87,6 @@ def test_rf(self): 'metric': 'binary_logloss', 'verbose': -1 } - if lgb.get_device_type() == 2: - params["device"] = "cuda" lgb_train = lgb.Dataset(X_train, y_train) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) evals_result = {} @@ -110,8 +106,6 @@ def test_regression(self): 'metric': 'l2', 'verbose': -1 } - if lgb.get_device_type() == 2: - params["device"] = "cuda" lgb_train = lgb.Dataset(X_train, y_train) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) evals_result = {} @@ -139,8 +133,6 @@ def test_missing_value_handle(self): 'verbose': -1, 'boost_from_average': False } - if lgb.get_device_type() == 2: - params["device"] = "cuda" evals_result = {} gbm = lgb.train(params, lgb_train, num_boost_round=20, @@ -196,8 +188,6 @@ def test_missing_value_handle_na(self): 'min_data_in_bin': 1, 'zero_as_missing': False } - if lgb.get_device_type() == 2: - params["device"] = "cuda" evals_result = {} gbm = lgb.train(params, lgb_train, num_boost_round=1, @@ -230,8 +220,6 @@ def test_missing_value_handle_zero(self): 'min_data_in_bin': 1, 'zero_as_missing': True } - if lgb.get_device_type() == 2: - params["device"] = "cuda" evals_result = {} gbm = lgb.train(params, lgb_train, 
num_boost_round=1, @@ -264,8 +252,6 @@ def test_missing_value_handle_none(self): 'min_data_in_bin': 1, 'use_missing': False } - if lgb.get_device_type() == 2: - params["device"] = "cuda" evals_result = {} gbm = lgb.train(params, lgb_train, num_boost_round=1, @@ -304,8 +290,6 @@ def test_categorical_handle(self): 'zero_as_missing': True, 'categorical_column': 0 } - if lgb.get_device_type() == 2: - params["device"] = "cuda" evals_result = {} gbm = lgb.train(params, lgb_train, num_boost_round=1, @@ -343,8 +327,6 @@ def test_categorical_handle_na(self): 'zero_as_missing': False, 'categorical_column': 0 } - if lgb.get_device_type() == 2: - params["device"] = "cuda" evals_result = {} gbm = lgb.train(params, lgb_train, num_boost_round=1, @@ -403,8 +385,6 @@ def test_multiclass(self): 'num_class': 10, 'verbose': -1 } - if lgb.get_device_type() == 2: - params["device"] = "cuda" lgb_train = lgb.Dataset(X_train, y_train, params=params) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, params=params) evals_result = {} @@ -421,7 +401,6 @@ def test_multiclass_rf(self): X, y = load_digits(10, True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) params = { - 'device': 'cpu', 'boosting_type': 'rf', 'objective': 'multiclass', 'metric': 'multi_logloss', @@ -455,8 +434,6 @@ def test_multiclass_prediction_early_stopping(self): 'num_class': 10, 'verbose': -1 } - if lgb.get_device_type() == 2: - params["device"] = "cuda" lgb_train = lgb.Dataset(X_train, y_train, params=params) gbm = lgb.train(params, lgb_train, num_boost_round=50) @@ -478,8 +455,6 @@ def test_multi_class_error(self): X, y = load_digits(10, True) params = {'objective': 'multiclass', 'num_classes': 10, 'metric': 'multi_error', 'num_leaves': 4, 'verbose': -1} - if lgb.get_device_type() == 2: - params["device"] = "cuda" lgb_data = lgb.Dataset(X, label=y) est = lgb.train(params, lgb_data, num_boost_round=10) predict_default = est.predict(X) @@ -589,8 +564,6 @@ def test_early_stopping(self): 'metric': 'binary_logloss', 'verbose': -1 } - if lgb.get_device_type() == 2: - params["device"] = "cuda" X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) lgb_train = lgb.Dataset(X_train, y_train) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) @@ -624,8 +597,6 @@ def test_continue_train(self): 'metric': 'l1', 'verbose': -1 } - if lgb.get_device_type() == 2: - params["device"] = "cuda" lgb_train = lgb.Dataset(X_train, y_train, free_raw_data=False) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, free_raw_data=False) init_gbm = lgb.train(params, lgb_train, num_boost_round=20) @@ -691,8 +662,6 @@ def test_continue_train_multiclass(self): 'num_class': 3, 'verbose': -1 } - if lgb.get_device_type() == 2: - params["device"] = "cuda" lgb_train = lgb.Dataset(X_train, y_train, params=params, free_raw_data=False) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, params=params, free_raw_data=False) init_gbm = lgb.train(params, lgb_train, num_boost_round=20) @@ -749,8 +718,6 @@ def test_cv(self): q_train = np.loadtxt(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/lambdarank/rank.train.query')) params_lambdarank = {'objective': 'lambdarank', 'verbose': -1, 'eval_at': 3} - if lgb.get_device_type() == 2: - params_lambdarank["device"] = "cuda" lgb_train = lgb.Dataset(X_train, y_train, group=q_train) # ... 
with l2 metric cv_res_lambda = lgb.cv(params_lambdarank, lgb_train, num_boost_round=10, nfold=3, @@ -804,8 +771,6 @@ def train_and_predict(init_model=None, return_model=False): 'metric': 'l2', 'verbose': -1 } - if lgb.get_device_type() == 2: - params["device"] = "cuda" lgb_train = lgb.Dataset(X_train, y_train) gbm_template = lgb.train(params, lgb_train, num_boost_round=10, init_model=init_model) return gbm_template if return_model else mean_squared_error(y_test, gbm_template.predict(X_test)) @@ -859,8 +824,6 @@ def test_pandas_categorical(self): 'metric': 'binary_logloss', 'verbose': -1 } - if lgb.get_device_type() == 2: - params["device"] = "cuda" lgb_train = lgb.Dataset(X, y) gbm0 = lgb.train(params, lgb_train, num_boost_round=10) pred0 = gbm0.predict(X_test) @@ -957,8 +920,6 @@ def test_reference_chain(self): tmp_dat_train = tmp_dat.subset(np.arange(80)) tmp_dat_val = tmp_dat.subset(np.arange(80, 100)).subset(np.arange(18)) params = {'objective': 'regression_l2', 'metric': 'rmse'} - if lgb.get_device_type() == 2: - params["device"] = "cuda" evals_result = {} gbm = lgb.train(params, tmp_dat_train, num_boost_round=20, valid_sets=[tmp_dat_train, tmp_dat_val], @@ -974,8 +935,6 @@ def test_contribs(self): 'metric': 'binary_logloss', 'verbose': -1, } - if lgb.get_device_type() == 2: - params["device"] = "cuda" lgb_train = lgb.Dataset(X_train, y_train) gbm = lgb.train(params, lgb_train, num_boost_round=20) @@ -990,8 +949,6 @@ def train_and_get_predictions(features, labels): 'verbose': -1, 'min_data': 5, } - if lgb.get_device_type() == 2: - lgb_params["device"] = "cuda" gbm = lgb.train( params=lgb_params, train_set=dataset, @@ -1282,8 +1239,6 @@ def test_mape_rf(self): 'feature_fraction': 0.8, 'boost_from_average': True } - if lgb.get_device_type() == 2: - params["device"] = "cuda" lgb_train = lgb.Dataset(X, y) gbm = lgb.train(params, lgb_train, num_boost_round=20) pred = gbm.predict(X) @@ -1301,8 +1256,6 @@ def test_mape_dart(self): 'feature_fraction': 0.8, 'boost_from_average': False } - if lgb.get_device_type() == 2: - params["device"] = "cuda" lgb_train = lgb.Dataset(X, y) gbm = lgb.train(params, lgb_train, num_boost_round=40) pred = gbm.predict(X) @@ -1322,8 +1275,6 @@ def check_constant_features(self, y_true, expected_pred, more_params): 'min_data_in_bin': 1, 'boost_from_average': True } - if lgb.get_device_type() == 2: - params["device"] = "cuda" params.update(more_params) lgb_train = lgb.Dataset(X_train, y_train, params=params) gbm = lgb.train(params, lgb_train, num_boost_round=2) @@ -1334,8 +1285,6 @@ def test_constant_features_regression(self): params = { 'objective': 'regression' } - if lgb.get_device_type() == 2: - params["device"] = "cuda" self.check_constant_features([0.0, 10.0, 0.0, 10.0], 5.0, params) self.check_constant_features([0.0, 1.0, 2.0, 3.0], 1.5, params) self.check_constant_features([-1.0, 1.0, -2.0, 2.0], 0.0, params) @@ -1344,8 +1293,6 @@ def test_constant_features_binary(self): params = { 'objective': 'binary' } - if lgb.get_device_type() == 2: - params["device"] = "cuda" self.check_constant_features([0.0, 10.0, 0.0, 10.0], 0.5, params) self.check_constant_features([0.0, 1.0, 2.0, 3.0], 0.75, params) @@ -1354,8 +1301,6 @@ def test_constant_features_multiclass(self): 'objective': 'multiclass', 'num_class': 3 } - if lgb.get_device_type() == 2: - params["device"] = "cuda" self.check_constant_features([0.0, 1.0, 2.0, 0.0], [0.5, 0.25, 0.25], params) self.check_constant_features([0.0, 1.0, 2.0, 1.0], [0.25, 0.5, 0.25], params) @@ -1364,8 +1309,6 @@ def 
test_constant_features_multiclassova(self): 'objective': 'multiclassova', 'num_class': 3 } - if lgb.get_device_type() == 2: - params["device"] = "cuda" self.check_constant_features([0.0, 1.0, 2.0, 0.0], [0.5, 0.25, 0.25], params) self.check_constant_features([0.0, 1.0, 2.0, 1.0], [0.25, 0.5, 0.25], params) @@ -1385,8 +1328,6 @@ def preprocess_data(dtrain, dtest, params): X, y = load_iris(True) dataset = lgb.Dataset(X, y, free_raw_data=False) params = {'objective': 'multiclass', 'num_class': 3, 'verbose': -1} - if lgb.get_device_type() == 2: - params["device"] = "cuda" results = lgb.cv(params, dataset, num_boost_round=10, fpreproc=preprocess_data) self.assertIn('multi_logloss-mean', results) self.assertEqual(len(results['multi_logloss-mean']), 10) @@ -1399,28 +1340,14 @@ def test_metrics(self): evals_result = {} params_verbose = {'verbose': -1} - if lgb.get_device_type() == 2: - params_verbose["device"] = "cuda" params_obj_verbose = {'objective': 'binary', 'verbose': -1} - if lgb.get_device_type() == 2: - params_obj_verbose["device"] = "cuda" params_obj_metric_log_verbose = {'objective': 'binary', 'metric': 'binary_logloss', 'verbose': -1} - if lgb.get_device_type() == 2: - params_obj_metric_log_verbose["device"] = "cuda" params_obj_metric_err_verbose = {'objective': 'binary', 'metric': 'binary_error', 'verbose': -1} - if lgb.get_device_type() == 2: - params_obj_metric_err_verbose["device"] = "cuda" params_obj_metric_inv_verbose = {'objective': 'binary', 'metric': 'invalid_metric', 'verbose': -1} - if lgb.get_device_type() == 2: - params_obj_metric_inv_verbose["device"] = "cuda" params_obj_metric_multi_verbose = {'objective': 'binary', 'metric': ['binary_logloss', 'binary_error'], 'verbose': -1} - if lgb.get_device_type() == 2: - params_obj_metric_multi_verbose["device"] = "cuda" params_obj_metric_none_verbose = {'objective': 'binary', 'metric': 'None', 'verbose': -1} - if lgb.get_device_type() == 2: - params_obj_metric_none_verbose["device"] = "cuda" params_metric_log_verbose = {'metric': 'binary_logloss', 'verbose': -1} params_metric_err_verbose = {'metric': 'binary_error', 'verbose': -1} params_metric_inv_verbose = {'metric_types': 'invalid_metric', 'verbose': -1} @@ -1638,8 +1565,6 @@ def train_booster(params=params_obj_verbose, **kwargs): # remove default metric by 'None' aliases for na_alias in ('None', 'na', 'null', 'custom'): params = {'objective': 'binary', 'metric': na_alias, 'verbose': -1} - if lgb.get_device_type() == 2: - params["device"] = "cuda" train_booster(params=params) self.assertEqual(len(evals_result), 0) @@ -1720,14 +1645,8 @@ def train_booster(params=params_obj_verbose, **kwargs): obj_multi_aliases = ['multiclass', 'softmax', 'multiclassova', 'multiclass_ova', 'ova', 'ovr'] for obj_multi_alias in obj_multi_aliases: params_obj_class_3_verbose = {'objective': obj_multi_alias, 'num_class': 3, 'verbose': -1} - if lgb.get_device_type() == 2: - params_obj_class_3_verbose["device"] = "cuda" params_obj_class_1_verbose = {'objective': obj_multi_alias, 'num_class': 1, 'verbose': -1} - if lgb.get_device_type() == 2: - params_obj_class_1_verbose["device"] = "cuda" params_obj_verbose = {'objective': obj_multi_alias, 'verbose': -1} - if lgb.get_device_type() == 2: - params_obj_verbose["device"] = "cuda" # multiclass default metric res = get_cv_result(params_obj_class_3_verbose) self.assertEqual(len(res), 2) @@ -1768,8 +1687,6 @@ def train_booster(params=params_obj_verbose, **kwargs): self.assertRaises(lgb.basic.LightGBMError, get_cv_result, params_obj_class_3_verbose, 
metrics='binary_logloss') params_class_3_verbose = {'num_class': 3, 'verbose': -1} - if lgb.get_device_type() == 2: - params_class_3_verbose["device"] = "cuda" # non-default num_class for default objective self.assertRaises(lgb.basic.LightGBMError, get_cv_result, params_class_3_verbose) @@ -1904,8 +1821,6 @@ def metrics_combination_train_regression(valid_sets, metric_list, assumed_iterat 'verbose': -1, 'seed': 123 } - if lgb.get_device_type() == 2: - params["device"] = "cuda" gbm = lgb.train(dict(params, first_metric_only=first_metric_only), lgb_train, num_boost_round=25, valid_sets=valid_sets, feval=feval, early_stopping_rounds=5, verbose_eval=False) diff --git a/tests/python_package_test/test_plotting.py b/tests/python_package_test/test_plotting.py index 13ba9859d97..72915914fe1 100644 --- a/tests/python_package_test/test_plotting.py +++ b/tests/python_package_test/test_plotting.py @@ -24,8 +24,6 @@ def setUp(self): "verbose": -1, "num_leaves": 3 } - if lgb.get_device_type() == 2: - self.params["device"] = "cuda" @unittest.skipIf(not MATPLOTLIB_INSTALLED, 'matplotlib is not installed') def test_plot_importance(self): diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index 350f3c8f486..cd50805a70b 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -453,8 +453,6 @@ def test_evaluate_train_set(self): def test_metrics(self): X, y = load_boston(True) params = {'n_estimators': 2, 'verbose': -1} - if lgb.get_device_type() == 2: - params["device"] = "cuda" params_fit = {'X': X, 'y': y, 'eval_set': (X, y), 'verbose': False} # no custom objective, no custom metric @@ -711,8 +709,6 @@ def test_inf_handle(self): y = np.random.randn(nrows) + np.full(nrows, 1e30) weight = np.full(nrows, 1e10) params = {'n_estimators': 20, 'verbose': -1} - if lgb.get_device_type() == 2: - params["device"] = "cuda" params_fit = {'X': X, 'y': y, 'sample_weight': weight, 'eval_set': (X, y), 'verbose': False, 'early_stopping_rounds': 5} gbm = lgb.LGBMRegressor(**params).fit(**params_fit) @@ -725,8 +721,6 @@ def test_nan_handle(self): y = np.random.randn(nrows) + np.full(nrows, 1e30) weight = np.zeros(nrows) params = {'n_estimators': 20, 'verbose': -1} - if lgb.get_device_type() == 2: - params["device"] = "cuda" params_fit = {'X': X, 'y': y, 'sample_weight': weight, 'eval_set': (X, y), 'verbose': False, 'early_stopping_rounds': 5} gbm = lgb.LGBMRegressor(**params).fit(**params_fit) From 40e37e872d7c953081eea4aac51834fd0c2ed0a3 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Wed, 22 Jul 2020 16:15:06 +0000 Subject: [PATCH 101/119] Missed one change form previous check-in --- tests/python_package_test/test_engine.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 286bd2e2a8d..dc48fc9d3a3 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1215,7 +1215,6 @@ def test_refit(self): X, y = load_breast_cancer(True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) params = { - 'device': 'cpu', 'objective': 'binary', 'metric': 'binary_logloss', 'verbose': -1, From 8878ea47f875d7b7f7b83a732f07bb9c3a08b1e8 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 27 Jul 2020 11:44:12 +0000 Subject: [PATCH 102/119] Removal AdditionConfig and fix settings. 
--- src/application/application.cpp | 1 - src/c_api.cpp | 44 --------------------------------- src/io/config.cpp | 7 ++++++ src/io/dataset.cpp | 4 ++- 4 files changed, 10 insertions(+), 46 deletions(-) diff --git a/src/application/application.cpp b/src/application/application.cpp index 2d3facdb978..4d1e8b1866b 100644 --- a/src/application/application.cpp +++ b/src/application/application.cpp @@ -43,7 +43,6 @@ Application::Application(int argc, char** argv) { #ifdef USE_CUDA if (config_.device_type == std::string("cuda")) { LightGBM::LGBM_config_::current_device = lgbm_device_cuda; - config_.is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ } #endif } diff --git a/src/c_api.cpp b/src/c_api.cpp index 84d0a25ab08..7a36737352a 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include @@ -40,17 +39,6 @@ inline int LGBM_APIHandleException(const std::string& ex) { return -1; } -inline void AdditionalConfig(Config *config) { -#ifdef USE_CUDA - if (config->device_type == std::string("cuda")) { - LightGBM::LGBM_config_::current_device = lgbm_device_cuda; - config->is_enable_sparse = false; /* LGBM_CUDA setting is_enable_sparse to FALSE (default is true) */ - } -#else - (void)(config); // UNUSED -#endif -} - #define API_BEGIN() try { #define API_END() } \ catch(std::exception& ex) { return LGBM_APIHandleException(ex); } \ @@ -123,8 +111,6 @@ class Booster { omp_set_num_threads(config_.num_threads); } - AdditionalConfig(&config_); - // create boosting if (config_.input_model.size() > 0) { Log::Warning("Continued train from model is not supported for c_api,\n" @@ -318,8 +304,6 @@ class Booster { omp_set_num_threads(config_.num_threads); } - AdditionalConfig(&config_); - if (param.count("objective")) { // create objective function objective_fun_.reset(ObjectiveFunction::CreateObjectiveFunction(config_.objective, @@ -645,8 +629,6 @@ int LGBM_DatasetCreateFromFile(const char* filename, omp_set_num_threads(config.num_threads); } - AdditionalConfig(&config); - DatasetLoader loader(config, nullptr, 1, filename); if (reference == nullptr) { if (Network::num_machines() == 1) { @@ -678,8 +660,6 @@ int LGBM_DatasetCreateFromSampledColumn(double** sample_data, omp_set_num_threads(config.num_threads); } - AdditionalConfig(&config); - DatasetLoader loader(config, nullptr, 1, nullptr); *out = loader.CostructFromSampleData(sample_data, sample_indices, ncol, num_per_col, num_sample_row, @@ -792,8 +772,6 @@ int LGBM_DatasetCreateFromMats(int32_t nmat, omp_set_num_threads(config.num_threads); } - AdditionalConfig(&config); - std::unique_ptr ret; int32_t total_nrow = 0; for (int j = 0; j < nmat; ++j) { @@ -886,8 +864,6 @@ int LGBM_DatasetCreateFromCSR(const void* indptr, omp_set_num_threads(config.num_threads); } - AdditionalConfig(&config); - std::unique_ptr ret; auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, nelem); int32_t nrow = static_cast(nindptr - 1); @@ -956,8 +932,6 @@ int LGBM_DatasetCreateFromCSRFunc(void* get_row_funptr, omp_set_num_threads(config.num_threads); } - AdditionalConfig(&config); - std::unique_ptr ret; int32_t nrow = num_rows; if (reference == nullptr) { @@ -1030,8 +1004,6 @@ int LGBM_DatasetCreateFromCSC(const void* col_ptr, omp_set_num_threads(config.num_threads); } - AdditionalConfig(&config); - std::unique_ptr ret; int32_t nrow = static_cast(num_row); if (reference == nullptr) { @@ -1116,8 +1088,6 @@ int LGBM_DatasetGetSubset( 
omp_set_num_threads(config.num_threads); } - AdditionalConfig(&config); - auto full_dataset = reinterpret_cast(handle); CHECK_GT(num_used_row_indices, 0); const int32_t lower = 0; @@ -1514,8 +1484,6 @@ int LGBM_BoosterPredictForFile(BoosterHandle handle, omp_set_num_threads(config.num_threads); } - AdditionalConfig(&config); - Booster* ref_booster = reinterpret_cast(handle); ref_booster->Predict(num_iteration, predict_type, data_filename, data_has_header, config, result_filename); @@ -1561,8 +1529,6 @@ int LGBM_BoosterPredictForCSR(BoosterHandle handle, omp_set_num_threads(config.num_threads); } - AdditionalConfig(&config); - Booster* ref_booster = reinterpret_cast(handle); auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, nelem); int nrow = static_cast(nindptr - 1); @@ -1598,8 +1564,6 @@ int LGBM_BoosterPredictForCSRSingleRow(BoosterHandle handle, omp_set_num_threads(config.num_threads); } - AdditionalConfig(&config); - Booster* ref_booster = reinterpret_cast(handle); auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, nelem); ref_booster->PredictSingleRow(num_iteration, predict_type, static_cast(num_col), get_row_fun, config, out_result, out_len); @@ -1630,8 +1594,6 @@ int LGBM_BoosterPredictForCSC(BoosterHandle handle, omp_set_num_threads(config.num_threads); } - AdditionalConfig(&config); - int num_threads = OMP_NUM_THREADS(); int ncol = static_cast(ncol_ptr - 1); std::vector> iterators(num_threads, std::vector()); @@ -1677,8 +1639,6 @@ int LGBM_BoosterPredictForMat(BoosterHandle handle, omp_set_num_threads(config.num_threads); } - AdditionalConfig(&config); - Booster* ref_booster = reinterpret_cast(handle); auto get_row_fun = RowPairFunctionFromDenseMatric(data, nrow, ncol, data_type, is_row_major); ref_booster->Predict(num_iteration, predict_type, nrow, ncol, get_row_fun, @@ -1704,8 +1664,6 @@ int LGBM_BoosterPredictForMatSingleRow(BoosterHandle handle, omp_set_num_threads(config.num_threads); } - AdditionalConfig(&config); - Booster* ref_booster = reinterpret_cast(handle); auto get_row_fun = RowPairFunctionFromDenseMatric(data, 1, ncol, data_type, is_row_major); ref_booster->PredictSingleRow(num_iteration, predict_type, ncol, get_row_fun, config, out_result, out_len); @@ -1731,8 +1689,6 @@ int LGBM_BoosterPredictForMats(BoosterHandle handle, omp_set_num_threads(config.num_threads); } - AdditionalConfig(&config); - Booster* ref_booster = reinterpret_cast(handle); auto get_row_fun = RowPairFunctionFromDenseRows(data, ncol, data_type); ref_booster->Predict(num_iteration, predict_type, nrow, ncol, get_row_fun, config, out_result, out_len); diff --git a/src/io/config.cpp b/src/io/config.cpp index 4c65d158800..8312da591dd 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -8,6 +8,8 @@ #include #include +#include + #include namespace LightGBM { @@ -208,6 +210,11 @@ void Config::Set(const std::unordered_map& params) { GetMetricType(params, &metric); GetObjectiveType(params, &objective); GetDeviceType(params, &device_type); +#ifdef USE_CUDA + if (device_type == std::string("cuda")) { + LightGBM::LGBM_config_::current_device = lgbm_device_cuda; + } +#endif GetTreeLearnerType(params, &tree_learner); GetMembersFromString(params); diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 817480d5c50..44e7be3db92 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -336,9 +336,11 @@ void Dataset::Construct(std::vector>* bin_mappers, } auto features_in_group = NoGroup(used_features); + auto 
is_sparse = io_config.is_enable_sparse; #ifdef USE_CUDA if (io_config.device_type == std::string("cuda")) { LightGBM::LGBM_config_::current_device = lgbm_device_cuda; + is_sparse = false; } #endif @@ -349,7 +351,7 @@ void Dataset::Construct(std::vector>* bin_mappers, *bin_mappers, sample_non_zero_indices, sample_values, num_per_col, num_sample_col, static_cast(total_sample_cnt), used_features, num_data_, lgbm_is_gpu_used, - io_config.is_enable_sparse, &group_is_multi_val); + is_sparse, &group_is_multi_val); } num_features_ = 0; From 9ab44b66b1235fd458c82addb3daf5bbf996ba7e Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 27 Jul 2020 14:25:54 +0000 Subject: [PATCH 103/119] Limit number of GPUs to one for now in CUDA. --- include/LightGBM/config.h | 4 ---- src/io/config_auto.cpp | 5 ----- src/treelearner/cuda_tree_learner.cpp | 13 +++++-------- src/treelearner/cuda_tree_learner.h | 2 +- 4 files changed, 6 insertions(+), 18 deletions(-) diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index c24e9613a95..2a3335c1c0a 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -954,10 +954,6 @@ struct Config { // desc = set this to ``true`` to use double precision math on GPU (by default single precision is used) bool gpu_use_dp = false; - // check = >0 - // desc = number of gpus (CUDA implementation only) - int num_gpu = 1; - #pragma endregion #pragma endregion diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index b35e8da49ab..807cad78502 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -294,7 +294,6 @@ const std::unordered_set& Config::parameter_set() { "gpu_platform_id", "gpu_device_id", "gpu_use_dp", - "num_gpu", }); return params; } @@ -606,9 +605,6 @@ void Config::GetMembersFromString(const std::unordered_mapnum_feature_groups(); // Initialize GPU buffers and kernels & LGBM_CUDA: get device info - InitGPU(config_->num_gpu); + InitGPU(); } // some functions used for debugging the GPU histogram construction @@ -435,7 +435,7 @@ void CUDATreeLearner::copyDenseFeature() { // LGBM_CUDA: InitGPU w/ num_gpu -void CUDATreeLearner::InitGPU(int num_gpu) { +void CUDATreeLearner::InitGPU() { // Get the max bin size, used for selecting best GPU kernel max_num_bin_ = 0; @@ -481,13 +481,10 @@ void CUDATreeLearner::InitGPU(int num_gpu) { // LGBM_CUDA: get num_dense_feature_groups_ CountDenseFeatureGroups(); - if (num_gpu > num_dense_feature_groups_) num_gpu = num_dense_feature_groups_; - // LGBM_CUDA: initialize GPU - int gpu_count; - - CUDASUCCESS_OR_FATAL(cudaGetDeviceCount(&gpu_count)); - num_gpu_ = (gpu_count < num_gpu)? gpu_count : num_gpu; + CUDASUCCESS_OR_FATAL(cudaGetDeviceCount(&num_gpu_)); + if (num_gpu_ > 1) num_gpu_ = 1; + if (num_gpu_ > num_dense_feature_groups_) num_gpu_ = num_dense_feature_groups_; // LGBM_CUDA: set cpu threads cpu_threads_ = reinterpret_cast(malloc(sizeof(pthread_t *)*num_gpu_)); diff --git a/src/treelearner/cuda_tree_learner.h b/src/treelearner/cuda_tree_learner.h index cc6ad806f85..5f0111015c9 100644 --- a/src/treelearner/cuda_tree_learner.h +++ b/src/treelearner/cuda_tree_learner.h @@ -77,7 +77,7 @@ class CUDATreeLearner: public SerialTreeLearner { * \brief Initialize GPU device * \LGBM_CUDA: param num_gpu: number of maximum gpus */ - void InitGPU(int num_gpu); + void InitGPU(); /*! 
* \brief Allocate memory for GPU computation // LGBM_CUDA: alloc only From 9f8a01192fa8ead4db04cb42e8afe168cc43e846 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 27 Jul 2020 15:19:58 +0000 Subject: [PATCH 104/119] Update Parameters.rst for previous check-in --- docs/Parameters.rst | 4 ---- 1 file changed, 4 deletions(-) diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 5bd392d82f8..01362fb9af3 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -1104,10 +1104,6 @@ GPU Parameters - set this to ``true`` to use double precision math on GPU (by default single precision is used) -- ``num_gpu`` :raw-html:`🔗︎`, default = ``1``, type = int, constraints: ``num_gpu > 0`` - - - number of gpus (CUDA implementation only) - .. end params list Others From 5369a8a4738d3a7b5b80a70e44240234bc64c100 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 3 Aug 2020 15:26:22 +0000 Subject: [PATCH 105/119] Whitespace removal. --- src/c_api.cpp | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/src/c_api.cpp b/src/c_api.cpp index 3a0fb59d7a3..0b319cf828d 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -118,7 +118,6 @@ class Booster { if (config_.num_threads > 0) { omp_set_num_threads(config_.num_threads); } - // create boosting if (config_.input_model.size() > 0) { Log::Warning("Continued train from model is not supported for c_api,\n" @@ -891,7 +890,6 @@ int LGBM_DatasetCreateFromFile(const char* filename, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } - DatasetLoader loader(config, nullptr, 1, filename); if (reference == nullptr) { if (Network::num_machines() == 1) { @@ -922,7 +920,6 @@ int LGBM_DatasetCreateFromSampledColumn(double** sample_data, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } - DatasetLoader loader(config, nullptr, 1, nullptr); *out = loader.ConstructFromSampleData(sample_data, sample_indices, ncol, num_per_col, num_sample_row, @@ -1034,7 +1031,6 @@ int LGBM_DatasetCreateFromMats(int32_t nmat, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } - std::unique_ptr ret; int32_t total_nrow = 0; for (int j = 0; j < nmat; ++j) { @@ -1126,7 +1122,6 @@ int LGBM_DatasetCreateFromCSR(const void* indptr, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } - std::unique_ptr ret; auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, nelem); int32_t nrow = static_cast(nindptr - 1); @@ -1194,7 +1189,6 @@ int LGBM_DatasetCreateFromCSRFunc(void* get_row_funptr, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } - std::unique_ptr ret; int32_t nrow = num_rows; if (reference == nullptr) { @@ -1266,7 +1260,6 @@ int LGBM_DatasetCreateFromCSC(const void* col_ptr, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } - std::unique_ptr ret; int32_t nrow = static_cast(num_row); if (reference == nullptr) { @@ -1350,7 +1343,6 @@ int LGBM_DatasetGetSubset( if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } - auto full_dataset = reinterpret_cast(handle); CHECK_GT(num_used_row_indices, 0); const int32_t lower = 0; @@ -1746,7 +1738,6 @@ int LGBM_BoosterPredictForFile(BoosterHandle handle, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } - Booster* ref_booster = reinterpret_cast(handle); ref_booster->Predict(num_iteration, predict_type, data_filename, data_has_header, config, result_filename); @@ -1821,7 +1812,6 @@ int LGBM_BoosterPredictForCSR(BoosterHandle 
handle, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } - Booster* ref_booster = reinterpret_cast(handle); auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, nelem); int nrow = static_cast(nindptr - 1); @@ -1941,7 +1931,6 @@ int LGBM_BoosterPredictForCSRSingleRow(BoosterHandle handle, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } - Booster* ref_booster = reinterpret_cast(handle); auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, nelem); ref_booster->SetSingleRowPredictor(num_iteration, predict_type, config); @@ -2017,7 +2006,6 @@ int LGBM_BoosterPredictForCSC(BoosterHandle handle, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } - int num_threads = OMP_NUM_THREADS(); int ncol = static_cast(ncol_ptr - 1); std::vector> iterators(num_threads, std::vector()); @@ -2062,7 +2050,6 @@ int LGBM_BoosterPredictForMat(BoosterHandle handle, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } - Booster* ref_booster = reinterpret_cast(handle); auto get_row_fun = RowPairFunctionFromDenseMatric(data, nrow, ncol, data_type, is_row_major); ref_booster->Predict(num_iteration, predict_type, nrow, ncol, get_row_fun, @@ -2087,7 +2074,6 @@ int LGBM_BoosterPredictForMatSingleRow(BoosterHandle handle, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } - Booster* ref_booster = reinterpret_cast(handle); auto get_row_fun = RowPairFunctionFromDenseMatric(data, 1, ncol, data_type, is_row_major); ref_booster->SetSingleRowPredictor(num_iteration, predict_type, config); @@ -2149,7 +2135,6 @@ int LGBM_BoosterPredictForMats(BoosterHandle handle, if (config.num_threads > 0) { omp_set_num_threads(config.num_threads); } - Booster* ref_booster = reinterpret_cast(handle); auto get_row_fun = RowPairFunctionFromDenseRows(data, ncol, data_type); ref_booster->Predict(num_iteration, predict_type, nrow, ncol, get_row_fun, config, out_result, out_len); From 51e096cf18dd74918b1cdf944a4627fed571f86b Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 3 Aug 2020 19:31:53 +0000 Subject: [PATCH 106/119] Cleanup unused code. --- include/LightGBM/application.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/LightGBM/application.h b/include/LightGBM/application.h index 3fda4a1c32e..66541ec006c 100644 --- a/include/LightGBM/application.h +++ b/include/LightGBM/application.h @@ -36,9 +36,6 @@ class Application { /*! \brief To call this function to run application*/ inline void Run(); - /*! \brief call to get configuration */ - Config GetConfig() {return config_ ;} - private: /*! \brief Load parameters from command line and config file*/ void LoadParameters(int argc, char** argv); From 9ca091b97fade540b5e188f0e331013a8c9db03f Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Wed, 5 Aug 2020 15:21:11 +0000 Subject: [PATCH 107/119] Changed uint/ushort/ulong to unsigned int/short/long to help Windows based CUDA compiler work. 
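The subject above points at a portability detail: the aliases uint, ushort and ulong are predefined by OpenCL C but not by MSVC, so the CUDA sources now spell out the built-in unsigned types. A minimal illustration of the same idea with <cstdint> fixed-width aliases follows; the alias names are examples, not identifiers from the patch. Note that unsigned long is only 32 bits on 64-bit Windows, which is why the 64-bit case has to become unsigned long long.

#include <cstdint>

// hist_count_t, bin_index_t and packed_acc_t are illustrative names for the roles
// that "uint", "ushort" and "ulong" played in the kernels before this change.
using hist_count_t = std::uint32_t;   // counter-histogram entries
using bin_index_t  = std::uint16_t;   // per-thread bin indices
using packed_acc_t = std::uint64_t;   // bit-reinterpreted 64-bit accumulators

static_assert(sizeof(hist_count_t) == 4, "counters are 32-bit on every platform");
static_assert(sizeof(bin_index_t) == 2, "bin indices are 16-bit on every platform");
static_assert(sizeof(packed_acc_t) == 8, "accumulators stay 64-bit, unlike unsigned long on Windows");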
--- src/treelearner/cuda_kernel_launcher.cu | 48 ++--- src/treelearner/cuda_tree_learner.cpp | 2 +- src/treelearner/cuda_tree_learner.h | 2 +- .../kernels/histogram_16_64_256.cu | 180 +++++++++--------- .../kernels/histogram_16_64_256.hu | 20 +- 5 files changed, 126 insertions(+), 126 deletions(-) diff --git a/src/treelearner/cuda_kernel_launcher.cu b/src/treelearner/cuda_kernel_launcher.cu index 8b243200878..218be6d72b9 100644 --- a/src/treelearner/cuda_kernel_launcher.cu +++ b/src/treelearner/cuda_kernel_launcher.cu @@ -34,20 +34,20 @@ void cuda_histogram( if (use_all_features) { if (!is_constant_hessian) histogram16<<<16*num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else histogram16<<<16*num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } else { if (!is_constant_hessian) histogram16_fulldata<<<16*num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else histogram16_fulldata<<<16*num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } } else { @@ -55,20 +55,20 @@ void cuda_histogram( // seems all features is always enabled, so this should be the same as fulldata if (!is_constant_hessian) histogram16<<<16*num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else histogram16<<<16*num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } else { if (!is_constant_hessian) histogram16<<<16*num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else histogram16<<<16*num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } } @@ -77,20 +77,20 @@ void cuda_histogram( if (use_all_features) { if (!is_constant_hessian) histogram64<<<4*num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else histogram64<<<4*num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } else { if (!is_constant_hessian) histogram64_fulldata<<<4*num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else histogram64_fulldata<<<4*num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, 
arg5, + reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } } else { @@ -98,20 +98,20 @@ void cuda_histogram( // seems all features is always enabled, so this should be the same as fulldata if (!is_constant_hessian) histogram64<<<4*num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else histogram64<<<4*num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } else { if (!is_constant_hessian) histogram64<<<4*num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else histogram64<<<4*num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } } @@ -120,20 +120,20 @@ void cuda_histogram( if (use_all_features) { if (!is_constant_hessian) histogram256<<>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else histogram256<<>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } else { if (!is_constant_hessian) histogram256_fulldata<<>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else histogram256_fulldata<<>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } } else { @@ -141,20 +141,20 @@ void cuda_histogram( // seems all features is always enabled, so this should be the same as fulldata if (!is_constant_hessian) histogram256<<>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else histogram256<<>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } else { if (!is_constant_hessian) histogram256<<>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + reinterpret_cast(arg3), arg4, arg5, static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else histogram256<<>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + reinterpret_cast(arg3), arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } } diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index b4e41acc7fc..067c06635af 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -36,7 +36,7 @@ static void *launch_cuda_histogram(void *thread_data) { td.device_features, td.device_feature_masks, td.num_data, - reinterpret_cast(td.device_data_indices), + reinterpret_cast(td.device_data_indices), td.leaf_num_data, td.device_gradients, td.device_hessians, 
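// Simplified sketch, not part of the patch: how cuda_histogram() above sizes its
// launches. The block size always equals the bin count, and the grid is scaled so
// that blocks * threads stays at 256 * num_workgroups for 16-, 64- and 256-bin
// histograms alike. histogram_sketch and launch_histogram_sketch are illustrative.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void histogram_sketch(int num_bins) {
  if (blockIdx.x == 0 && threadIdx.x == 0) {
    printf("launched with %d threads per block\n", num_bins);  // stand-in for the real kernels
  }
}

static void launch_histogram_sketch(int histogram_size, int num_workgroups, cudaStream_t stream) {
  const int block = histogram_size;                           // 16, 64 or 256 threads per block
  const int grid  = (256 / histogram_size) * num_workgroups;  // keeps the total thread count constant
  histogram_sketch<<<grid, block, 0, stream>>>(histogram_size);
}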
td.hessians_const, diff --git a/src/treelearner/cuda_tree_learner.h b/src/treelearner/cuda_tree_learner.h index 5f0111015c9..1506c5cf21b 100644 --- a/src/treelearner/cuda_tree_learner.h +++ b/src/treelearner/cuda_tree_learner.h @@ -118,7 +118,7 @@ class CUDATreeLearner: public SerialTreeLearner { td->stream = stream_[device_id]; td->device_features = device_features_[device_id]; td->device_feature_masks = reinterpret_cast(device_feature_masks_[device_id]); - td->device_data_indices = reinterpret_cast(device_data_indices_[device_id]); + td->device_data_indices = reinterpret_cast(device_data_indices_[device_id]); td->device_gradients = device_gradients_[device_id]; td->device_hessians = device_hessians_[device_id]; td->hessians_const = hessians_[0]; diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index 09d563cbaf4..d156c872ec8 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -29,29 +29,29 @@ inline __device__ void atomic_local_add_f(acc_type *addr, const acc_type val) { #define KERNEL_NAME histogram16 #endif // ENABLE_ALL_FEATURES #define NUM_BINS 16 -#define LOCAL_MEM_SIZE ((sizeof(uint) + 2 * sizeof(acc_type)) * NUM_BINS) +#define LOCAL_MEM_SIZE ((sizeof(unsigned int) + 2 * sizeof(acc_type)) * NUM_BINS) // this function will be called by histogram16 // we have one sub-histogram of one feature in local memory, and need to read others inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ feature_sub_hist, - const uint skip_id, - const uint old_val_cont_bin0, - const ushort num_sub_hist, + const unsigned int skip_id, + const unsigned int old_val_cont_bin0, + const unsigned short num_sub_hist, acc_type* __restrict__ output_buf, acc_type* __restrict__ local_hist, const size_t power_feature_workgroups) { - const ushort ltid = threadIdx.x; + const unsigned short ltid = threadIdx.x; acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; - uint* __restrict__ local_cnt = reinterpret_cast(local_hist + 2 * NUM_BINS); + unsigned int* __restrict__ local_cnt = reinterpret_cast(local_hist + 2 * NUM_BINS); - uint cont_bin; + unsigned int cont_bin; if (power_feature_workgroups != 0) { cont_bin = ltid ? 
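// Host-side sketch, not part of the patch: the per-workgroup shared-memory layout used
// by these kernels, i.e. 2 * NUM_BINS (gradient, hessian) accumulators followed by
// NUM_BINS unsigned int counters inside one float2-aligned buffer. The constants below
// assume the 256-bin kernel with acc_type == double (the gpu_use_dp build).
#include <cstddef>
#include <cstdio>

int main() {
  using acc_type = double;
  const int kNumBins = 256;
  const std::size_t local_mem_size = (sizeof(unsigned int) + 2 * sizeof(acc_type)) * kNumBins;
  const std::size_t gh_bytes  = 2 * kNumBins * sizeof(acc_type);   // gradient/hessian histogram
  const std::size_t cnt_bytes = kNumBins * sizeof(unsigned int);   // counter histogram
  std::printf("gh_hist %zu B + cnt_hist %zu B = %zu B of shared memory per block\n",
              gh_bytes, cnt_bytes, local_mem_size);
  return (gh_bytes + cnt_bytes == local_mem_size) ? 0 : 1;
}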
local_cnt[ltid] : old_val_cont_bin0; } else { cont_bin = local_cnt[ltid]; } - ushort i; + unsigned short i; if (power_feature_workgroups != 0) { // add all sub-histograms for feature @@ -113,15 +113,15 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // allocate the local memory array aligned with float2, to guarantee correct alignment on NVIDIA platforms // otherwise a "Misaligned Address" exception may occur __shared__ float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)]; - const uint gtid = blockIdx.x * blockDim.x + threadIdx.x; - const ushort ltid = threadIdx.x; - const ushort lsize = NUM_BINS; // get_local_size(0); - const ushort group_id = blockIdx.x; + const unsigned int gtid = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned short ltid = threadIdx.x; + const unsigned short lsize = NUM_BINS; // get_local_size(0); + const unsigned short group_id = blockIdx.x; // local memory per workgroup is 3 KB // clear local memory - uint *ptr = reinterpret_cast(shared_array); - for (int i = ltid; i < LOCAL_MEM_SIZE/sizeof(uint); i += lsize) { + unsigned int *ptr = reinterpret_cast(shared_array); + for (int i = ltid; i < LOCAL_MEM_SIZE/sizeof(unsigned int); i += lsize) { ptr[i] = 0; } __syncthreads(); @@ -133,25 +133,25 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type *gh_hist = reinterpret_cast(shared_array); // counter histogram - // total size: 256 * size_of(uint) = 1 KB - uint *cnt_hist = reinterpret_cast(gh_hist + 2 * NUM_BINS); + // total size: 256 * size_of(unsigned int) = 1 KB + unsigned int *cnt_hist = reinterpret_cast(gh_hist + 2 * NUM_BINS); // odd threads (1, 3, ...) compute histograms for hessians first // even thread (0, 2, ...) compute histograms for gradients first // etc. uchar is_hessian_first = ltid & 1; - ushort feature_id = group_id >> power_feature_workgroups; + unsigned short feature_id = group_id >> power_feature_workgroups; // each 2^POWER_FEATURE_WORKGROUPS workgroups process on one feature (compile-time constant) // feature_size is the number of examples per feature const uchar *feature_data = feature_data_base + feature_id * feature_size; // size of threads that process this feature4 - const uint subglobal_size = lsize * (1 << power_feature_workgroups); + const unsigned int subglobal_size = lsize * (1 << power_feature_workgroups); // equavalent thread ID in this subgroup for this feature4 - const uint subglobal_tid = gtid - feature_id * subglobal_size; + const unsigned int subglobal_tid = gtid - feature_id * subglobal_size; data_size_t ind; @@ -177,7 +177,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, uchar feature; uchar feature_next; // uint8_t bin; - ushort bin; + unsigned short bin; feature = feature_data[ind >> feature_mask]; if (feature_mask) { @@ -197,7 +197,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #endif // there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4 - for (uint i = subglobal_tid; i < num_data; i += subglobal_size) { + for (unsigned int i = subglobal_tid; i < num_data; i += subglobal_size) { // prefetch the next iteration variables // we don't need bondary check because we have made the buffer large int i_next = i + subglobal_size; @@ -280,22 +280,22 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type *__restrict__ output = reinterpret_cast(output_buf) + group_id * 3 * NUM_BINS; // write gradients and hessians acc_type *__restrict__ ptr_f = output; - for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { + for (unsigned short i = 
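// Standalone sketch, not part of the patch: the indexing scheme used in the kernel
// above. 2^power_feature_workgroups consecutive blocks cooperate on one feature, so a
// block recovers its feature id with a shift and then strides across that feature's
// rows together with its sibling blocks. The kernel below only records how many rows
// each block would visit; touched is an illustrative output array with one slot per block.
#include <cuda_runtime.h>

__global__ void feature_stride_sketch(int power_feature_workgroups,
                                      unsigned int num_data,
                                      unsigned int* touched) {
  const unsigned int gtid = blockIdx.x * blockDim.x + threadIdx.x;
  const unsigned int feature_id = blockIdx.x >> power_feature_workgroups;
  const unsigned int subglobal_size = blockDim.x * (1u << power_feature_workgroups);
  const unsigned int subglobal_tid = gtid - feature_id * subglobal_size;
  unsigned int rows = 0;
  for (unsigned int i = subglobal_tid; i < num_data; i += subglobal_size) {
    ++rows;  // the real kernels read feature_data[i] here and update the shared histogram
  }
  atomicAdd(&touched[blockIdx.x], rows);
}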
ltid; i < 2 * NUM_BINS; i += lsize) { // even threads read gradients, odd threads read hessians acc_type value = gh_hist[i]; ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; } // write counts acc_int_type *__restrict__ ptr_i = reinterpret_cast(output + 2 * NUM_BINS); - for (ushort i = ltid; i < NUM_BINS; i += lsize) { - uint value = cnt_hist[i]; + for (unsigned short i = ltid; i < NUM_BINS; i += lsize) { + unsigned int value = cnt_hist[i]; ptr_i[i] = value; } __syncthreads(); __threadfence(); - uint * counter_val = cnt_hist; + unsigned int * counter_val = cnt_hist; // backup the old value - uint old_val = *counter_val; + unsigned int old_val = *counter_val; if (ltid == 0) { // all workgroups processing the same feature add this counter *counter_val = atomicAdd(const_cast(sync_counters + feature_id), 1); @@ -313,15 +313,15 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // only 1 work group, no need to increase counter // the reduction will become a simple copy if (1) { - uint old_val; // dummy + unsigned int old_val; // dummy #endif // locate our feature's block in output memory - uint output_offset = (feature_id << power_feature_workgroups); + unsigned int output_offset = (feature_id << power_feature_workgroups); acc_type const * __restrict__ feature_subhists = reinterpret_cast(output_buf) + output_offset * 3 * NUM_BINS; // skip reading the data already in local memory - // uint skip_id = feature_id ^ output_offset; - uint skip_id = group_id - output_offset; + // unsigned int skip_id = feature_id ^ output_offset; + unsigned int skip_id = group_id - output_offset; // locate output histogram location for this feature4 acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS; @@ -347,29 +347,29 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #define KERNEL_NAME histogram64 #endif // ENABLE_ALL_FEATURES #define NUM_BINS 64 -#define LOCAL_MEM_SIZE ((sizeof(uint) + 2 * sizeof(acc_type)) * NUM_BINS) +#define LOCAL_MEM_SIZE ((sizeof(unsigned int) + 2 * sizeof(acc_type)) * NUM_BINS) // this function will be called by histogram64 // we have one sub-histogram of one feature in local memory, and need to read others inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ feature_sub_hist, - const uint skip_id, - const uint old_val_cont_bin0, - const ushort num_sub_hist, + const unsigned int skip_id, + const unsigned int old_val_cont_bin0, + const unsigned short num_sub_hist, acc_type* __restrict__ output_buf, acc_type* __restrict__ local_hist, const size_t power_feature_workgroups) { - const ushort ltid = threadIdx.x; + const unsigned short ltid = threadIdx.x; acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; - uint* __restrict__ local_cnt = reinterpret_cast(local_hist + 2 * NUM_BINS); + unsigned int* __restrict__ local_cnt = reinterpret_cast(local_hist + 2 * NUM_BINS); - uint cont_bin; + unsigned int cont_bin; if (power_feature_workgroups != 0) { cont_bin = ltid ? 
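// Condensed sketch, not part of the patch: the inter-block synchronisation pattern used
// above. Each block flushes its partial histogram to global memory, then atomically
// takes a ticket from a per-feature counter; the block that draws the last ticket is the
// one that performs the final reduction. Names below are illustrative, and the "partial
// histogram" is shrunk to a single float per block to keep the sketch short.
#include <cuda_runtime.h>

__global__ void last_block_reduces(float* partial_sums,         // one partial result per block
                                   unsigned int* sync_counter,  // zero-initialised before launch
                                   float* result) {
  if (threadIdx.x == 0) {
    partial_sums[blockIdx.x] = static_cast<float>(blockIdx.x);  // stand-in partial result
    __threadfence();                      // make it visible to every other block
    const unsigned int ticket = atomicAdd(sync_counter, 1u);
    if (ticket == gridDim.x - 1) {        // last block to finish does the reduction
      float sum = 0.0f;
      for (unsigned int b = 0; b < gridDim.x; ++b) sum += partial_sums[b];
      *result = sum;
    }
  }
}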
local_cnt[ltid] : old_val_cont_bin0; } else { cont_bin = local_cnt[ltid]; } - ushort i; + unsigned short i; if (power_feature_workgroups != 0) { // add all sub-histograms for feature @@ -431,15 +431,15 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // allocate the local memory array aligned with float2, to guarantee correct alignment on NVIDIA platforms // otherwise a "Misaligned Address" exception may occur __shared__ float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)]; - const uint gtid = blockIdx.x * blockDim.x + threadIdx.x; - const ushort ltid = threadIdx.x; - const ushort lsize = NUM_BINS; // get_local_size(0); - const ushort group_id = blockIdx.x; + const unsigned int gtid = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned short ltid = threadIdx.x; + const unsigned short lsize = NUM_BINS; // get_local_size(0); + const unsigned short group_id = blockIdx.x; // local memory per workgroup is 3 KB // clear local memory - uint *ptr = reinterpret_cast(shared_array); - for (int i = ltid; i < LOCAL_MEM_SIZE/sizeof(uint); i += lsize) { + unsigned int *ptr = reinterpret_cast(shared_array); + for (int i = ltid; i < LOCAL_MEM_SIZE/sizeof(unsigned int); i += lsize) { ptr[i] = 0; } __syncthreads(); @@ -451,25 +451,25 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type *gh_hist = reinterpret_cast(shared_array); // counter histogram - // total size: 256 * size_of(uint) = 1 KB - uint *cnt_hist = reinterpret_cast(gh_hist + 2 * NUM_BINS); + // total size: 256 * size_of(unsigned int) = 1 KB + unsigned int *cnt_hist = reinterpret_cast(gh_hist + 2 * NUM_BINS); // odd threads (1, 3, ...) compute histograms for hessians first // even thread (0, 2, ...) compute histograms for gradients first // etc. uchar is_hessian_first = ltid & 1; - ushort feature_id = group_id >> power_feature_workgroups; + unsigned short feature_id = group_id >> power_feature_workgroups; // each 2^POWER_FEATURE_WORKGROUPS workgroups process on one feature (compile-time constant) // feature_size is the number of examples per feature const uchar *feature_data = feature_data_base + feature_id * feature_size; // size of threads that process this feature4 - const uint subglobal_size = lsize * (1 << power_feature_workgroups); + const unsigned int subglobal_size = lsize * (1 << power_feature_workgroups); // equavalent thread ID in this subgroup for this feature4 - const uint subglobal_tid = gtid - feature_id * subglobal_size; + const unsigned int subglobal_tid = gtid - feature_id * subglobal_size; data_size_t ind; data_size_t ind_next; @@ -494,7 +494,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, uchar feature; uchar feature_next; // uint8_t bin; - ushort bin; + unsigned short bin; feature = feature_data[ind >> feature_mask]; if (feature_mask) { @@ -514,7 +514,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #endif // there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4 - for (uint i = subglobal_tid; i < num_data; i += subglobal_size) { + for (unsigned int i = subglobal_tid; i < num_data; i += subglobal_size) { // prefetch the next iteration variables // we don't need bondary check because we have made the buffer large int i_next = i + subglobal_size; @@ -596,22 +596,22 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type *__restrict__ output = reinterpret_cast(output_buf) + group_id * 3 * NUM_BINS; // write gradients and hessians acc_type *__restrict__ ptr_f = output; - for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { + for 
(unsigned short i = ltid; i < 2 * NUM_BINS; i += lsize) { // even threads read gradients, odd threads read hessians acc_type value = gh_hist[i]; ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; } // write counts acc_int_type *__restrict__ ptr_i = reinterpret_cast(output + 2 * NUM_BINS); - for (ushort i = ltid; i < NUM_BINS; i += lsize) { - uint value = cnt_hist[i]; + for (unsigned short i = ltid; i < NUM_BINS; i += lsize) { + unsigned int value = cnt_hist[i]; ptr_i[i] = value; } __syncthreads(); __threadfence(); - uint * counter_val = cnt_hist; + unsigned int * counter_val = cnt_hist; // backup the old value - uint old_val = *counter_val; + unsigned int old_val = *counter_val; if (ltid == 0) { // all workgroups processing the same feature add this counter *counter_val = atomicAdd(const_cast(sync_counters + feature_id), 1); @@ -629,15 +629,15 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // only 1 work group, no need to increase counter // the reduction will become a simple copy if (1) { - uint old_val; // dummy + unsigned int old_val; // dummy #endif // locate our feature's block in output memory - uint output_offset = (feature_id << power_feature_workgroups); + unsigned int output_offset = (feature_id << power_feature_workgroups); acc_type const * __restrict__ feature_subhists = reinterpret_cast(output_buf) + output_offset * 3 * NUM_BINS; // skip reading the data already in local memory - // uint skip_id = feature_id ^ output_offset; - uint skip_id = group_id - output_offset; + // unsigned int skip_id = feature_id ^ output_offset; + unsigned int skip_id = group_id - output_offset; // locate output histogram location for this feature4 acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS; @@ -663,29 +663,29 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #define KERNEL_NAME histogram256 #endif // ENABLE_ALL_FEATURES #define NUM_BINS 256 -#define LOCAL_MEM_SIZE ((sizeof(uint) + 2 * sizeof(acc_type)) * NUM_BINS) +#define LOCAL_MEM_SIZE ((sizeof(unsigned int) + 2 * sizeof(acc_type)) * NUM_BINS) // this function will be called by histogram256 // we have one sub-histogram of one feature in local memory, and need to read others inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ feature_sub_hist, - const uint skip_id, - const uint old_val_cont_bin0, - const ushort num_sub_hist, + const unsigned int skip_id, + const unsigned int old_val_cont_bin0, + const unsigned short num_sub_hist, acc_type* __restrict__ output_buf, acc_type* __restrict__ local_hist, const size_t power_feature_workgroups) { - const ushort ltid = threadIdx.x; + const unsigned short ltid = threadIdx.x; acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; - uint* __restrict__ local_cnt = reinterpret_cast(local_hist + 2 * NUM_BINS); + unsigned int* __restrict__ local_cnt = reinterpret_cast(local_hist + 2 * NUM_BINS); - uint cont_bin; + unsigned int cont_bin; if (power_feature_workgroups != 0) { cont_bin = ltid ? 
local_cnt[ltid] : old_val_cont_bin0; } else { cont_bin = local_cnt[ltid]; } - ushort i; + unsigned short i; if (power_feature_workgroups != 0) { // add all sub-histograms for feature @@ -748,15 +748,15 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // allocate the local memory array aligned with float2, to guarantee correct alignment on NVIDIA platforms // otherwise a "Misaligned Address" exception may occur __shared__ float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)]; - const uint gtid = blockIdx.x * blockDim.x + threadIdx.x; - const ushort ltid = threadIdx.x; - const ushort lsize = NUM_BINS; // get_local_size(0); - const ushort group_id = blockIdx.x; + const unsigned int gtid = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned short ltid = threadIdx.x; + const unsigned short lsize = NUM_BINS; // get_local_size(0); + const unsigned short group_id = blockIdx.x; // local memory per workgroup is 3 KB // clear local memory - uint *ptr = reinterpret_cast(shared_array); - for (int i = ltid; i < LOCAL_MEM_SIZE/sizeof(uint); i += lsize) { + unsigned int *ptr = reinterpret_cast(shared_array); + for (int i = ltid; i < LOCAL_MEM_SIZE/sizeof(unsigned int); i += lsize) { ptr[i] = 0; } __syncthreads(); @@ -768,25 +768,25 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type *gh_hist = reinterpret_cast(shared_array); // counter histogram - // total size: 256 * size_of(uint) = 1 KB - uint *cnt_hist = reinterpret_cast(gh_hist + 2 * NUM_BINS); + // total size: 256 * size_of(unsigned int) = 1 KB + unsigned int *cnt_hist = reinterpret_cast(gh_hist + 2 * NUM_BINS); // odd threads (1, 3, ...) compute histograms for hessians first // even thread (0, 2, ...) compute histograms for gradients first // etc. uchar is_hessian_first = ltid & 1; - ushort feature_id = group_id >> power_feature_workgroups; + unsigned short feature_id = group_id >> power_feature_workgroups; // each 2^POWER_FEATURE_WORKGROUPS workgroups process on one feature (compile-time constant) // feature_size is the number of examples per feature const uchar *feature_data = feature_data_base + feature_id * feature_size; // size of threads that process this feature4 - const uint subglobal_size = lsize * (1 << power_feature_workgroups); + const unsigned int subglobal_size = lsize * (1 << power_feature_workgroups); // equavalent thread ID in this subgroup for this feature4 - const uint subglobal_tid = gtid - feature_id * subglobal_size; + const unsigned int subglobal_tid = gtid - feature_id * subglobal_size; data_size_t ind; data_size_t ind_next; @@ -811,7 +811,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, uchar feature; uchar feature_next; // uint8_t bin; - ushort bin; + unsigned short bin; feature = feature_data[ind >> feature_mask]; if (feature_mask) { @@ -831,7 +831,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #endif // there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4 - for (uint i = subglobal_tid; i < num_data; i += subglobal_size) { + for (unsigned int i = subglobal_tid; i < num_data; i += subglobal_size) { // prefetch the next iteration variables // we don't need bondary check because we have made the buffer large int i_next = i + subglobal_size; @@ -913,22 +913,22 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type *__restrict__ output = reinterpret_cast(output_buf) + group_id * 3 * NUM_BINS; // write gradients and hessians acc_type *__restrict__ ptr_f = output; - for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { + for 
(unsigned short i = ltid; i < 2 * NUM_BINS; i += lsize) { // even threads read gradients, odd threads read hessians acc_type value = gh_hist[i]; ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; } // write counts acc_int_type *__restrict__ ptr_i = reinterpret_cast(output + 2 * NUM_BINS); - for (ushort i = ltid; i < NUM_BINS; i += lsize) { - uint value = cnt_hist[i]; + for (unsigned short i = ltid; i < NUM_BINS; i += lsize) { + unsigned int value = cnt_hist[i]; ptr_i[i] = value; } __syncthreads(); __threadfence(); - uint * counter_val = cnt_hist; + unsigned int * counter_val = cnt_hist; // backup the old value - uint old_val = *counter_val; + unsigned int old_val = *counter_val; if (ltid == 0) { // all workgroups processing the same feature add this counter *counter_val = atomicAdd(const_cast(sync_counters + feature_id), 1); @@ -946,15 +946,15 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // only 1 work group, no need to increase counter // the reduction will become a simple copy if (1) { - uint old_val; // dummy + unsigned int old_val; // dummy #endif // locate our feature's block in output memory - uint output_offset = (feature_id << power_feature_workgroups); + unsigned int output_offset = (feature_id << power_feature_workgroups); acc_type const * __restrict__ feature_subhists = reinterpret_cast(output_buf) + output_offset * 3 * NUM_BINS; // skip reading the data already in local memory - // uint skip_id = feature_id ^ output_offset; - uint skip_id = group_id - output_offset; + // unsigned int skip_id = feature_id ^ output_offset; + unsigned int skip_id = group_id - output_offset; // locate output histogram location for this feature4 acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS; diff --git a/src/treelearner/kernels/histogram_16_64_256.hu b/src/treelearner/kernels/histogram_16_64_256.hu index e228d3b0068..8cc464dfb32 100644 --- a/src/treelearner/kernels/histogram_16_64_256.hu +++ b/src/treelearner/kernels/histogram_16_64_256.hu @@ -28,9 +28,9 @@ __device__ double as_double(const T t) { return d; } template -__device__ ulong as_ulong(const T t) { - static_assert(sizeof(T) == sizeof(ulong), "size mismatch"); - ulong u; +__device__ unsigned long long as_ulong_ulong(const T t) { + static_assert(sizeof(T) == sizeof(unsigned long long), "size mismatch"); + unsigned long long u; memcpy(&u, &t, sizeof(T)); return u; } @@ -42,9 +42,9 @@ __device__ float as_float(const T t) { return f; } template -__device__ uint as_uint(const T t) { - static_assert(sizeof(T) == sizeof(uint), "size_mismatch"); - uint u; +__device__ unsigned int as_uint(const T t) { + static_assert(sizeof(T) == sizeof(unsigned int), "size_mismatch"); + unsigned int u; memcpy(&u, &t, sizeof(T)); return u; } @@ -58,12 +58,12 @@ __device__ uchar4 as_uchar4(const T t) { #if USE_DP_FLOAT == 1 typedef double acc_type; -typedef ulong acc_int_type; +typedef unsigned long long acc_int_type; #define as_acc_type as_double -#define as_acc_int_type as_ulong +#define as_acc_int_type as_ulong_ulong #else typedef float acc_type; -typedef uint acc_int_type; +typedef unsigned int acc_int_type; #define as_acc_type as_float #define as_acc_int_type as_uint #endif @@ -73,7 +73,7 @@ typedef uint acc_int_type; #define ENABLE_ALL_FEATURES 1 #endif -typedef uint data_size_t; +typedef unsigned int data_size_t; typedef float score_t; // define all of the different kernels From 7fcecff7db1d19525636fd3bbb591385b2b72261 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Wed, 5 Aug 2020 16:47:57 +0000 Subject: [PATCH 
108/119] Lint change from previous check-in. --- .../kernels/histogram_16_64_256.cu | 64 ++++++++++--------- 1 file changed, 33 insertions(+), 31 deletions(-) diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index d156c872ec8..f57e8f9d838 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -4,7 +4,9 @@ */ #include "histogram_16_64_256.hu" -#include "stdio.h" + +#include +#include #define PRINT(b, t, fmt, ...) \ if (b == gtid && t == ltid) { \ @@ -36,11 +38,11 @@ inline __device__ void atomic_local_add_f(acc_type *addr, const acc_type val) { inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ feature_sub_hist, const unsigned int skip_id, const unsigned int old_val_cont_bin0, - const unsigned short num_sub_hist, + const uint16_t num_sub_hist, acc_type* __restrict__ output_buf, acc_type* __restrict__ local_hist, const size_t power_feature_workgroups) { - const unsigned short ltid = threadIdx.x; + const uint16_t ltid = threadIdx.x; acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; unsigned int* __restrict__ local_cnt = reinterpret_cast(local_hist + 2 * NUM_BINS); @@ -51,7 +53,7 @@ inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ } else { cont_bin = local_cnt[ltid]; } - unsigned short i; + uint16_t i; if (power_feature_workgroups != 0) { // add all sub-histograms for feature @@ -114,9 +116,9 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // otherwise a "Misaligned Address" exception may occur __shared__ float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)]; const unsigned int gtid = blockIdx.x * blockDim.x + threadIdx.x; - const unsigned short ltid = threadIdx.x; - const unsigned short lsize = NUM_BINS; // get_local_size(0); - const unsigned short group_id = blockIdx.x; + const uint16_t ltid = threadIdx.x; + const uint16_t lsize = NUM_BINS; // get_local_size(0); + const uint16_t group_id = blockIdx.x; // local memory per workgroup is 3 KB // clear local memory @@ -141,7 +143,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // etc. 
uchar is_hessian_first = ltid & 1; - unsigned short feature_id = group_id >> power_feature_workgroups; + uint16_t feature_id = group_id >> power_feature_workgroups; // each 2^POWER_FEATURE_WORKGROUPS workgroups process on one feature (compile-time constant) // feature_size is the number of examples per feature @@ -177,7 +179,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, uchar feature; uchar feature_next; // uint8_t bin; - unsigned short bin; + uint16_t bin; feature = feature_data[ind >> feature_mask]; if (feature_mask) { @@ -280,14 +282,14 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type *__restrict__ output = reinterpret_cast(output_buf) + group_id * 3 * NUM_BINS; // write gradients and hessians acc_type *__restrict__ ptr_f = output; - for (unsigned short i = ltid; i < 2 * NUM_BINS; i += lsize) { + for (uint16_t i = ltid; i < 2 * NUM_BINS; i += lsize) { // even threads read gradients, odd threads read hessians acc_type value = gh_hist[i]; ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; } // write counts acc_int_type *__restrict__ ptr_i = reinterpret_cast(output + 2 * NUM_BINS); - for (unsigned short i = ltid; i < NUM_BINS; i += lsize) { + for (uint16_t i = ltid; i < NUM_BINS; i += lsize) { unsigned int value = cnt_hist[i]; ptr_i[i] = value; } @@ -354,11 +356,11 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ feature_sub_hist, const unsigned int skip_id, const unsigned int old_val_cont_bin0, - const unsigned short num_sub_hist, + const uint16_t num_sub_hist, acc_type* __restrict__ output_buf, acc_type* __restrict__ local_hist, const size_t power_feature_workgroups) { - const unsigned short ltid = threadIdx.x; + const uint16_t ltid = threadIdx.x; acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; unsigned int* __restrict__ local_cnt = reinterpret_cast(local_hist + 2 * NUM_BINS); @@ -369,7 +371,7 @@ inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ } else { cont_bin = local_cnt[ltid]; } - unsigned short i; + uint16_t i; if (power_feature_workgroups != 0) { // add all sub-histograms for feature @@ -432,9 +434,9 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // otherwise a "Misaligned Address" exception may occur __shared__ float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)]; const unsigned int gtid = blockIdx.x * blockDim.x + threadIdx.x; - const unsigned short ltid = threadIdx.x; - const unsigned short lsize = NUM_BINS; // get_local_size(0); - const unsigned short group_id = blockIdx.x; + const uint16_t ltid = threadIdx.x; + const uint16_t lsize = NUM_BINS; // get_local_size(0); + const uint16_t group_id = blockIdx.x; // local memory per workgroup is 3 KB // clear local memory @@ -459,7 +461,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // etc. 
uchar is_hessian_first = ltid & 1; - unsigned short feature_id = group_id >> power_feature_workgroups; + uint16_t feature_id = group_id >> power_feature_workgroups; // each 2^POWER_FEATURE_WORKGROUPS workgroups process on one feature (compile-time constant) // feature_size is the number of examples per feature @@ -494,7 +496,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, uchar feature; uchar feature_next; // uint8_t bin; - unsigned short bin; + uint16_t bin; feature = feature_data[ind >> feature_mask]; if (feature_mask) { @@ -596,14 +598,14 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type *__restrict__ output = reinterpret_cast(output_buf) + group_id * 3 * NUM_BINS; // write gradients and hessians acc_type *__restrict__ ptr_f = output; - for (unsigned short i = ltid; i < 2 * NUM_BINS; i += lsize) { + for (uint16_t i = ltid; i < 2 * NUM_BINS; i += lsize) { // even threads read gradients, odd threads read hessians acc_type value = gh_hist[i]; ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; } // write counts acc_int_type *__restrict__ ptr_i = reinterpret_cast(output + 2 * NUM_BINS); - for (unsigned short i = ltid; i < NUM_BINS; i += lsize) { + for (uint16_t i = ltid; i < NUM_BINS; i += lsize) { unsigned int value = cnt_hist[i]; ptr_i[i] = value; } @@ -670,11 +672,11 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ feature_sub_hist, const unsigned int skip_id, const unsigned int old_val_cont_bin0, - const unsigned short num_sub_hist, + const uint16_t num_sub_hist, acc_type* __restrict__ output_buf, acc_type* __restrict__ local_hist, const size_t power_feature_workgroups) { - const unsigned short ltid = threadIdx.x; + const uint16_t ltid = threadIdx.x; acc_type grad_bin = local_hist[ltid * 2]; acc_type hess_bin = local_hist[ltid * 2 + 1]; unsigned int* __restrict__ local_cnt = reinterpret_cast(local_hist + 2 * NUM_BINS); @@ -685,7 +687,7 @@ inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ } else { cont_bin = local_cnt[ltid]; } - unsigned short i; + uint16_t i; if (power_feature_workgroups != 0) { // add all sub-histograms for feature @@ -749,9 +751,9 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // otherwise a "Misaligned Address" exception may occur __shared__ float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)]; const unsigned int gtid = blockIdx.x * blockDim.x + threadIdx.x; - const unsigned short ltid = threadIdx.x; - const unsigned short lsize = NUM_BINS; // get_local_size(0); - const unsigned short group_id = blockIdx.x; + const uint16_t ltid = threadIdx.x; + const uint16_t lsize = NUM_BINS; // get_local_size(0); + const uint16_t group_id = blockIdx.x; // local memory per workgroup is 3 KB // clear local memory @@ -776,7 +778,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // etc. 
uchar is_hessian_first = ltid & 1; - unsigned short feature_id = group_id >> power_feature_workgroups; + uint16_t feature_id = group_id >> power_feature_workgroups; // each 2^POWER_FEATURE_WORKGROUPS workgroups process on one feature (compile-time constant) // feature_size is the number of examples per feature @@ -811,7 +813,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, uchar feature; uchar feature_next; // uint8_t bin; - unsigned short bin; + uint16_t bin; feature = feature_data[ind >> feature_mask]; if (feature_mask) { @@ -913,14 +915,14 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, acc_type *__restrict__ output = reinterpret_cast(output_buf) + group_id * 3 * NUM_BINS; // write gradients and hessians acc_type *__restrict__ ptr_f = output; - for (unsigned short i = ltid; i < 2 * NUM_BINS; i += lsize) { + for (uint16_t i = ltid; i < 2 * NUM_BINS; i += lsize) { // even threads read gradients, odd threads read hessians acc_type value = gh_hist[i]; ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; } // write counts acc_int_type *__restrict__ ptr_i = reinterpret_cast(output + 2 * NUM_BINS); - for (unsigned short i = ltid; i < NUM_BINS; i += lsize) { + for (uint16_t i = ltid; i < NUM_BINS; i += lsize) { unsigned int value = cnt_hist[i]; ptr_i[i] = value; } From 05274d4af60d50d8e2680a38e8d584113d3f6228 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Fri, 14 Aug 2020 11:58:29 +0000 Subject: [PATCH 109/119] Changes based on reviewers comments. --- CMakeLists.txt | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 58e5d86632c..2c9157fb034 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,18 +1,12 @@ if(USE_GPU OR APPLE) cmake_minimum_required(VERSION 3.2) elseif(USE_CUDA) - cmake_minimum_required(VERSION 3.11) - enable_language(CUDA) + cmake_minimum_required(VERSION 3.16) + PROJECT(lightgbm LANGUAGES CUDA) else() cmake_minimum_required(VERSION 2.8) endif() -if(USE_CUDA) - PROJECT(lightgbm LANGUAGES C CXX CUDA) -else() - PROJECT(lightgbm LANGUAGES C CXX) -endif() - PROJECT(lightgbm) OPTION(USE_MPI "Enable MPI-based parallel learning" OFF) @@ -135,20 +129,19 @@ endif(USE_GPU) if(USE_CUDA) find_package(CUDA REQUIRED) + SET(USE_OPENMP ON CACHE BOOL "CUDA requires OpenMP" FORCE) include_directories(${CUDA_INCLUDE_DIRS}) - LIST(APPEND CMAKE_CUDA_FLAGS -g -Xcompiler=-fopenmp -Xcompiler=-fPIC -Xcompiler=-Wall -lineinfo) - CUDA_SELECT_NVCC_ARCH_FLAGS(CUDA_ARCH_FLAGS 7.0) + LIST(APPEND CMAKE_CUDA_FLAGS -Xcompiler=${OpenMP_CXX_FLAGS} -Xcompiler=-fPIC -Xcompiler=-Wall) + CUDA_SELECT_NVCC_ARCH_FLAGS(CUDA_ARCH_FLAGS 6.0 6.1 6.2 7.0 7.5+PTX) LIST(APPEND CMAKE_CUDA_FLAGS ${CUDA_ARCH_FLAGS}) - if(CMAKE_BUILD_TYPE MATCHES Release) - LIST(APPEND CMAKE_CUDA_FLAGS -03) + if(USE_DEBUG) + SET(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -g") + else() + SET(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3 -lineinfo") endif() - - message(STATUS "CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}") string(REPLACE ";" " " CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}") message(STATUS "CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}") - set(CMAKE_CUDA_FLAGS_DEBUG -G) - set(CMAKE_CUDA_FLAGS_RELEASE -lineinfo) ADD_DEFINITIONS(-DUSE_CUDA) if (NOT DEFINED CMAKE_CUDA_STANDARD) @@ -381,12 +374,12 @@ if(USE_GPU) endif(USE_GPU) if(USE_CUDA) - set_property(TARGET lightgbm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) + set_target_properties(lightgbm PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON) TARGET_LINK_LIBRARIES( lightgbm ${histograms} ) - set_property(TARGET 
_lightgbm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) + set_target_properties(_lightgbm PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON) TARGET_LINK_LIBRARIES( _lightgbm ${histograms} From 8bb20d3433c2b3e672cdd842c6ec64749819b850 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 17 Aug 2020 13:38:23 +0000 Subject: [PATCH 110/119] More reviewer comment changes. --- CMakeLists.txt | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2c9157fb034..79870e8c54d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,12 +2,15 @@ if(USE_GPU OR APPLE) cmake_minimum_required(VERSION 3.2) elseif(USE_CUDA) cmake_minimum_required(VERSION 3.16) - PROJECT(lightgbm LANGUAGES CUDA) else() cmake_minimum_required(VERSION 2.8) endif() -PROJECT(lightgbm) +if(USE_CUDA) + PROJECT(lightgbm LANGUAGES C CXX CUDA) +else() + PROJECT(lightgbm LANGUAGES C CXX) +endif() OPTION(USE_MPI "Enable MPI-based parallel learning" OFF) OPTION(USE_OPENMP "Enable OpenMP" ON) @@ -98,6 +101,10 @@ else() ADD_DEFINITIONS(-DUSE_SOCKET) endif(USE_MPI) +if(USE_CUDA) + SET(USE_OPENMP ON CACHE BOOL "CUDA requires OpenMP" FORCE) +endif(USE_CUDA) + if(USE_OPENMP) find_package(OpenMP REQUIRED) SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") @@ -129,7 +136,6 @@ endif(USE_GPU) if(USE_CUDA) find_package(CUDA REQUIRED) - SET(USE_OPENMP ON CACHE BOOL "CUDA requires OpenMP" FORCE) include_directories(${CUDA_INCLUDE_DIRS}) LIST(APPEND CMAKE_CUDA_FLAGS -Xcompiler=${OpenMP_CXX_FLAGS} -Xcompiler=-fPIC -Xcompiler=-Wall) CUDA_SELECT_NVCC_ARCH_FLAGS(CUDA_ARCH_FLAGS 6.0 6.1 6.2 7.0 7.5+PTX) @@ -187,7 +193,6 @@ if(USE_CUDA) add_histogram("${hsize}" "-fulldata_sp_const" "True" "1" "${FULLDATA_DEFINES}") add_histogram("${hsize}" "-fulldata_sp" "True" "0" "${FULLDATA_DEFINES}") endforeach() - endif(USE_CUDA) if(USE_HDFS) From cc6d348cf77323a1ab09341f692d89ace1d5484f Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Fri, 21 Aug 2020 18:44:21 +0000 Subject: [PATCH 111/119] Adding warning for is_sparse. Revert tmp_subset code. 
Only return FeatureGroupData if not is_multi_val_ --- include/LightGBM/feature_group.h | 3 +++ src/boosting/gbdt.cpp | 15 +++------------ src/io/dataset.cpp | 3 +++ 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/include/LightGBM/feature_group.h b/include/LightGBM/feature_group.h index 2e0db94f19c..3ba5c143f85 100644 --- a/include/LightGBM/feature_group.h +++ b/include/LightGBM/feature_group.h @@ -233,6 +233,9 @@ class FeatureGroup { } inline void* FeatureGroupData() { + if (is_multi_val_) { + return nullptr; + } return bin_data_->get_data(); } diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index b63bdc1ec0f..2c9fb3b734e 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -801,18 +801,9 @@ void GBDT::ResetBaggingConfig(const Config* config, bool is_change_dataset) { } } else { bag_data_cnt_ = num_data_; - if (config_->device_type == std::string("cuda")) { - if (tmp_subset_ == nullptr) { - tmp_subset_.reset(new Dataset(bag_data_cnt_)); - tmp_subset_->CopyFeatureMapperFrom(train_data_); - is_use_subset_ = false; - bag_data_indices_.clear(); - } - } else { - bag_data_indices_.clear(); - bagging_runner_.ReSize(0); - is_use_subset_ = false; - } + bag_data_indices_.clear(); + bagging_runner_.ReSize(0); + is_use_subset_ = false; } } diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 44e7be3db92..b7689288874 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -340,6 +340,9 @@ void Dataset::Construct(std::vector>* bin_mappers, #ifdef USE_CUDA if (io_config.device_type == std::string("cuda")) { LightGBM::LGBM_config_::current_device = lgbm_device_cuda; + if (is_sparse) { + Log::Warning("Using sparse features with CUDA is currently not supported."); + } is_sparse = false; } #endif From 5f3f1e023dce5abaa0c4e13761662bd7da7d1888 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 24 Aug 2020 11:48:58 +0000 Subject: [PATCH 112/119] Fix so that CUDA code will compile even if you enable the SCORE_T_USE_DOUBLE define. 
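A minimal sketch of the warning-suppression pattern the next diff applies in LGBM_BoosterUpdateOneIterCustom: when SCORE_T_USE_DOUBLE compiles the custom-objective path out, the otherwise-unused parameters are cast to void so that -Wall builds stay warning-free. The function below is a hypothetical stand-in, not the LightGBM C API.

#include <cstdio>

// #define SCORE_T_USE_DOUBLE   // flip this to check that both branches compile cleanly
int update_with_custom_loss(void* handle, const float* grad, const float* hess, int* is_finished) {
#ifdef SCORE_T_USE_DOUBLE
  (void) handle;        // UNUSED VARIABLE: this configuration rejects custom losses,
  (void) grad;          // so the parameters are consumed by explicit void casts
  (void) hess;          // instead of triggering -Wunused-parameter
  (void) is_finished;
  std::fprintf(stderr, "custom loss is not supported when score_t is double\n");
  return -1;
#else
  if (handle == nullptr || grad == nullptr || hess == nullptr || is_finished == nullptr) return -1;
  *is_finished = 0;     // a real implementation would run one boosting iteration here
  return 0;
#endif
}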
--- src/c_api.cpp | 6 +- src/treelearner/cuda_kernel_launcher.cu | 76 ++++++++++--------- src/treelearner/cuda_kernel_launcher.h | 3 + src/treelearner/cuda_tree_learner.cpp | 6 +- src/treelearner/cuda_tree_learner.h | 2 +- .../kernels/histogram_16_64_256.cu | 7 +- .../kernels/histogram_16_64_256.hu | 12 +-- 7 files changed, 65 insertions(+), 47 deletions(-) diff --git a/src/c_api.cpp b/src/c_api.cpp index 61b3038e660..a389e8e47b1 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -1611,10 +1611,14 @@ int LGBM_BoosterUpdateOneIterCustom(BoosterHandle handle, const float* hess, int* is_finished) { API_BEGIN(); - Booster* ref_booster = reinterpret_cast(handle); #ifdef SCORE_T_USE_DOUBLE + (void) handle; // UNUSED VARIABLE + (void) grad; // UNUSED VARIABLE + (void) hess; // UNUSED VARIABLE + (void) is_finished; // UNUSED VARIABLE Log::Fatal("Don't support custom loss function when SCORE_T_USE_DOUBLE is enabled"); #else + Booster* ref_booster = reinterpret_cast(handle); if (ref_booster->TrainOneIter(grad, hess)) { *is_finished = 1; } else { diff --git a/src/treelearner/cuda_kernel_launcher.cu b/src/treelearner/cuda_kernel_launcher.cu index 218be6d72b9..87265ffd956 100644 --- a/src/treelearner/cuda_kernel_launcher.cu +++ b/src/treelearner/cuda_kernel_launcher.cu @@ -9,6 +9,8 @@ #include #include +namespace LightGBM { + void cuda_histogram( int histogram_size, data_size_t leaf_num_data, @@ -34,20 +36,20 @@ void cuda_histogram( if (use_all_features) { if (!is_constant_hessian) histogram16<<<16*num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, - static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + arg3, arg4, arg5, + arg6, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else histogram16<<<16*num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + arg3, arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } else { if (!is_constant_hessian) histogram16_fulldata<<<16*num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, - static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + arg3, arg4, arg5, + arg6, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else histogram16_fulldata<<<16*num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + arg3, arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } } else { @@ -55,20 +57,20 @@ void cuda_histogram( // seems all features is always enabled, so this should be the same as fulldata if (!is_constant_hessian) histogram16<<<16*num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, - static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + arg3, arg4, arg5, + arg6, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else histogram16<<<16*num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + arg3, arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } else { if (!is_constant_hessian) histogram16<<<16*num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, - static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + arg3, arg4, arg5, + arg6, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else histogram16<<<16*num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, 
arg5, + arg3, arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } } @@ -77,20 +79,20 @@ void cuda_histogram( if (use_all_features) { if (!is_constant_hessian) histogram64<<<4*num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, - static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + arg3, arg4, arg5, + arg6, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else histogram64<<<4*num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + arg3, arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } else { if (!is_constant_hessian) histogram64_fulldata<<<4*num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, - static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + arg3, arg4, arg5, + arg6, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else histogram64_fulldata<<<4*num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + arg3, arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } } else { @@ -98,20 +100,20 @@ void cuda_histogram( // seems all features is always enabled, so this should be the same as fulldata if (!is_constant_hessian) histogram64<<<4*num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, - static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + arg3, arg4, arg5, + arg6, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else histogram64<<<4*num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + arg3, arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } else { if (!is_constant_hessian) histogram64<<<4*num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, - static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + arg3, arg4, arg5, + arg6, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else histogram64<<<4*num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + arg3, arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } } @@ -120,20 +122,20 @@ void cuda_histogram( if (use_all_features) { if (!is_constant_hessian) histogram256<<>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, - static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + arg3, arg4, arg5, + arg6, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else histogram256<<>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + arg3, arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } else { if (!is_constant_hessian) histogram256_fulldata<<>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, - static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + arg3, arg4, arg5, + arg6, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else histogram256_fulldata<<>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + arg3, arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } } else { @@ -141,24 +143,26 @@ void cuda_histogram( // seems all features is always enabled, so this should be the same as fulldata if (!is_constant_hessian) histogram256<<>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, - 
static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + arg3, arg4, arg5, + arg6, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else histogram256<<>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + arg3, arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } else { if (!is_constant_hessian) histogram256<<>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, - static_cast(arg6), arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); + arg3, arg4, arg5, + arg6, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); else histogram256<<>>(arg0, arg1, arg2, - reinterpret_cast(arg3), arg4, arg5, + arg3, arg4, arg5, arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); } } } } +} // namespace LightGBM + #endif // USE_CUDA diff --git a/src/treelearner/cuda_kernel_launcher.h b/src/treelearner/cuda_kernel_launcher.h index 6b6b7cb4b01..faa2b436de2 100644 --- a/src/treelearner/cuda_kernel_launcher.h +++ b/src/treelearner/cuda_kernel_launcher.h @@ -9,6 +9,8 @@ #include #include "kernels/histogram_16_64_256.hu" // kernel, acc_type, data_size_t, uchar, score_t +namespace LightGBM { + struct ThreadData { // device id int device_id; @@ -64,6 +66,7 @@ void cuda_histogram( void* arg9, size_t exp_workgroups_per_feature); +} // namespace LightGBM #endif // USE_CUDA #endif // LIGHTGBM_TREELEARNER_CUDA_KERNEL_LAUNCHER_H_ diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 067c06635af..306d6700c22 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -19,6 +19,8 @@ #include "../io/dense_bin.hpp" +namespace LightGBM { + #define cudaMemcpy_DEBUG 0 // 1: DEBUG cudaMemcpy #define ResetTrainingData_DEBUG 0 // 1: Debug ResetTrainingData @@ -36,7 +38,7 @@ static void *launch_cuda_histogram(void *thread_data) { td.device_features, td.device_feature_masks, td.num_data, - reinterpret_cast(td.device_data_indices), + td.device_data_indices, td.leaf_num_data, td.device_gradients, td.device_hessians, td.hessians_const, @@ -49,8 +51,6 @@ static void *launch_cuda_histogram(void *thread_data) { return NULL; } -namespace LightGBM { - CUDATreeLearner::CUDATreeLearner(const Config* config) :SerialTreeLearner(config) { use_bagging_ = false; diff --git a/src/treelearner/cuda_tree_learner.h b/src/treelearner/cuda_tree_learner.h index 1506c5cf21b..e8bc9d331f7 100644 --- a/src/treelearner/cuda_tree_learner.h +++ b/src/treelearner/cuda_tree_learner.h @@ -118,7 +118,7 @@ class CUDATreeLearner: public SerialTreeLearner { td->stream = stream_[device_id]; td->device_features = device_features_[device_id]; td->device_feature_masks = reinterpret_cast(device_feature_masks_[device_id]); - td->device_data_indices = reinterpret_cast(device_data_indices_[device_id]); + td->device_data_indices = device_data_indices_[device_id]; td->device_gradients = device_gradients_[device_id]; td->device_hessians = device_hessians_[device_id]; td->hessians_const = hessians_[0]; diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index f57e8f9d838..5c7cfdb4a9e 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -3,11 +3,15 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. 
*/ -#include "histogram_16_64_256.hu" +#include #include #include +#include "histogram_16_64_256.hu" + +namespace LightGBM { + #define PRINT(b, t, fmt, ...) \ if (b == gtid && t == ltid) { \ printf(fmt, __VA_ARGS__); \ @@ -966,3 +970,4 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // end of histogram256 stuff +} // namespace LightGBM diff --git a/src/treelearner/kernels/histogram_16_64_256.hu b/src/treelearner/kernels/histogram_16_64_256.hu index 8cc464dfb32..8e3d3a5ec78 100644 --- a/src/treelearner/kernels/histogram_16_64_256.hu +++ b/src/treelearner/kernels/histogram_16_64_256.hu @@ -6,7 +6,9 @@ #ifndef LIGHTGBM_TREELEARNER_KERNELS_HISTOGRAM_16_64_256_HU_ #define LIGHTGBM_TREELEARNER_KERNELS_HISTOGRAM_16_64_256_HU_ -//#pragma once +#include "LightGBM/meta.h" + +namespace LightGBM { // use double precision or not #ifndef USE_DP_FLOAT @@ -73,9 +75,6 @@ typedef unsigned int acc_int_type; #define ENABLE_ALL_FEATURES 1 #endif -typedef unsigned int data_size_t; -typedef float score_t; - // define all of the different kernels #define DECLARE_CONST_BUF(name) \ @@ -156,4 +155,7 @@ DECLARE(histogram256_allfeats); DECLARE(histogram256_fulldata); DECLARE(histogram256); -#endif // _HITOGRAM_256_KERNEL_ +} // namespace LightGBM + +#endif // LIGHTGBM_TREELEARNER_KERNELS_HISTOGRAM_16_64_256_HU_ + From 676807a0da6ca2da2f5b34294f6d5c5b5b61138c Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 24 Aug 2020 15:03:08 +0000 Subject: [PATCH 113/119] Reviewer comment cleanup. --- include/LightGBM/cuda/cuda_utils.h | 8 +- include/LightGBM/cuda/vector_cudahost.h | 32 +++-- src/application/application.cpp | 2 +- src/boosting/gbdt.h | 2 +- src/io/config.cpp | 16 +-- src/io/dataset.cpp | 2 +- src/io/dense_bin.hpp | 1 - src/treelearner/cuda_kernel_launcher.cu | 3 + src/treelearner/cuda_kernel_launcher.h | 2 - src/treelearner/cuda_tree_learner.cpp | 133 +++++++++--------- src/treelearner/cuda_tree_learner.h | 65 ++++----- .../kernels/histogram_16_64_256.cu | 45 +++--- src/treelearner/parallel_tree_learner.h | 2 +- src/treelearner/serial_tree_learner.h | 2 +- src/treelearner/tree_learner.cpp | 2 +- 15 files changed, 144 insertions(+), 173 deletions(-) diff --git a/include/LightGBM/cuda/cuda_utils.h b/include/LightGBM/cuda/cuda_utils.h index 3c0264cb396..b94b12d1c92 100644 --- a/include/LightGBM/cuda/cuda_utils.h +++ b/include/LightGBM/cuda/cuda_utils.h @@ -2,8 +2,8 @@ * Copyright (c) 2020 IBM Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. 
*/ -#ifndef LIGHTGBM_CUDA_UTILS_H_ -#define LIGHTGBM_CUDA_UTILS_H_ +#ifndef LIGHTGBM_CUDA_CUDA_UTILS_H_ +#define LIGHTGBM_CUDA_CUDA_UTILS_H_ #ifdef USE_CUDA @@ -19,6 +19,6 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = } } -#endif /* USE_CUDA */ +#endif // USE_CUDA -#endif +#endif // LIGHTGBM_CUDA_CUDA_UTILS_H_ diff --git a/include/LightGBM/cuda/vector_cudahost.h b/include/LightGBM/cuda/vector_cudahost.h index a5d97370261..60a82cc8391 100644 --- a/include/LightGBM/cuda/vector_cudahost.h +++ b/include/LightGBM/cuda/vector_cudahost.h @@ -5,21 +5,27 @@ #ifndef LIGHTGBM_CUDA_VECTOR_CUDAHOST_H_ #define LIGHTGBM_CUDA_VECTOR_CUDAHOST_H_ +#include + #ifdef USE_CUDA #include #include #endif #include -namespace LightGBM { +enum LGBM_Device { + lgbm_device_cpu, + lgbm_device_gpu, + lgbm_device_cuda +}; -#define lgbm_device_cpu 0 -#define lgbm_device_gpu 1 -#define lgbm_device_cuda 2 +enum Use_Learner { + use_cpu_learner, + use_gpu_learner, + use_cuda_learner +}; -#define use_cpu_learner 0 -#define use_gpu_learner 1 -#define use_cuda_learner 2 +namespace LightGBM { class LGBM_config_ { public: @@ -43,13 +49,13 @@ struct CHAllocator { cudaError_t ret = cudaHostAlloc(&ptr, n*sizeof(T), cudaHostAllocPortable); if (ret != cudaSuccess) { fprintf(stderr, " TROUBLE: defaulting to malloc in CHAllocator!!!\n"); fflush(stderr); - ptr = reinterpret_cast(malloc(n*sizeof(T))); + ptr = reinterpret_cast(_mm_malloc(n*sizeof(T), 16)); } } else { - ptr = reinterpret_cast(malloc(n*sizeof(T))); + ptr = reinterpret_cast(_mm_malloc(n*sizeof(T), 16)); } #else - ptr = reinterpret_cast(malloc(n*sizeof(T))); + ptr = reinterpret_cast(_mm_malloc(n*sizeof(T), 16)); #endif return ptr; } @@ -65,10 +71,10 @@ struct CHAllocator { cudaFreeHost(p); } } else { - free(p); + _mm_free(p); } #else - free(p); + _mm_free(p); #endif } }; @@ -77,4 +83,4 @@ bool operator==(const CHAllocator&, const CHAllocator&); template bool operator!=(const CHAllocator&, const CHAllocator&); -#endif +#endif // LIGHTGBM_CUDA_VECTOR_CUDAHOST_H_ diff --git a/src/application/application.cpp b/src/application/application.cpp index bd8f103acbe..c62cdd711e0 100644 --- a/src/application/application.cpp +++ b/src/application/application.cpp @@ -11,10 +11,10 @@ #include #include #include +#include #include #include #include -#include #include #include diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index 865a64dfe3b..0d38385d5f0 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -8,9 +8,9 @@ #include #include #include +#include #include #include -#include #include #include diff --git a/src/io/config.cpp b/src/io/config.cpp index 8312da591dd..f0a9544e3b9 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -4,12 +4,11 @@ */ #include +#include #include #include #include -#include - #include namespace LightGBM { @@ -328,20 +327,15 @@ void Config::CheckParamConflict() { num_leaves = static_cast(full_num_leaves); } } - // force col-wise for gpu - if (device_type == std::string("gpu")) { - force_col_wise = true; - force_row_wise = false; - } - - // force col-wise for CUDA - if (device_type == std::string("cuda")) { + // force col-wise for gpu & CUDA + if (device_type == std::string("gpu") || device_type == std::string("cuda")) { force_col_wise = true; force_row_wise = false; } // force gpu_use_dp for CUDA - if (device_type == std::string("cuda")) { + if (device_type == std::string("cuda") && !gpu_use_dp) { + Log::Warning("CUDA currently requires double precision calculations."); gpu_use_dp = true; } diff --git 
a/src/io/dataset.cpp b/src/io/dataset.cpp index b7689288874..2d9693bc695 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -6,10 +6,10 @@ #include #include +#include #include #include #include -#include #include #include diff --git a/src/io/dense_bin.hpp b/src/io/dense_bin.hpp index c5a95d6af79..4a1cc43fa79 100644 --- a/src/io/dense_bin.hpp +++ b/src/io/dense_bin.hpp @@ -8,7 +8,6 @@ #include #include -#include #include #include diff --git a/src/treelearner/cuda_kernel_launcher.cu b/src/treelearner/cuda_kernel_launcher.cu index 87265ffd956..8ceb5b813c9 100644 --- a/src/treelearner/cuda_kernel_launcher.cu +++ b/src/treelearner/cuda_kernel_launcher.cu @@ -5,8 +5,11 @@ #ifdef USE_CUDA #include "cuda_kernel_launcher.h" + #include + #include + #include namespace LightGBM { diff --git a/src/treelearner/cuda_kernel_launcher.h b/src/treelearner/cuda_kernel_launcher.h index faa2b436de2..0714e05b2f2 100644 --- a/src/treelearner/cuda_kernel_launcher.h +++ b/src/treelearner/cuda_kernel_launcher.h @@ -24,9 +24,7 @@ struct ThreadData { cudaStream_t stream; uint8_t* device_features; uint8_t* device_feature_masks; - // data_size_t num_data; data_size_t* device_data_indices; - // data_size_t leaf_num_data; score_t* device_gradients; score_t* device_hessians; score_t hessians_const; diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 306d6700c22..813f99d4ba8 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -5,17 +5,17 @@ #ifdef USE_CUDA #include "cuda_tree_learner.h" -#include -#include #include +#include +#include +#include +#include #include -#include - #include -#include #include +#include #include "../io/dense_bin.hpp" @@ -24,7 +24,7 @@ namespace LightGBM { #define cudaMemcpy_DEBUG 0 // 1: DEBUG cudaMemcpy #define ResetTrainingData_DEBUG 0 // 1: Debug ResetTrainingData -#define GPU_DEBUG 0 +#define CUDA_DEBUG 0 static void *launch_cuda_histogram(void *thread_data) { ThreadData td = *(reinterpret_cast(thread_data)); @@ -33,18 +33,18 @@ static void *launch_cuda_histogram(void *thread_data) { // launch cuda kernel cuda_histogram(td.histogram_size, - td.leaf_num_data, td.num_data, td.use_all_features, - td.is_constant_hessian, td.num_workgroups, td.stream, - td.device_features, - td.device_feature_masks, - td.num_data, - td.device_data_indices, - td.leaf_num_data, - td.device_gradients, - td.device_hessians, td.hessians_const, - td.device_subhistograms, td.sync_counters, - td.device_histogram_outputs, - td.exp_workgroups_per_feature); + td.leaf_num_data, td.num_data, td.use_all_features, + td.is_constant_hessian, td.num_workgroups, td.stream, + td.device_features, + td.device_feature_masks, + td.num_data, + td.device_data_indices, + td.leaf_num_data, + td.device_gradients, + td.device_hessians, td.hessians_const, + td.device_subhistograms, td.sync_counters, + td.device_histogram_outputs, + td.exp_workgroups_per_feature); CUDASUCCESS_OR_FATAL(cudaGetLastError()); @@ -73,12 +73,12 @@ void CUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) // some additional variables needed for GPU trainer num_feature_groups_ = train_data_->num_feature_groups(); - // Initialize GPU buffers and kernels & LGBM_CUDA: get device info + // Initialize GPU buffers and kernels: get device info InitGPU(); } // some functions used for debugging the GPU histogram construction -#if GPU_DEBUG > 0 +#if CUDA_DEBUG > 0 void PrintHistograms(hist_t* h, size_t size) { double total_hess = 0; @@ -169,7 +169,7 @@ int 
CUDATreeLearner::GetNumWorkgroupsPerFeature(data_size_t leaf_num_data) { int exp_workgroups_per_feature = static_cast(ceil(log2(x))); double t = leaf_num_data / 1024.0; - Log::Debug("We can have at most %d workgroups per feature4 for efficiency reasons" + Log::Debug("We can have at most %d workgroups per feature4 for efficiency reasons\n" "Best workgroup size per feature for full utilization is %d\n", static_cast(ceil(t)), (1 << exp_workgroups_per_feature)); exp_workgroups_per_feature = std::min(exp_workgroups_per_feature, static_cast(ceil(log(static_cast(t))/log(2.0)))); @@ -188,7 +188,7 @@ void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featu // each 2^exp_workgroups_per_feature workgroups work on a feature4 tuple int exp_workgroups_per_feature = GetNumWorkgroupsPerFeature(leaf_num_data); std::vector num_gpu_workgroups; - ThreadData *thread_data = reinterpret_cast(malloc(sizeof(ThreadData) * num_gpu_)); + ThreadData *thread_data = reinterpret_cast(_mm_malloc(sizeof(ThreadData) * num_gpu_, 16)); for (int device_id = 0; device_id < num_gpu_; ++device_id) { int num_gpu_feature_groups = num_gpu_feature_groups_[device_id]; @@ -197,7 +197,7 @@ void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featu if (num_workgroups > preallocd_max_num_wg_[device_id]) { preallocd_max_num_wg_.at(device_id) = num_workgroups; CUDASUCCESS_OR_FATAL(cudaFree(device_subhistograms_[device_id])); - CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_subhistograms_[device_id]), (size_t) num_workgroups * dword_features_ * device_bin_size_ * (3 * hist_bin_entry_sz_ / 2))); + CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_subhistograms_[device_id]), static_cast(num_workgroups * dword_features_ * device_bin_size_ * (3 * hist_bin_entry_sz_ / 2)))); } // set thread_data SetThreadData(thread_data, device_id, histogram_size_, leaf_num_data, use_all_features, @@ -206,16 +206,14 @@ void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featu for (int device_id = 0; device_id < num_gpu_; ++device_id) { if (pthread_create(cpu_threads_[device_id], NULL, launch_cuda_histogram, reinterpret_cast(&thread_data[device_id]))) { - fprintf(stderr, "Error in creating threads. Exiting\n"); - exit(0); + Log::Fatal("Error in creating threads."); } } /* Wait for the threads to finish */ for (int device_id = 0; device_id < num_gpu_; ++device_id) { if (pthread_join(*(cpu_threads_[device_id]), NULL)) { - fprintf(stderr, "Error in joining threads. Exiting\n"); - exit(0); + Log::Fatal("Error in joining threads."); } } @@ -287,7 +285,7 @@ void CUDATreeLearner::prevAllocateGPUMemory() { return; } - // LGBM_CUDA: calculate number of feature groups per gpu + // calculate number of feature groups per gpu num_gpu_feature_groups_.resize(num_gpu_); offset_gpu_feature_groups_.resize(num_gpu_); int num_features_per_gpu = num_dense_feature_groups_ / num_gpu_; @@ -297,7 +295,7 @@ void CUDATreeLearner::prevAllocateGPUMemory() { for (int i = 0; i < num_gpu_; ++i) { offset_gpu_feature_groups_.at(i) = offset; - num_gpu_feature_groups_.at(i) = (i < remain_features)? num_features_per_gpu + 1 : num_features_per_gpu; + num_gpu_feature_groups_.at(i) = (i < remain_features) ? 
num_features_per_gpu + 1 : num_features_per_gpu; offset += num_gpu_feature_groups_.at(i); } @@ -317,7 +315,7 @@ void CUDATreeLearner::prevAllocateGPUMemory() { nthreads_ = std::max(nthreads_, 1); } -// LGBM_CUDA: allocate GPU memory for each GPU +// allocate GPU memory for each GPU void CUDATreeLearner::AllocateGPUMemory() { #pragma omp parallel for schedule(static, num_gpu_) @@ -328,11 +326,11 @@ void CUDATreeLearner::AllocateGPUMemory() { CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); // allocate memory for all features - if ( device_features_[device_id] != NULL ) { - CUDASUCCESS_OR_FATAL(cudaFree(device_features_[device_id])); + if (device_features_[device_id] != NULL) { + CUDASUCCESS_OR_FATAL(cudaFree(device_features_[device_id])); } - CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_features_[device_id]), (size_t)num_gpu_feature_groups * num_data_ * sizeof(uint8_t))); + CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_features_[device_id]), static_cast(num_gpu_feature_groups * num_data_ * sizeof(uint8_t)))); Log::Debug("Allocated device_features_ addr=%p sz=%lu", device_features_[device_id], num_gpu_feature_groups * num_data_); // allocate space for gradients and hessians on device @@ -349,17 +347,17 @@ void CUDATreeLearner::AllocateGPUMemory() { CUDASUCCESS_OR_FATAL(cudaFree(device_feature_masks_[device_id])); } - CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_gradients_[device_id]), (size_t) allocated_num_data_ * sizeof(score_t))); - CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_hessians_[device_id]), (size_t) allocated_num_data_ * sizeof(score_t))); + CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_gradients_[device_id]), static_cast(allocated_num_data_ * sizeof(score_t)))); + CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_hessians_[device_id]), static_cast(allocated_num_data_ * sizeof(score_t)))); - CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_feature_masks_[device_id]), (size_t) num_gpu_feature_groups)); + CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_feature_masks_[device_id]), static_cast(num_gpu_feature_groups))); // copy indices to the device if (device_data_indices_[device_id] != NULL) { CUDASUCCESS_OR_FATAL(cudaFree(device_data_indices_[device_id])); } - CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_data_indices_[device_id]), (size_t) allocated_num_data_ * sizeof(data_size_t))); + CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_data_indices_[device_id]), static_cast(allocated_num_data_ * sizeof(data_size_t)))); CUDASUCCESS_OR_FATAL(cudaMemsetAsync(device_data_indices_[device_id], 0, allocated_num_data_ * sizeof(data_size_t), stream_[device_id])); Log::Debug("Memset device_data_indices_"); @@ -368,19 +366,19 @@ void CUDATreeLearner::AllocateGPUMemory() { // each work group generates a sub-histogram of dword_features_ features. 
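         // note: each sub-histogram bin stores a grad/hess pair (hist_bin_entry_sz_, two elements) plus a
         // same-width per-bin counter used by the kernels, which is where the 3 * hist_bin_entry_sz_ / 2
         // sizing below comes from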
if (!device_subhistograms_[device_id]) { // only initialize once here, as this will not need to change when ResetTrainingData() is called - CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_subhistograms_[device_id]), (size_t) preallocd_max_num_wg_[device_id] * dword_features_ * device_bin_size_ * (3 * hist_bin_entry_sz_ / 2))); + CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_subhistograms_[device_id]), static_cast(preallocd_max_num_wg_[device_id] * dword_features_ * device_bin_size_ * (3 * hist_bin_entry_sz_ / 2)))); Log::Debug("created device_subhistograms_: %p", device_subhistograms_[device_id]); } // create atomic counters for inter-group coordination CUDASUCCESS_OR_FATAL(cudaFree(sync_counters_[device_id])); - CUDASUCCESS_OR_FATAL(cudaMalloc(&(sync_counters_[device_id]), (size_t) num_gpu_feature_groups * sizeof(int))); + CUDASUCCESS_OR_FATAL(cudaMalloc(&(sync_counters_[device_id]), static_cast(num_gpu_feature_groups * sizeof(int)))); CUDASUCCESS_OR_FATAL(cudaMemsetAsync(sync_counters_[device_id], 0, num_gpu_feature_groups * sizeof(int), stream_[device_id])); // The output buffer is allocated to host directly, to overlap compute and data transfer CUDASUCCESS_OR_FATAL(cudaFree(device_histogram_outputs_[device_id])); - CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_histogram_outputs_[device_id]), (size_t) num_gpu_feature_groups * device_bin_size_ * hist_bin_entry_sz_)); + CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_histogram_outputs_[device_id]), static_cast(num_gpu_feature_groups * device_bin_size_ * hist_bin_entry_sz_))); } } } @@ -399,7 +397,7 @@ void CUDATreeLearner::copyDenseFeature() { Log::Debug("Started copying dense features from CPU to GPU"); // find the dense feature-groups and group then into Feature4 data structure (several feature-groups packed into 4 bytes) - size_t copied_feature = 0; + size_t copied_feature = 0; // set device info int device_id = 0; uint8_t* device_features = device_features_[device_id]; @@ -412,12 +410,12 @@ void CUDATreeLearner::copyDenseFeature() { dense_feature_group_map_.push_back(i); auto sizes_in_byte = train_data_->FeatureGroupSizesInByte(i); void* tmp_data = train_data_->FeatureGroupData(i); - Log::Debug("Started copying dense features from CPU to GPU - 2"); + Log::Debug("Started copying dense features from CPU to GPU - 2"); CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(&device_features[copied_feature * num_data_], tmp_data, sizes_in_byte, cudaMemcpyHostToDevice, stream_[device_id])); - Log::Debug("Started copying dense features from CPU to GPU - 3"); + Log::Debug("Started copying dense features from CPU to GPU - 3"); copied_feature++; // reset device info - if (copied_feature == (size_t) num_gpu_feature_groups_[device_id]) { + if (copied_feature == static_cast(num_gpu_feature_groups_[device_id])) { CUDASUCCESS_OR_FATAL(cudaEventRecord(features_future_[device_id], stream_[device_id])); device_id += 1; copied_feature = 0; @@ -434,24 +432,24 @@ void CUDATreeLearner::copyDenseFeature() { -// LGBM_CUDA: InitGPU w/ num_gpu +// InitGPU w/ num_gpu void CUDATreeLearner::InitGPU() { // Get the max bin size, used for selecting best GPU kernel max_num_bin_ = 0; - #if GPU_DEBUG >= 1 + #if CUDA_DEBUG >= 1 printf("bin_size: "); #endif for (int i = 0; i < num_feature_groups_; ++i) { if (train_data_->IsMultiGroup(i)) { continue; } - #if GPU_DEBUG >= 1 + #if CUDA_DEBUG >= 1 printf("%d, ", train_data_->FeatureGroupNumBin(i)); #endif max_num_bin_ = std::max(max_num_bin_, train_data_->FeatureGroupNumBin(i)); } - #if GPU_DEBUG >= 1 + #if CUDA_DEBUG >= 1 printf("\n"); #endif @@ -478,21 +476,24 @@ 
void CUDATreeLearner::InitGPU() { Log::Warning("Setting max_bin to 15 is sugguested for best performance"); } - // LGBM_CUDA: get num_dense_feature_groups_ + // get num_dense_feature_groups_ CountDenseFeatureGroups(); - // LGBM_CUDA: initialize GPU + // initialize GPU CUDASUCCESS_OR_FATAL(cudaGetDeviceCount(&num_gpu_)); - if (num_gpu_ > 1) num_gpu_ = 1; + if (num_gpu_ > 1) { + Log::Warning("CUDA doesn't support more than one GPU currently."); + num_gpu_ = 1; + } if (num_gpu_ > num_dense_feature_groups_) num_gpu_ = num_dense_feature_groups_; - // LGBM_CUDA: set cpu threads - cpu_threads_ = reinterpret_cast(malloc(sizeof(pthread_t *)*num_gpu_)); + // set cpu threads + cpu_threads_ = reinterpret_cast(_mm_malloc(sizeof(pthread_t *)*num_gpu_, 16)); for (int device_id = 0; device_id < num_gpu_; ++device_id) { - cpu_threads_[device_id] = reinterpret_cast(malloc(sizeof(pthread_t))); + cpu_threads_[device_id] = reinterpret_cast(_mm_malloc(sizeof(pthread_t), 16)); } - // LGBM_CUDA: resize device memory pointers + // resize device memory pointers device_features_.resize(num_gpu_); device_gradients_.resize(num_gpu_); device_hessians_.resize(num_gpu_); @@ -502,7 +503,7 @@ void CUDATreeLearner::InitGPU() { device_subhistograms_.resize(num_gpu_); device_histogram_outputs_.resize(num_gpu_); - // LGBM_CUDA: create stream & events to handle multiple GPUs + // create stream & events to handle multiple GPUs preallocd_max_num_wg_.resize(num_gpu_, 1024); stream_.resize(num_gpu_); hessians_future_.resize(num_gpu_); @@ -539,7 +540,7 @@ Tree* CUDATreeLearner::Train(const score_t* gradients, const score_t *hessians) } void CUDATreeLearner::ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) { - // LGBM_CUDA: check data size + // check data size data_size_t old_allocated_num_data = allocated_num_data_; SerialTreeLearner::ResetTrainingDataInner(train_data, is_constant_hessian, reset_multi_val_bin); @@ -555,7 +556,7 @@ void CUDATreeLearner::ResetTrainingDataInner(const Dataset* train_data, bool is_ auto start_alloc_gpu_time = std::chrono::steady_clock::now(); #endif - // LGBM_CUDA: AllocateGPUMemory only when the number of data increased + // AllocateGPUMemory only when the number of data increased int old_num_feature_groups = num_dense_feature_groups_; CountDenseFeatureGroups(); if ((old_allocated_num_data < (num_data_ + 256 * (1 << kMaxLogWorkgroupsPerFeature))) || (old_num_feature_groups < num_dense_feature_groups_)) { @@ -594,7 +595,7 @@ void CUDATreeLearner::BeforeTrain() { SerialTreeLearner::BeforeTrain(); - #if GPU_DEBUG >= 2 + #if CUDA_DEBUG >= 2 printf("CUDATreeLearner::BeforeTrain() Copying initial full gradients and hessians to device\n"); #endif @@ -750,7 +751,7 @@ bool CUDATreeLearner::ConstructGPUHistogramsAsync( // if not all feature groups are used, we need to transfer the feature mask to GPU // otherwise, we will use a specialized GPU kernel with all feature groups enabled - // LGBM_CUDA We now copy even if all features are used. + // We now copy even if all features are used. 
#pragma omp parallel for schedule(static, num_gpu_) for (int device_id = 0; device_id < num_gpu_; ++device_id) { int offset = offset_gpu_feature_groups_[device_id]; @@ -819,8 +820,8 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ } // Compare GPU histogram with CPU histogram, useful for debuggin GPU code problem - // #define GPU_DEBUG_COMPARE -#ifdef GPU_DEBUG_COMPARE + // #define CUDA_DEBUG_COMPARE +#ifdef CUDA_DEBUG_COMPARE printf("Start Comparing_Histogram between GPU and CPU, num_dense_feature_groups_ = %d\n", num_dense_feature_groups_); bool compare = true; for (int i = 0; i < num_dense_feature_groups_; ++i) { @@ -927,7 +928,7 @@ void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_ void CUDATreeLearner::FindBestSplits(const Tree* tree) { SerialTreeLearner::FindBestSplits(tree); -#if GPU_DEBUG >= 3 +#if CUDA_DEBUG >= 3 for (int feature_index = 0; feature_index < num_features_; ++feature_index) { if (!col_sampler_.is_feature_used_bytree()[feature_index]) continue; if (parent_leaf_histogram_array_ != nullptr @@ -948,7 +949,7 @@ void CUDATreeLearner::FindBestSplits(const Tree* tree) { void CUDATreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) { const SplitInfo& best_split_info = best_split_per_leaf_[best_Leaf]; -#if GPU_DEBUG >= 2 +#if CUDA_DEBUG >= 2 printf("Splitting leaf %d with feature %d thresh %d gain %f stat %f %f %f %f\n", best_Leaf, best_split_info.feature, best_split_info.threshold, best_split_info.gain, best_split_info.left_sum_gradient, best_split_info.right_sum_gradient, best_split_info.left_sum_hessian, best_split_info.right_sum_hessian); #endif SerialTreeLearner::Split(tree, best_Leaf, left_leaf, right_leaf); @@ -957,12 +958,12 @@ void CUDATreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* righ if (best_split_info.left_count < best_split_info.right_count) { if ((best_split_info.left_count != smaller_leaf_splits_->num_data_in_leaf()) || (best_split_info.right_count!= larger_leaf_splits_->num_data_in_leaf())) { - Log::Fatal("1 Bug in GPU histogram! split %d: %d, smaller_leaf: %d, larger_leaf: %d\n", best_split_info.left_count, best_split_info.right_count, smaller_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf()); + Log::Fatal("Bug in GPU histogram! split %d: %d, smaller_leaf: %d, larger_leaf: %d\n", best_split_info.left_count, best_split_info.right_count, smaller_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf()); } } else { if ((best_split_info.left_count != larger_leaf_splits_->num_data_in_leaf()) || (best_split_info.right_count!= smaller_leaf_splits_->num_data_in_leaf())) { - Log::Fatal("2 Bug in GPU histogram! split %d: %d, smaller_leaf: %d, larger_leaf: %d\n", best_split_info.left_count, best_split_info.right_count, smaller_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf()); + Log::Fatal("Bug in GPU histogram! 
split %d: %d, smaller_leaf: %d, larger_leaf: %d\n", best_split_info.left_count, best_split_info.right_count, smaller_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf()); } } } diff --git a/src/treelearner/cuda_tree_learner.h b/src/treelearner/cuda_tree_learner.h index e8bc9d331f7..009e2471bb4 100644 --- a/src/treelearner/cuda_tree_learner.h +++ b/src/treelearner/cuda_tree_learner.h @@ -6,20 +6,22 @@ #ifndef LIGHTGBM_TREELEARNER_CUDA_TREE_LEARNER_H_ #define LIGHTGBM_TREELEARNET_CUDA_TREE_LEARNER_H_ -#include -#include -#include +#include +#include +#include +#include +#include + +#include #include +#include #include +#include +#include #ifdef USE_CUDA #include #endif -#include -#include -#include -#include -#include #include "feature_histogram.hpp" #include "serial_tree_learner.h" #include "data_partition.hpp" @@ -28,7 +30,7 @@ #ifdef USE_CUDA #include -#include "cuda_kernel_launcher.h" // LGBM_CUDA +#include "cuda_kernel_launcher.h" using json11::Json; @@ -75,24 +77,24 @@ class CUDATreeLearner: public SerialTreeLearner { /*! * \brief Initialize GPU device - * \LGBM_CUDA: param num_gpu: number of maximum gpus + * \param num_gpu: number of maximum gpus */ void InitGPU(); /*! - * \brief Allocate memory for GPU computation // LGBM_CUDA: alloc only + * \brief Allocate memory for GPU computation // alloc only */ void CountDenseFeatureGroups(); // compute num_dense_feature_group void prevAllocateGPUMemory(); // compute CPU-side param calculation & Pin HostMemory void AllocateGPUMemory(); /*! - * \ LGBM_CUDA: ResetGPUMemory + * \ ResetGPUMemory */ void ResetGPUMemory(); /*! - * \ LGBM_CUDA: copy dense feature from CPU to GPU + * \ copy dense feature from CPU to GPU */ void copyDenseFeature(); @@ -160,7 +162,6 @@ class CUDATreeLearner: public SerialTreeLearner { * Set hessians to nullptr to skip copy to GPU. * \return true if GPU kernel is launched, false if GPU is not used */ - // LGBM_CUDA v5.2 bool ConstructGPUHistogramsAsync( const std::vector& is_feature_used, const data_size_t* data_indices, data_size_t num_data); @@ -181,8 +182,8 @@ class CUDATreeLearner: public SerialTreeLearner { int num_feature_groups_; /*! \brief total number of dense feature-groups, which will be processed on GPU */ int num_dense_feature_groups_; - std::vector num_gpu_feature_groups_; // LGBM_CUDA - std::vector offset_gpu_feature_groups_; // LGBM_CUDA + std::vector num_gpu_feature_groups_; + std::vector offset_gpu_feature_groups_; /*! \brief On GPU we read one DWORD (4-byte) of features of one example once. * With bin size > 16, there are 4 features per DWORD. * With bin size <=16, there are 8 features per DWORD. @@ -203,66 +204,48 @@ class CUDATreeLearner: public SerialTreeLearner { std::vector dense_feature_group_map_; /*! \brief Indices of all sparse feature-groups */ std::vector sparse_feature_group_map_; - /*! \brief Multipliers of all dense feature-groups, used for redistributing bins */ - // std::vector device_bin_mults_; /*! \brief GPU memory object holding the training data */ - // uint8_t *device_features_; std::vector device_features_; /*! \brief GPU memory object holding the ordered gradient */ - // score_t *device_gradients_; std::vector device_gradients_; /*! \brief Pointer to pinned memory of ordered gradient */ void * ptr_pinned_gradients_ = nullptr; /*! \brief GPU memory object holding the ordered hessian */ - // score_t *device_hessians_; std::vector device_hessians_; /*! \brief Pointer to pinned memory of ordered hessian */ void * ptr_pinned_hessians_ = nullptr; /*! 
\brief A vector of feature mask. 1 = feature used, 0 = feature not used */ - // std::vector> feature_masks_; std::vector feature_masks_; /*! \brief GPU memory object holding the feature masks */ - // void *device_feature_masks_; std::vector device_feature_masks_; /*! \brief Pointer to pinned memory of feature masks */ char* ptr_pinned_feature_masks_ = nullptr; /*! \brief GPU memory object holding indices of the leaf being processed */ - // data_size_t *device_data_indices_; std::vector device_data_indices_; /*! \brief GPU memory object holding counters for workgroup coordination */ - // int *sync_counters_; std::vector sync_counters_; /*! \brief GPU memory object holding temporary sub-histograms per workgroup */ - // char *device_subhistograms_; std::vector device_subhistograms_; /*! \brief Host memory object for histogram output (GPU will write to Host memory directly) */ - // void *device_histogram_outputs_; std::vector device_histogram_outputs_; /*! \brief Host memory pointer for histogram outputs */ void *host_histogram_outputs_; - /*! \LGBM_CUDA: CUDA waitlist object for waiting for data transfer before kernel execution */ + /*! CUDA waitlist object for waiting for data transfer before kernel execution */ // cudaEvent_t kernel_wait_obj_; std::vector kernel_wait_obj_; - /*! \LGBM_CUDA: CUDA waitlist object for reading output histograms after kernel execution */ - // cudaEvent_t histograms_wait_obj_; + /*! CUDA waitlist object for reading output histograms after kernel execution */ std::vector histograms_wait_obj_; - /*! \LGBM_CUDA: CUDA Asynchronous waiting object for copying indices */ - // cudaEvent_t indices_future_; + /*! CUDA Asynchronous waiting object for copying indices */ std::vector indices_future_; - /*! \LGBM_CUDA: Asynchronous waiting object for copying gradients */ - // cudaEvent_t gradients_future_; + /*! Asynchronous waiting object for copying gradients */ std::vector gradients_future_; - /*! \LGBM_CUDA: Asynchronous waiting object for copying hessians */ - // cudaEvent_t hessians_future_; + /*! Asynchronous waiting object for copying hessians */ std::vector hessians_future_; - // LGBM_CUDA:\brief Asynchronous waiting object for copying dense features - // cudaEvent_t features_future_; + /*! 
Asynchronous waiting object for copying dense features */ std::vector features_future_; - // LGBM_CUDA: host-side buffer for converting feature data into featre4 data - // std::vector host_vecs_; + // host-side buffer for converting feature data into featre4 data int nthreads_; // number of Feature4* vector on host4_vecs_ - // cudaEvent_t kernel_start_; // event for kernel start std::vector kernel_start_; std::vector kernel_time_; // measure histogram kernel time std::vector> kernel_input_wait_time_; diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index 5c7cfdb4a9e..7ee72e5cda2 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -27,8 +27,7 @@ inline __device__ void atomic_local_add_f(acc_type *addr, const acc_type val) { #ifdef IGNORE_INDICES #define KERNEL_NAME histogram16_fulldata #else // IGNORE_INDICES -#define KERNEL_NAME histogram16 // seems like ENABLE_ALL_FEATURES is set to 1 in the header if its disabled -// #define KERNEL_NAME histogram16_allfeats +#define KERNEL_NAME histogram16 #endif // IGNORE_INDICES #else // ENABLE_ALL_FEATURES #error "ENABLE_ALL_FEATURES should always be 1" @@ -132,7 +131,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, } __syncthreads(); // gradient/hessian histograms - // assume this starts at 32 * 4 = 128-byte boundary // LGBM_CUDA: What does it mean? boundary?? + // assume this starts at 32 * 4 = 128-byte boundary // What does it mean? boundary?? // total size: 2 * 256 * size_of(float) = 2 KB // organization: each feature/grad/hessian is at a different bank, // as indepedent of the feature value as possible @@ -174,7 +173,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, if (!feature_mask) { return; } else { - feature_mask = feature_mask - 1; // LGBM_CUDA: feature_mask is used for get feature (1: 4bit feature, 0: 8bit feature) + feature_mask = feature_mask - 1; // feature_mask is used for get feature (1: 4bit feature, 0: 8bit feature) } // STAGE 1: read feature data, and gradient and hessian @@ -182,7 +181,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // We will prefetch data into the "next" variable at the beginning of each iteration uchar feature; uchar feature_next; - // uint8_t bin; uint16_t bin; feature = feature_data[ind >> feature_mask]; @@ -196,7 +194,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // store gradient and hessian score_t grad, hess; score_t grad_next, hess_next; - // LGBM_CUDA v5.1 grad = ordered_gradients[ind]; #if CONST_HESSIAN == 0 hess = ordered_hessians[ind]; @@ -214,7 +211,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, ind_next = data_indices[i_next]; #endif - // imbGBT v5.1 grad_next = ordered_gradients[ind_next]; #if CONST_HESSIAN == 0 hess_next = ordered_hessians[ind_next]; @@ -252,7 +248,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // STAGE 4: update next stat grad = grad_next; hess = hess_next; - // LGBM_CUDA: v4.2 if (!feature_mask) { feature = feature_next; } else { @@ -278,7 +273,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #if CONST_HESSIAN == 1 // make a final reduction gh_hist[ltid * 2] += gh_hist[ltid * 2 + 1]; - gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // LGBM_CUDA: counter move to this position + gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // counter move to this position __syncthreads(); #endif @@ -308,7 +303,7 @@ __global__ void 
KERNEL_NAME(const uchar* feature_data_base, } // make sure everyone in this workgroup is here __syncthreads(); - // everyone in this wrokgroup: if we are the last workgroup, then do reduction! + // everyone in this workgroup: if we are the last workgroup, then do reduction! if (*counter_val == (1 << power_feature_workgroups) - 1) { if (ltid == 0) { sync_counters[feature_id] = 0; @@ -318,7 +313,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, } // only 1 work group, no need to increase counter // the reduction will become a simple copy - if (1) { + { unsigned int old_val; // dummy #endif // locate our feature's block in output memory @@ -450,7 +445,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, } __syncthreads(); // gradient/hessian histograms - // assume this starts at 32 * 4 = 128-byte boundary // LGBM_CUDA: What does it mean? boundary?? + // assume this starts at 32 * 4 = 128-byte boundary // What does it mean? boundary?? // total size: 2 * 256 * size_of(float) = 2 KB // organization: each feature/grad/hessian is at a different bank, // as indepedent of the feature value as possible @@ -491,7 +486,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, if (!feature_mask) { return; } else { - feature_mask = feature_mask - 1; // LGBM_CUDA: feature_mask is used for get feature (1: 4bit feature, 0: 8bit feature) + feature_mask = feature_mask - 1; // feature_mask is used for get feature (1: 4bit feature, 0: 8bit feature) } // STAGE 1: read feature data, and gradient and hessian @@ -499,7 +494,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // We will prefetch data into the "next" variable at the beginning of each iteration uchar feature; uchar feature_next; - // uint8_t bin; uint16_t bin; feature = feature_data[ind >> feature_mask]; @@ -513,7 +507,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // store gradient and hessian score_t grad, hess; score_t grad_next, hess_next; - // LGBM_CUDA v5.1 grad = ordered_gradients[ind]; #if CONST_HESSIAN == 0 hess = ordered_hessians[ind]; @@ -531,7 +524,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, ind_next = data_indices[i_next]; #endif - // imbGBT v5.1 grad_next = ordered_gradients[ind_next]; #if CONST_HESSIAN == 0 hess_next = ordered_hessians[ind_next]; @@ -569,7 +561,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // STAGE 4: update next stat grad = grad_next; hess = hess_next; - // LGBM_CUDA: v4.2 if (!feature_mask) { feature = feature_next; } else { @@ -594,7 +585,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #if CONST_HESSIAN == 1 // make a final reduction gh_hist[ltid * 2] += gh_hist[ltid * 2 + 1]; - gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // LGBM_CUDA: counter move to this position + gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // counter move to this position __syncthreads(); #endif @@ -624,7 +615,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, } // make sure everyone in this workgroup is here __syncthreads(); - // everyone in this wrokgroup: if we are the last workgroup, then do reduction! + // everyone in this workgroup: if we are the last workgroup, then do reduction! 
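     // note: counter_val reflects how many workgroups for this feature have already finished; the workgroup
     // that observes (1 << power_feature_workgroups) - 1 is the last one and performs the cross-workgroup
     // reduction below before resetting sync_counters[feature_id]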
if (*counter_val == (1 << power_feature_workgroups) - 1) { if (ltid == 0) { sync_counters[feature_id] = 0; @@ -634,7 +625,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, } // only 1 work group, no need to increase counter // the reduction will become a simple copy - if (1) { + { unsigned int old_val; // dummy #endif // locate our feature's block in output memory @@ -767,7 +758,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, } __syncthreads(); // gradient/hessian histograms - // assume this starts at 32 * 4 = 128-byte boundary // LGBM_CUDA: What does it mean? boundary?? + // assume this starts at 32 * 4 = 128-byte boundary // What does it mean? boundary?? // total size: 2 * 256 * size_of(float) = 2 KB // organization: each feature/grad/hessian is at a different bank, // as indepedent of the feature value as possible @@ -808,7 +799,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, if (!feature_mask) { return; } else { - feature_mask = feature_mask - 1; // LGBM_CUDA: feature_mask is used for get feature (1: 4bit feature, 0: 8bit feature) + feature_mask = feature_mask - 1; // feature_mask is used for get feature (1: 4bit feature, 0: 8bit feature) } // STAGE 1: read feature data, and gradient and hessian @@ -816,7 +807,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // We will prefetch data into the "next" variable at the beginning of each iteration uchar feature; uchar feature_next; - // uint8_t bin; uint16_t bin; feature = feature_data[ind >> feature_mask]; @@ -830,7 +820,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // store gradient and hessian score_t grad, hess; score_t grad_next, hess_next; - // LGBM_CUDA v5.1 grad = ordered_gradients[ind]; #if CONST_HESSIAN == 0 hess = ordered_hessians[ind]; @@ -848,7 +837,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, ind_next = data_indices[i_next]; #endif - // imbGBT v5.1 grad_next = ordered_gradients[ind_next]; #if CONST_HESSIAN == 0 hess_next = ordered_hessians[ind_next]; @@ -885,7 +873,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, // STAGE 4: update next stat grad = grad_next; hess = hess_next; - // LGBM_CUDA: v4.2 if (!feature_mask) { feature = feature_next; } else { @@ -911,7 +898,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, #if CONST_HESSIAN == 1 // make a final reduction gh_hist[ltid * 2] += gh_hist[ltid * 2 + 1]; - gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // LGBM_CUDA: counter move to this position + gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // counter move to this position __syncthreads(); #endif @@ -941,7 +928,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, } // make sure everyone in this workgroup is here __syncthreads(); - // everyone in this wrokgroup: if we are the last workgroup, then do reduction! + // everyone in this workgroup: if we are the last workgroup, then do reduction! 
if (*counter_val == (1 << power_feature_workgroups) - 1) { if (ltid == 0) { sync_counters[feature_id] = 0; @@ -951,7 +938,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, } // only 1 work group, no need to increase counter // the reduction will become a simple copy - if (1) { + { unsigned int old_val; // dummy #endif // locate our feature's block in output memory diff --git a/src/treelearner/parallel_tree_learner.h b/src/treelearner/parallel_tree_learner.h index 4bb62d203a9..2001f2e0dfe 100644 --- a/src/treelearner/parallel_tree_learner.h +++ b/src/treelearner/parallel_tree_learner.h @@ -12,9 +12,9 @@ #include #include +#include "cuda_tree_learner.h" #include "gpu_tree_learner.h" #include "serial_tree_learner.h" -#include "cuda_tree_learner.h" namespace LightGBM { diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index fc1de33e365..59ba770fb95 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -8,10 +8,10 @@ #include #include #include +#include #include #include #include -#include #include #include diff --git a/src/treelearner/tree_learner.cpp b/src/treelearner/tree_learner.cpp index 63ca1b2de83..ab009a0b100 100644 --- a/src/treelearner/tree_learner.cpp +++ b/src/treelearner/tree_learner.cpp @@ -4,8 +4,8 @@ */ #include -#include "gpu_tree_learner.h" #include "cuda_tree_learner.h" +#include "gpu_tree_learner.h" #include "parallel_tree_learner.h" #include "serial_tree_learner.h" From a751bea68c3b40fedef3397f4e6942c01679744a Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Thu, 27 Aug 2020 12:32:39 +0000 Subject: [PATCH 114/119] Replace warning with Log message. Removal of some of the USE_CUDA. Fix typo and removal of pragma once. --- include/LightGBM/cuda/vector_cudahost.h | 6 +++--- src/application/application.cpp | 2 -- src/boosting/gbdt.cpp | 4 ---- src/io/config.cpp | 2 -- src/io/dataset.cpp | 2 -- src/treelearner/cuda_tree_learner.h | 3 +-- 6 files changed, 4 insertions(+), 15 deletions(-) diff --git a/include/LightGBM/cuda/vector_cudahost.h b/include/LightGBM/cuda/vector_cudahost.h index 60a82cc8391..0f2cf8081e5 100644 --- a/include/LightGBM/cuda/vector_cudahost.h +++ b/include/LightGBM/cuda/vector_cudahost.h @@ -33,8 +33,6 @@ class LGBM_config_ { static int current_learner; // Default: use_cpu_learner }; -} // namespace LightGBM - template struct CHAllocator { @@ -48,7 +46,7 @@ struct CHAllocator { if (LightGBM::LGBM_config_::current_device == lgbm_device_cuda) { cudaError_t ret = cudaHostAlloc(&ptr, n*sizeof(T), cudaHostAllocPortable); if (ret != cudaSuccess) { - fprintf(stderr, " TROUBLE: defaulting to malloc in CHAllocator!!!\n"); fflush(stderr); + Log::Warning("Defaulting to malloc in CHAllocator!!!"); ptr = reinterpret_cast(_mm_malloc(n*sizeof(T), 16)); } } else { @@ -83,4 +81,6 @@ bool operator==(const CHAllocator&, const CHAllocator&); template bool operator!=(const CHAllocator&, const CHAllocator&); +} // namespace LightGBM + #endif // LIGHTGBM_CUDA_VECTOR_CUDAHOST_H_ diff --git a/src/application/application.cpp b/src/application/application.cpp index c62cdd711e0..43ba033881e 100644 --- a/src/application/application.cpp +++ b/src/application/application.cpp @@ -40,11 +40,9 @@ Application::Application(int argc, char** argv) { Log::Fatal("No training/prediction data, application quit"); } -#ifdef USE_CUDA if (config_.device_type == std::string("cuda")) { LightGBM::LGBM_config_::current_device = lgbm_device_cuda; } -#endif } Application::~Application() { diff --git 
a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 2c9fb3b734e..fcb7185a151 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -17,10 +17,8 @@ namespace LightGBM { -#ifdef USE_CUDA int LGBM_config_::current_device = lgbm_device_cpu; int LGBM_config_::current_learner = use_cpu_learner; -#endif GBDT::GBDT() : iter_(0), @@ -63,11 +61,9 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective es_first_metric_only_ = config_->first_metric_only; shrinkage_rate_ = config_->learning_rate; -#ifdef USE_CUDA if (config_->device_type == std::string("cuda")) { LGBM_config_::current_learner = use_cuda_learner; } -#endif // load forced_splits file if (!config->forcedsplits_filename.empty()) { diff --git a/src/io/config.cpp b/src/io/config.cpp index f0a9544e3b9..6e1872d0e76 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -209,11 +209,9 @@ void Config::Set(const std::unordered_map& params) { GetMetricType(params, &metric); GetObjectiveType(params, &objective); GetDeviceType(params, &device_type); -#ifdef USE_CUDA if (device_type == std::string("cuda")) { LightGBM::LGBM_config_::current_device = lgbm_device_cuda; } -#endif GetTreeLearnerType(params, &tree_learner); GetMembersFromString(params); diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 2d9693bc695..90ba6a0eb58 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -337,7 +337,6 @@ void Dataset::Construct(std::vector>* bin_mappers, auto features_in_group = NoGroup(used_features); auto is_sparse = io_config.is_enable_sparse; -#ifdef USE_CUDA if (io_config.device_type == std::string("cuda")) { LightGBM::LGBM_config_::current_device = lgbm_device_cuda; if (is_sparse) { @@ -345,7 +344,6 @@ void Dataset::Construct(std::vector>* bin_mappers, } is_sparse = false; } -#endif std::vector group_is_multi_val(used_features.size(), 0); if (io_config.enable_bundle && !used_features.empty()) { diff --git a/src/treelearner/cuda_tree_learner.h b/src/treelearner/cuda_tree_learner.h index 009e2471bb4..b1fae65f5d1 100644 --- a/src/treelearner/cuda_tree_learner.h +++ b/src/treelearner/cuda_tree_learner.h @@ -2,9 +2,8 @@ * Copyright (c) 2020 IBM Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ -#pragma once #ifndef LIGHTGBM_TREELEARNER_CUDA_TREE_LEARNER_H_ -#define LIGHTGBM_TREELEARNET_CUDA_TREE_LEARNER_H_ +#define LIGHTGBM_TREELEARNER_CUDA_TREE_LEARNER_H_ #include #include From 15eec67a3024786a59fbe6fea29c9283314aa81f Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Thu, 27 Aug 2020 13:03:13 +0000 Subject: [PATCH 115/119] Remove PRINT debug for CUDA code. --- src/treelearner/kernels/histogram_16_64_256.cu | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu index 7ee72e5cda2..ccb399f4ecb 100644 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ b/src/treelearner/kernels/histogram_16_64_256.cu @@ -12,11 +12,6 @@ namespace LightGBM { -#define PRINT(b, t, fmt, ...) \ -if (b == gtid && t == ltid) { \ - printf(fmt, __VA_ARGS__); \ -} - // atomic add for float number in local memory inline __device__ void atomic_local_add_f(acc_type *addr, const acc_type val) { atomicAdd(addr, static_cast(val)); From 1884dc20d31fbacf263aba3d6002d7d0f1974d0f Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 31 Aug 2020 11:41:48 +0000 Subject: [PATCH 116/119] Allow to use of multiple GPUs for CUDA. 
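
This drops the hard cap that forced num_gpu_ back to a single device. The per-device split of dense
feature groups is already handled in prevAllocateGPUMemory(); roughly (a sketch of the existing logic,
not new code):

    int num_features_per_gpu = num_dense_feature_groups_ / num_gpu_;
    // the first (num_dense_feature_groups_ % num_gpu_) devices each take one extra feature group;
    // each device is then selected with cudaSetDevice(device_id) before its allocations and copies

With the cap removed, that partitioning can spread histogram construction across every visible device
reported by cudaGetDeviceCount().
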
--- src/treelearner/cuda_tree_learner.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 813f99d4ba8..12aa722e1c9 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -481,10 +481,6 @@ void CUDATreeLearner::InitGPU() { // initialize GPU CUDASUCCESS_OR_FATAL(cudaGetDeviceCount(&num_gpu_)); - if (num_gpu_ > 1) { - Log::Warning("CUDA doesn't support more than one GPU currently."); - num_gpu_ = 1; - } if (num_gpu_ > num_dense_feature_groups_) num_gpu_ = num_dense_feature_groups_; // set cpu threads From 32f3a8dcea5efcc9e009918db1d5c75392e254ae Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Wed, 2 Sep 2020 13:52:41 +0000 Subject: [PATCH 117/119] More multi-GPUs enablement for CUDA. --- docs/Parameters.rst | 4 ++++ include/LightGBM/config.h | 4 ++++ src/io/config_auto.cpp | 5 +++++ src/treelearner/cuda_tree_learner.cpp | 12 ++++++++---- src/treelearner/cuda_tree_learner.h | 2 +- 5 files changed, 22 insertions(+), 5 deletions(-) diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 14d7a8098cf..4cbfed49104 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -1122,6 +1122,10 @@ GPU Parameters - set this to ``true`` to use double precision math on GPU (by default single precision is used) +- ``num_gpu`` :raw-html:`🔗︎`, default = ``1``, type = int, constraints: ``num_gpu > 0`` + + - number of gpus (CUDA implementation only) + .. end params list Others diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index bfcb09a4004..25447abaf1a 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -968,6 +968,10 @@ struct Config { // desc = set this to ``true`` to use double precision math on GPU (by default single precision is used) bool gpu_use_dp = false; + // check = >0 + // desc = number of gpus (CUDA implementation only) + int num_gpu = 1; + #pragma endregion #pragma endregion diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index b14af67fd30..ad102020322 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -296,6 +296,7 @@ const std::unordered_set& Config::parameter_set() { "gpu_platform_id", "gpu_device_id", "gpu_use_dp", + "num_gpu", }); return params; } @@ -611,6 +612,9 @@ void Config::GetMembersFromString(const std::unordered_mapnum_feature_groups(); // Initialize GPU buffers and kernels: get device info - InitGPU(); + InitGPU(config_->num_gpu); } // some functions used for debugging the GPU histogram construction @@ -433,7 +433,7 @@ void CUDATreeLearner::copyDenseFeature() { // InitGPU w/ num_gpu -void CUDATreeLearner::InitGPU() { +void CUDATreeLearner::InitGPU(int num_gpu) { // Get the max bin size, used for selecting best GPU kernel max_num_bin_ = 0; @@ -479,9 +479,13 @@ void CUDATreeLearner::InitGPU() { // get num_dense_feature_groups_ CountDenseFeatureGroups(); + if (num_gpu > num_dense_feature_groups_) num_gpu = num_dense_feature_groups_; + // initialize GPU - CUDASUCCESS_OR_FATAL(cudaGetDeviceCount(&num_gpu_)); - if (num_gpu_ > num_dense_feature_groups_) num_gpu_ = num_dense_feature_groups_; + int gpu_count; + + CUDASUCCESS_OR_FATAL(cudaGetDeviceCount(&gpu_count)); + num_gpu_ = (gpu_count < num_gpu)? 
gpu_count : num_gpu; // set cpu threads cpu_threads_ = reinterpret_cast(_mm_malloc(sizeof(pthread_t *)*num_gpu_, 16)); diff --git a/src/treelearner/cuda_tree_learner.h b/src/treelearner/cuda_tree_learner.h index b1fae65f5d1..17c7d13d8a0 100644 --- a/src/treelearner/cuda_tree_learner.h +++ b/src/treelearner/cuda_tree_learner.h @@ -78,7 +78,7 @@ class CUDATreeLearner: public SerialTreeLearner { * \brief Initialize GPU device * \param num_gpu: number of maximum gpus */ - void InitGPU(); + void InitGPU(int num_gpu); /*! * \brief Allocate memory for GPU computation // alloc only From ea537f88e1c2f0f3245c8fc31e34ae62e2621c7e Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 14 Sep 2020 12:39:58 +0000 Subject: [PATCH 118/119] More code cleanup based on reviews comments. --- CMakeLists.txt | 4 ++-- include/LightGBM/config.h | 5 +++-- include/LightGBM/cuda/cuda_utils.h | 2 +- include/LightGBM/cuda/vector_cudahost.h | 4 ++-- src/application/application.cpp | 2 +- src/io/config.cpp | 2 +- src/io/dataset.cpp | 2 +- src/treelearner/cuda_tree_learner.cpp | 4 ++-- src/treelearner/cuda_tree_learner.h | 10 ---------- src/treelearner/kernels/histogram_16_64_256.cu | 6 ------ 10 files changed, 13 insertions(+), 28 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 79870e8c54d..b2e206fe5fd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -168,8 +168,8 @@ if(USE_CUDA) -DIGNORE_INDICES ) - message(STATUS ALLFEATS_DEFINES: ${ALLFEATS_DEFINES}) - message(STATUS FULLDATA_DEFINES: ${FULLDATA_DEFINES}) + message(STATUS "ALLFEATS_DEFINES: ${ALLFEATS_DEFINES}") + message(STATUS "FULLDATA_DEFINES: ${FULLDATA_DEFINES}") function(add_histogram hsize hname hadd hconst hdir) add_library(histo${hsize}${hname} OBJECT src/treelearner/kernels/histogram${hsize}.cu) diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 25447abaf1a..5e190261390 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -965,11 +965,12 @@ struct Config { // desc = **Note**: refer to `GPU Targets <./GPU-Targets.rst#query-opencl-devices-in-your-system>`__ for more details int gpu_device_id = -1; - // desc = set this to ``true`` to use double precision math on GPU (by default single precision is used) + // desc = set this to ``true`` to use double precision math on GPU (by default single precision is used in OpenCL implementation and double precision is used in CUDA implementation) bool gpu_use_dp = false; // check = >0 - // desc = number of gpus (CUDA implementation only) + // desc = number of GPUs + // desc = **Note**: can be used only in CUDA implementation int num_gpu = 1; #pragma endregion diff --git a/include/LightGBM/cuda/cuda_utils.h b/include/LightGBM/cuda/cuda_utils.h index b94b12d1c92..1054e09daf1 100644 --- a/include/LightGBM/cuda/cuda_utils.h +++ b/include/LightGBM/cuda/cuda_utils.h @@ -14,7 +14,7 @@ #define CUDASUCCESS_OR_FATAL(ans) { gpuAssert((ans), __FILE__, __LINE__); } inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) { if (code != cudaSuccess) { - LightGBM::Log::Fatal("CUDA_RUNTIME: %s %s %d\n", cudaGetErrorString(code), file, line); + LightGBM::Log::Fatal("[CUDA] %s %s %d\n", cudaGetErrorString(code), file, line); if (abort) exit(code); } } diff --git a/include/LightGBM/cuda/vector_cudahost.h b/include/LightGBM/cuda/vector_cudahost.h index 0f2cf8081e5..f81cc4dd905 100644 --- a/include/LightGBM/cuda/vector_cudahost.h +++ b/include/LightGBM/cuda/vector_cudahost.h @@ -43,7 +43,7 @@ struct CHAllocator { T* ptr; if (n == 0) return NULL; #ifdef 
USE_CUDA - if (LightGBM::LGBM_config_::current_device == lgbm_device_cuda) { + if (LGBM_config_::current_device == lgbm_device_cuda) { cudaError_t ret = cudaHostAlloc(&ptr, n*sizeof(T), cudaHostAllocPortable); if (ret != cudaSuccess) { Log::Warning("Defaulting to malloc in CHAllocator!!!"); @@ -62,7 +62,7 @@ struct CHAllocator { (void)n; // UNUSED if (p == NULL) return; #ifdef USE_CUDA - if (LightGBM::LGBM_config_::current_device == lgbm_device_cuda) { + if (LGBM_config_::current_device == lgbm_device_cuda) { cudaPointerAttributes attributes; cudaPointerGetAttributes(&attributes, p); if ((attributes.type == cudaMemoryTypeHost) && (attributes.devicePointer != NULL)) { diff --git a/src/application/application.cpp b/src/application/application.cpp index 43ba033881e..d9be76d67c9 100644 --- a/src/application/application.cpp +++ b/src/application/application.cpp @@ -41,7 +41,7 @@ Application::Application(int argc, char** argv) { } if (config_.device_type == std::string("cuda")) { - LightGBM::LGBM_config_::current_device = lgbm_device_cuda; + LGBM_config_::current_device = lgbm_device_cuda; } } diff --git a/src/io/config.cpp b/src/io/config.cpp index 6e1872d0e76..6878896deb5 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -210,7 +210,7 @@ void Config::Set(const std::unordered_map& params) { GetObjectiveType(params, &objective); GetDeviceType(params, &device_type); if (device_type == std::string("cuda")) { - LightGBM::LGBM_config_::current_device = lgbm_device_cuda; + LGBM_config_::current_device = lgbm_device_cuda; } GetTreeLearnerType(params, &tree_learner); diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 90ba6a0eb58..fce7cfa2bb2 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -338,7 +338,7 @@ void Dataset::Construct(std::vector>* bin_mappers, auto is_sparse = io_config.is_enable_sparse; if (io_config.device_type == std::string("cuda")) { - LightGBM::LGBM_config_::current_device = lgbm_device_cuda; + LGBM_config_::current_device = lgbm_device_cuda; if (is_sparse) { Log::Warning("Using sparse features with CUDA is currently not supported."); } diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 314494f4ef3..16569eef257 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -309,7 +309,7 @@ void CUDATreeLearner::prevAllocateGPUMemory() { // histogram bin entry size depends on the precision (single/double) hist_bin_entry_sz_ = 2 * (config_->gpu_use_dp ? sizeof(hist_t) : sizeof(gpu_hist_t)); // two elements in this "size" - CUDASUCCESS_OR_FATAL(cudaHostAlloc(reinterpret_cast(&host_histogram_outputs_), (size_t)(num_dense_feature_groups_ * device_bin_size_ * hist_bin_entry_sz_), cudaHostAllocPortable)); + CUDASUCCESS_OR_FATAL(cudaHostAlloc(reinterpret_cast(&host_histogram_outputs_), static_cast(num_dense_feature_groups_ * device_bin_size_ * hist_bin_entry_sz_), cudaHostAllocPortable)); nthreads_ = std::min(omp_get_max_threads(), num_dense_feature_groups_ / dword_features_); nthreads_ = std::max(nthreads_, 1); @@ -485,7 +485,7 @@ void CUDATreeLearner::InitGPU(int num_gpu) { int gpu_count; CUDASUCCESS_OR_FATAL(cudaGetDeviceCount(&gpu_count)); - num_gpu_ = (gpu_count < num_gpu)? gpu_count : num_gpu; + num_gpu_ = (gpu_count < num_gpu) ? 
diff --git a/src/treelearner/cuda_tree_learner.h b/src/treelearner/cuda_tree_learner.h
index 17c7d13d8a0..442c2f53ea0 100644
--- a/src/treelearner/cuda_tree_learner.h
+++ b/src/treelearner/cuda_tree_learner.h
@@ -153,12 +153,6 @@ class CUDATreeLearner: public SerialTreeLearner {
    * \param data_indices Array of data example IDs to be included in histogram, will be copied to GPU.
    *        Set to nullptr to skip copy to GPU.
    * \param num_data Number of data examples to be included in histogram
-   * \param gradients Array of gradients for all examples.
-   * \param hessians Array of hessians for all examples.
-   * \param ordered_gradients Ordered gradients will be generated and copied to GPU when gradients is not nullptr,
-   *        Set gradients to nullptr to skip copy to GPU.
-   * \param ordered_hessians Ordered hessians will be generated and copied to GPU when hessians is not nullptr,
-   *        Set hessians to nullptr to skip copy to GPU.
    * \return true if GPU kernel is launched, false if GPU is not used
    */
   bool ConstructGPUHistogramsAsync(
@@ -188,9 +182,6 @@ class CUDATreeLearner: public SerialTreeLearner {
    *  With bin size <=16, there are 8 features per DWORD.
    */
   int dword_features_;
-  /*! \brief total number of dense feature-group tuples on GPU.
-   *  Each feature tuple is 4-byte (4 features if each feature takes a byte) */
-  // int num_dense_feature4_;
   /*! \brief Max number of bins of training data, used to determine
    *  which GPU kernel to use */
   int max_num_bin_;
@@ -230,7 +221,6 @@ class CUDATreeLearner: public SerialTreeLearner {
   /*! \brief Host memory pointer for histogram outputs */
   void *host_histogram_outputs_;
   /*! CUDA waitlist object for waiting for data transfer before kernel execution */
-  // cudaEvent_t kernel_wait_obj_;
   std::vector kernel_wait_obj_;
   /*! CUDA waitlist object for reading output histograms after kernel execution */
   std::vector histograms_wait_obj_;
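
The kernel_wait_obj_ and histograms_wait_obj_ members kept above are, per their comments, per-device CUDA event lists: one set gates the histogram kernel on the input transfer, the other gates the read-back on kernel completion. A rough sketch of that ordering pattern with plain CUDA events and two streams follows; every name in it is a placeholder rather than a LightGBM symbol, and the kernel launch itself is elided.

    #include <cstddef>
    #include <cuda_runtime.h>

    // Order an async H2D copy, a kernel, and an async D2H read-back with events.
    void OrderCopyKernelReadback(cudaStream_t copy_stream, cudaStream_t exec_stream,
                                 void* d_in, const void* h_in, std::size_t in_bytes,
                                 void* h_out, const void* d_out, std::size_t out_bytes) {
      cudaEvent_t input_ready, kernel_done;
      cudaEventCreateWithFlags(&input_ready, cudaEventDisableTiming);
      cudaEventCreateWithFlags(&kernel_done, cudaEventDisableTiming);

      // Stage the input asynchronously and mark the moment it becomes visible.
      cudaMemcpyAsync(d_in, h_in, in_bytes, cudaMemcpyHostToDevice, copy_stream);
      cudaEventRecord(input_ready, copy_stream);

      // The execution stream waits for the transfer before the kernel runs.
      cudaStreamWaitEvent(exec_stream, input_ready, 0);
      // histogram_kernel<<<grid, block, 0, exec_stream>>>(...);  // elided

      // Read the histograms back only after the kernel has finished.
      cudaEventRecord(kernel_done, exec_stream);
      cudaStreamWaitEvent(copy_stream, kernel_done, 0);
      cudaMemcpyAsync(h_out, d_out, out_bytes, cudaMemcpyDeviceToHost, copy_stream);

      cudaEventDestroy(input_ready);
      cudaEventDestroy(kernel_done);
    }
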
diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu
index ccb399f4ecb..105ccbb6203 100644
--- a/src/treelearner/kernels/histogram_16_64_256.cu
+++ b/src/treelearner/kernels/histogram_16_64_256.cu
@@ -303,7 +303,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base,
       if (ltid == 0) {
         sync_counters[feature_id] = 0;
       }
-      // }
     #else
     }
     // only 1 work group, no need to increase counter
@@ -316,7 +315,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base,
   acc_type const * __restrict__ feature_subhists =
            reinterpret_cast(output_buf) + output_offset * 3 * NUM_BINS;
   // skip reading the data already in local memory
-  // unsigned int skip_id = feature_id ^ output_offset;
   unsigned int skip_id = group_id - output_offset;
   // locate output histogram location for this feature4
   acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS;
@@ -615,7 +613,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base,
       if (ltid == 0) {
         sync_counters[feature_id] = 0;
       }
-      // }
     #else
     }
     // only 1 work group, no need to increase counter
@@ -628,7 +625,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base,
   acc_type const * __restrict__ feature_subhists =
            reinterpret_cast(output_buf) + output_offset * 3 * NUM_BINS;
   // skip reading the data already in local memory
-  // unsigned int skip_id = feature_id ^ output_offset;
   unsigned int skip_id = group_id - output_offset;
   // locate output histogram location for this feature4
   acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS;
@@ -928,7 +924,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base,
       if (ltid == 0) {
         sync_counters[feature_id] = 0;
       }
-      // }
     #else
     }
     // only 1 work group, no need to increase counter
@@ -941,7 +936,6 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base,
   acc_type const * __restrict__ feature_subhists =
            reinterpret_cast(output_buf) + output_offset * 3 * NUM_BINS;
   // skip reading the data already in local memory
-  // unsigned int skip_id = feature_id ^ output_offset;
   unsigned int skip_id = group_id - output_offset;
   // locate output histogram location for this feature4
   acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS;

From d9e9d2e0df9f75aed79af0b88bf24a9300479dd6 Mon Sep 17 00:00:00 2001
From: Chip-Kerchner
Date: Mon, 14 Sep 2020 13:54:26 +0000
Subject: [PATCH 119/119] Update docs with latest config changes.

---
 docs/Parameters.rst | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/docs/Parameters.rst b/docs/Parameters.rst
index 4cbfed49104..dcd1353e152 100644
--- a/docs/Parameters.rst
+++ b/docs/Parameters.rst
@@ -1120,11 +1120,13 @@ GPU Parameters
 
 -  ``gpu_use_dp`` :raw-html:`🔗︎`, default = ``false``, type = bool
 
-   - set this to ``true`` to use double precision math on GPU (by default single precision is used)
+   - set this to ``true`` to use double precision math on GPU (by default single precision is used in OpenCL implementation and double precision is used in CUDA implementation)
 
 -  ``num_gpu`` :raw-html:`🔗︎`, default = ``1``, type = int, constraints: ``num_gpu > 0``
 
-   - number of gpus (CUDA implementation only)
+   - number of GPUs
+
+   - **Note**: can be used only in CUDA implementation
 
 .. end params list
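
Putting the documented parameters together: assuming a binary built with the new USE_CUDA CMake option, a CLI training config exercising the CUDA tree learner might look like the sketch below. The task, objective, and data values are placeholders for illustration, not part of this patch.

    # sketch of a LightGBM CLI config using the CUDA device (placeholder task/objective/data)
    task = train
    objective = regression
    data = train.txt
    device_type = cuda   # requires a build configured with USE_CUDA
    num_gpu = 2          # must be > 0; clamped to the number of CUDA devices present
    gpu_use_dp = false   # see the precision note above for OpenCL vs. CUDA defaults
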