From b4b97e0e387f1e23cd01d77a99e6933607c436a6 Mon Sep 17 00:00:00 2001
From: fis
Date: Sun, 4 Jul 2021 15:51:00 +0800
Subject: [PATCH] Remove lz4 compression with external memory.

---
 .github/workflows/main.yml           |   4 +-
 CMakeLists.txt                       |   4 +-
 Jenkinsfile                          |   2 +-
 plugin/CMakeLists.txt                |  10 -
 plugin/lz4/sparse_page_lz4_format.cc | 341 ---------------------------
 python-package/setup.py              |   2 -
 tests/ci_build/Dockerfile.cpu        |   2 +-
 tests/ci_build/Dockerfile.gpu_build  |   2 +-
 tests/travis/setup.sh                |   2 +-
 9 files changed, 9 insertions(+), 360 deletions(-)
 delete mode 100644 plugin/lz4/sparse_page_lz4_format.cc

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 81989e2ee2b4..15d8fafc3b14 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -21,12 +21,12 @@ jobs:
         submodules: 'true'
     - name: Install system packages
       run: |
-        brew install lz4 ninja libomp
+        brew install ninja libomp
     - name: Build gtest binary
       run: |
         mkdir build
         cd build
-        cmake .. -DGOOGLE_TEST=ON -DUSE_OPENMP=ON -DUSE_DMLC_GTEST=ON -DPLUGIN_LZ4=ON -DPLUGIN_DENSE_PARSER=ON -GNinja
+        cmake .. -DGOOGLE_TEST=ON -DUSE_OPENMP=ON -DUSE_DMLC_GTEST=ON -DPLUGIN_DENSE_PARSER=ON -GNinja
         ninja -v
     - name: Run gtest binary
       run: |
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3f9a80bc463a..c63b83cf37cc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -62,7 +62,6 @@ set(ENABLED_SANITIZERS "address" "leak" CACHE STRING
     "Semicolon separated list of sanitizer names. E.g 'address;leak'.
 Supported sanitizers are address, leak, undefined and thread.")
 ## Plugins
-option(PLUGIN_LZ4 "Build lz4 plugin" OFF)
 option(PLUGIN_DENSE_PARSER "Build dense parser plugin" OFF)
 option(PLUGIN_RMM "Build with RAPIDS Memory Manager (RMM)" OFF)
 ## TODO: 1. Add check if DPC++ compiler is used for building
@@ -92,6 +91,9 @@ endif (R_LIB AND GOOGLE_TEST)
 if (USE_AVX)
   message(SEND_ERROR "The option 'USE_AVX' is deprecated as experimental AVX features have been removed from XGBoost.")
 endif (USE_AVX)
+if (PLUGIN_LZ4)
+  message(SEND_ERROR "The option 'PLUGIN_LZ4' has been removed from XGBoost.")
+endif (PLUGIN_LZ4)
 if (PLUGIN_RMM AND NOT (USE_CUDA))
   message(SEND_ERROR "`PLUGIN_RMM` must be enabled with `USE_CUDA` flag.")
 endif (PLUGIN_RMM AND NOT (USE_CUDA))
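With the guard above, a build script or stale CMake cache that still passes the retired flag, for example "cmake .. -DPLUGIN_LZ4=ON", now fails at configure time with an explicit message instead of silently accepting a no-op option. SEND_ERROR, also used by the neighbouring USE_AVX guard, reports the error but lets CMake finish processing the file, so every such configuration error surfaces in a single pass; FATAL_ERROR would abort at the first one.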
diff --git a/Jenkinsfile b/Jenkinsfile
index b623f073b80e..2ec0d8a50535 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -147,7 +147,7 @@ def BuildCPU() {
     # This step is not necessary, but here we include it, to ensure that DMLC_CORE_USE_CMAKE flag is correctly propagated
     # We want to make sure that we use the configured header build/dmlc/build_config.h instead of include/dmlc/build_config_default.h.
     # See discussion at https://github.com/dmlc/xgboost/issues/5510
-    ${dockerRun} ${container_type} ${docker_binary} tests/ci_build/build_via_cmake.sh -DPLUGIN_LZ4=ON -DPLUGIN_DENSE_PARSER=ON
+    ${dockerRun} ${container_type} ${docker_binary} tests/ci_build/build_via_cmake.sh -DPLUGIN_DENSE_PARSER=ON
     ${dockerRun} ${container_type} ${docker_binary} bash -c "cd build && ctest --extra-verbose"
     """
     // Sanitizer test
diff --git a/plugin/CMakeLists.txt b/plugin/CMakeLists.txt
index 5e1b412885d8..97d1af190cfe 100644
--- a/plugin/CMakeLists.txt
+++ b/plugin/CMakeLists.txt
@@ -1,13 +1,3 @@
-if (PLUGIN_LZ4)
-  target_sources(objxgboost PRIVATE ${xgboost_SOURCE_DIR}/plugin/lz4/sparse_page_lz4_format.cc)
-  find_path(LIBLZ4_INCLUDE_DIR lz4.h)
-  find_library(LIBLZ4_LIBRARY NAMES lz4)
-  message(STATUS "LIBLZ4_INCLUDE_DIR = ${LIBLZ4_INCLUDE_DIR}")
-  message(STATUS "LIBLZ4_LIBRARY = ${LIBLZ4_LIBRARY}")
-  target_include_directories(objxgboost PUBLIC ${LIBLZ4_INCLUDE_DIR})
-  target_link_libraries(objxgboost PUBLIC ${LIBLZ4_LIBRARY})
-endif (PLUGIN_LZ4)
-
 if (PLUGIN_DENSE_PARSER)
   target_sources(objxgboost PRIVATE ${xgboost_SOURCE_DIR}/plugin/dense_parser/dense_libsvm.cc)
 endif (PLUGIN_DENSE_PARSER)
diff --git a/plugin/lz4/sparse_page_lz4_format.cc b/plugin/lz4/sparse_page_lz4_format.cc
deleted file mode 100644
index 139cb446a702..000000000000
--- a/plugin/lz4/sparse_page_lz4_format.cc
+++ /dev/null
@@ -1,341 +0,0 @@
-/*!
- * Copyright (c) 2015 by Contributors
- * \file sparse_page_lz4_format.cc
- *  XGBoost Plugin to enable LZ4 compressed format on the external memory pages.
- */
-#include <xgboost/data.h>
-
-#include <dmlc/io.h>
-#include <dmlc/registry.h>
-#include <lz4.h>
-#include <lz4hc.h>
-#include <string>
-#include <vector>
-#include "../../src/data/sparse_page_writer.h"
-
-namespace xgboost {
-namespace data {
-
-DMLC_REGISTRY_FILE_TAG(sparse_page_lz4_format);
-
-// Array to help with compression and decompression.
-template <typename DType>
-class CompressArray {
- public:
-  // the data content.
-  std::vector<DType> data;
-  // Decompression helpers
-  // number of chunks
-  inline int num_chunk() const {
-    CHECK_GT(raw_chunks_.size(), 1);
-    return static_cast<int>(raw_chunks_.size() - 1);
-  }
-  // raw bytes
-  inline size_t RawBytes() const {
-    return raw_chunks_.back() * sizeof(DType);
-  }
-  // encoded bytes
-  inline size_t EncodedBytes() const {
-    return encoded_chunks_.back() +
-        (encoded_chunks_.size() + raw_chunks_.size()) * sizeof(bst_uint);
-  }
-  // load the array from file.
-  inline void Read(dmlc::SeekStream* fi);
-  // run decode on chunk_id.
-  inline void Decompress(int chunk_id);
-  // Compression helpers
-  // initialize the compression chunks from an explicit chunk boundary vector.
-  inline void InitCompressChunks(const std::vector<bst_uint>& chunk_ptr);
-  // initialize the compression chunks from a chunk size and a maximum chunk count.
-  inline void InitCompressChunks(size_t chunk_size, size_t max_nchunk);
-  // run encode on chunk_id; use LZ4 HC when use_hc is true.
-  inline void Compress(int chunk_id, bool use_hc);
-  // save the output buffer into file.
-  inline void Write(dmlc::Stream* fo);
-
- private:
-  // the chunk split of the data, by number of elements.
-  std::vector<bst_uint> raw_chunks_;
-  // the encoded chunks, by number of bytes.
-  std::vector<bst_uint> encoded_chunks_;
-  // output buffer of compression.
-  std::vector<std::string> out_buffer_;
-  // input buffer of data.
-  std::string in_buffer_;
-};
-
-template <typename DType>
-inline void CompressArray<DType>::Read(dmlc::SeekStream* fi) {
-  CHECK(fi->Read(&raw_chunks_));
-  CHECK(fi->Read(&encoded_chunks_));
-  size_t buffer_size = encoded_chunks_.back();
-  in_buffer_.resize(buffer_size);
-  CHECK_EQ(fi->Read(dmlc::BeginPtr(in_buffer_), buffer_size), buffer_size);
-  data.resize(raw_chunks_.back());
-}
-
-template <typename DType>
-inline void CompressArray<DType>::Decompress(int chunk_id) {
-  int chunk_size = static_cast<int>(
-      raw_chunks_[chunk_id + 1] - raw_chunks_[chunk_id]) * sizeof(DType);
-  int encoded_size = static_cast<int>(
-      encoded_chunks_[chunk_id + 1] - encoded_chunks_[chunk_id]);
-  // decompress data
-  int src_size = LZ4_decompress_fast(
-      dmlc::BeginPtr(in_buffer_) + encoded_chunks_[chunk_id],
-      reinterpret_cast<char*>(dmlc::BeginPtr(data) + raw_chunks_[chunk_id]),
-      chunk_size);
-  CHECK_EQ(encoded_size, src_size);
-}
-
-template <typename DType>
-inline void CompressArray<DType>::InitCompressChunks(
-    const std::vector<bst_uint>& chunk_ptr) {
-  raw_chunks_ = chunk_ptr;
-  CHECK_GE(raw_chunks_.size(), 2);
-  out_buffer_.resize(raw_chunks_.size() - 1);
-  for (size_t i = 0; i < out_buffer_.size(); ++i) {
-    out_buffer_[i].resize(raw_chunks_[i + 1] - raw_chunks_[i]);
-  }
-}
-
-template <typename DType>
-inline void CompressArray<DType>::InitCompressChunks(size_t chunk_size, size_t max_nchunk) {
-  raw_chunks_.clear();
-  raw_chunks_.push_back(0);
-  size_t min_chunk_size = data.size() / max_nchunk;
-  chunk_size = std::max(min_chunk_size, chunk_size);
-  size_t nstep = data.size() / chunk_size;
-  for (size_t i = 0; i < nstep; ++i) {
-    raw_chunks_.push_back(raw_chunks_.back() + chunk_size);
-    CHECK_LE(raw_chunks_.back(), data.size());
-  }
-  if (nstep == 0) raw_chunks_.push_back(0);
-  raw_chunks_.back() = data.size();
-  CHECK_GE(raw_chunks_.size(), 2);
-  out_buffer_.resize(raw_chunks_.size() - 1);
-  for (size_t i = 0; i < out_buffer_.size(); ++i) {
-    out_buffer_[i].resize(raw_chunks_[i + 1] - raw_chunks_[i]);
-  }
-}
-
-template <typename DType>
-inline void CompressArray<DType>::Compress(int chunk_id, bool use_hc) {
-  CHECK_LT(static_cast<size_t>(chunk_id + 1), raw_chunks_.size());
-  std::string& buf = out_buffer_[chunk_id];
-  size_t raw_chunk_size = (raw_chunks_[chunk_id + 1] - raw_chunks_[chunk_id]) * sizeof(DType);
-  int bound = LZ4_compressBound(raw_chunk_size);
-  CHECK_NE(bound, 0);
-  buf.resize(bound);
-  int encoded_size;
-  if (use_hc) {
-    encoded_size = LZ4_compress_HC(
-        reinterpret_cast<char*>(dmlc::BeginPtr(data) + raw_chunks_[chunk_id]),
-        dmlc::BeginPtr(buf), raw_chunk_size, buf.length(), 9);
-  } else {
-    encoded_size = LZ4_compress_default(
-        reinterpret_cast<char*>(dmlc::BeginPtr(data) + raw_chunks_[chunk_id]),
-        dmlc::BeginPtr(buf), raw_chunk_size, buf.length());
-  }
-  CHECK_NE(encoded_size, 0);
-  CHECK_LE(static_cast<size_t>(encoded_size), buf.length());
-  buf.resize(encoded_size);
-}
-
-template <typename DType>
-inline void CompressArray<DType>::Write(dmlc::Stream* fo) {
-  encoded_chunks_.clear();
-  encoded_chunks_.push_back(0);
-  for (size_t i = 0; i < out_buffer_.size(); ++i) {
-    encoded_chunks_.push_back(encoded_chunks_.back() + out_buffer_[i].length());
-  }
-  fo->Write(raw_chunks_);
-  fo->Write(encoded_chunks_);
-  for (const std::string& buf : out_buffer_) {
-    fo->Write(dmlc::BeginPtr(buf), buf.length());
-  }
-}
-
-template <typename StorageIndex>
-class SparsePageLZ4Format : public SparsePageFormat<SparsePage> {
- public:
-  explicit SparsePageLZ4Format(bool use_lz4_hc)
-      : use_lz4_hc_(use_lz4_hc) {
-    raw_bytes_ = raw_bytes_value_ = raw_bytes_index_ = 0;
-    encoded_bytes_value_ = encoded_bytes_index_ = 0;
-    nthread_ = dmlc::GetEnv("XGBOOST_LZ4_DECODE_NTHREAD", 4);
-    nthread_write_ = dmlc::GetEnv("XGBOOST_LZ4_COMPRESS_NTHREAD", 12);
-  }
-  virtual ~SparsePageLZ4Format() {
-    size_t encoded_bytes = raw_bytes_ + encoded_bytes_value_ + encoded_bytes_index_;
-    raw_bytes_ += raw_bytes_value_ + raw_bytes_index_;
-    if (raw_bytes_ != 0) {
-      LOG(CONSOLE) << "raw_bytes=" << raw_bytes_
-                   << ", encoded_bytes=" << encoded_bytes
-                   << ", ratio=" << double(encoded_bytes) / raw_bytes_
-                   << ", ratio-index=" << double(encoded_bytes_index_) / raw_bytes_index_
-                   << ", ratio-value=" << double(encoded_bytes_value_) / raw_bytes_value_;
-    }
-  }
-
-  bool Read(SparsePage* page, dmlc::SeekStream* fi) override {
-    auto& offset_vec = page->offset.HostVector();
-    auto& data_vec = page->data.HostVector();
-    if (!fi->Read(&(offset_vec))) return false;
-    CHECK_NE(offset_vec.size(), 0) << "Invalid SparsePage file";
-    this->LoadIndexValue(fi);
-
-    data_vec.resize(offset_vec.back());
-    CHECK_EQ(index_.data.size(), value_.data.size());
-    CHECK_EQ(index_.data.size(), data_vec.size());
-    for (size_t i = 0; i < data_vec.size(); ++i) {
-      data_vec[i] = Entry(index_.data[i] + min_index_, value_.data[i]);
-    }
-    return true;
-  }
-
-  bool Read(SparsePage* page,
-            dmlc::SeekStream* fi,
-            const std::vector<bst_uint>& sorted_index_set) override {
-    if (!fi->Read(&disk_offset_)) return false;
-    this->LoadIndexValue(fi);
-    auto& offset_vec = page->offset.HostVector();
-    auto& data_vec = page->data.HostVector();
-    offset_vec.clear();
-    offset_vec.push_back(0);
-    for (bst_uint cid : sorted_index_set) {
-      offset_vec.push_back(
-          offset_vec.back() + disk_offset_[cid + 1] - disk_offset_[cid]);
-    }
-    data_vec.resize(offset_vec.back());
-    CHECK_EQ(index_.data.size(), value_.data.size());
-    CHECK_EQ(index_.data.size(), disk_offset_.back());
-
-    for (size_t i = 0; i < sorted_index_set.size(); ++i) {
-      bst_uint cid = sorted_index_set[i];
-      size_t dst_begin = offset_vec[i];
-      size_t src_begin = disk_offset_[cid];
-      size_t num = disk_offset_[cid + 1] - disk_offset_[cid];
-      for (size_t j = 0; j < num; ++j) {
-        data_vec[dst_begin + j] = Entry(
-            index_.data[src_begin + j] + min_index_, value_.data[src_begin + j]);
-      }
-    }
-    return true;
-  }
-
-  void Write(const SparsePage& page, dmlc::Stream* fo) override {
-    const auto& offset_vec = page.offset.HostVector();
-    const auto& data_vec = page.data.HostVector();
-    CHECK(offset_vec.size() != 0 && offset_vec[0] == 0);
-    CHECK_EQ(offset_vec.back(), data_vec.size());
-    fo->Write(offset_vec);
-    min_index_ = page.base_rowid;
-    fo->Write(&min_index_, sizeof(min_index_));
-    index_.data.resize(data_vec.size());
-    value_.data.resize(data_vec.size());
-
-    for (size_t i = 0; i < data_vec.size(); ++i) {
-      bst_uint idx = data_vec[i].index - min_index_;
-      CHECK_LE(idx, static_cast<bst_uint>(std::numeric_limits<StorageIndex>::max()))
-          << "The storage index must be less than or equal to "
-          << std::numeric_limits<StorageIndex>::max()
-          << ", min_index=" << min_index_;
-      index_.data[i] = static_cast<StorageIndex>(idx);
-      value_.data[i] = data_vec[i].fvalue;
-    }
-
-    index_.InitCompressChunks(kChunkSize, kMaxChunk);
-    value_.InitCompressChunks(kChunkSize, kMaxChunk);
-
-    int nindex = index_.num_chunk();
-    int nvalue = value_.num_chunk();
-    int ntotal = nindex + nvalue;
-    dmlc::OMPException exc;
-    #pragma omp parallel for schedule(dynamic, 1) num_threads(nthread_write_)
-    for (int i = 0; i < ntotal; ++i) {
-      exc.Run([&]() {
-        if (i < nindex) {
-          index_.Compress(i, use_lz4_hc_);
-        } else {
-          value_.Compress(i - nindex, use_lz4_hc_);
-        }
-      });
-    }
-    exc.Rethrow();
-    index_.Write(fo);
-    value_.Write(fo);
-    // statistics
-    raw_bytes_index_ += index_.RawBytes() * sizeof(bst_uint) / sizeof(StorageIndex);
-    raw_bytes_value_ += value_.RawBytes();
-    encoded_bytes_index_ += index_.EncodedBytes();
-    encoded_bytes_value_ += value_.EncodedBytes();
-    raw_bytes_ += offset_vec.size() * sizeof(size_t);
-  }
-
-  inline void LoadIndexValue(dmlc::SeekStream* fi) {
-    fi->Read(&min_index_, sizeof(min_index_));
-    index_.Read(fi);
-    value_.Read(fi);
-
-    int nindex = index_.num_chunk();
-    int nvalue = value_.num_chunk();
-    int ntotal = nindex + nvalue;
-    dmlc::OMPException exc;
-    #pragma omp parallel for schedule(dynamic, 1) num_threads(nthread_)
-    for (int i = 0; i < ntotal; ++i) {
-      exc.Run([&]() {
-        if (i < nindex) {
-          index_.Decompress(i);
-        } else {
-          value_.Decompress(i - nindex);
-        }
-      });
-    }
-    exc.Rethrow();
-  }
-
- private:
-  // default chunk size.
-  static const size_t kChunkSize = 64 << 10UL;
-  // maximum number of chunks per array.
-  static const size_t kMaxChunk = 128;
-  // whether to use LZ4 HC (high-compression) mode.
-  bool use_lz4_hc_;
-  // number of decoding threads.
-  int nthread_;
-  // number of writing threads.
-  int nthread_write_;
-  // raw bytes
-  size_t raw_bytes_, raw_bytes_index_, raw_bytes_value_;
-  // encoded bytes
-  size_t encoded_bytes_index_, encoded_bytes_value_;
-  /*! \brief minimum index value */
-  uint32_t min_index_;
-  /*! \brief external memory column offset */
-  std::vector<size_t> disk_offset_;
-  // internal index
-  CompressArray<StorageIndex> index_;
-  // value set.
-  CompressArray<bst_float> value_;
-};
-
-XGBOOST_REGISTER_SPARSE_PAGE_FORMAT(lz4)
-.describe("Apply LZ4 binary data compression for ext memory.")
-.set_body([]() {
-    return new SparsePageLZ4Format<bst_uint>(false);
-  });
-
-XGBOOST_REGISTER_SPARSE_PAGE_FORMAT(lz4hc)
-.describe("Apply LZ4 binary data compression (high compression ratio) for ext memory.")
-.set_body([]() {
-    return new SparsePageLZ4Format<bst_uint>(true);
-  });
-
-XGBOOST_REGISTER_SPARSE_PAGE_FORMAT(lz4i16hc)
-.describe("Apply LZ4 binary data compression (16-bit index mode) for ext memory.")
-.set_body([]() {
-    return new SparsePageLZ4Format<uint16_t>(true);
-  });
-
-}  // namespace data
-}  // namespace xgboost
diff --git a/python-package/setup.py b/python-package/setup.py
index a2feb76674da..a3fc2f3eb9f4 100644
--- a/python-package/setup.py
+++ b/python-package/setup.py
@@ -26,7 +26,6 @@
     'use-hdfs': (None, 'Build with HDFS support', 0),
     'use-azure': (None, 'Build with AZURE support.', 0),
     'use-s3': (None, 'Build with S3 support', 0),
-    'plugin-lz4': (None, 'Build lz4 plugin.', 0),
     'plugin-dense-parser': (None, 'Build dense parser plugin.', 0),
     # Python specific
     'use-system-libxgboost': (None, 'Use libxgboost.so in system path.', 0)
@@ -268,7 +267,6 @@ def initialize_options(self):
         self.use_azure = 0
         self.use_s3 = 0
-        self.plugin_lz4 = 0
         self.plugin_dense_parser = 0
         self.use_system_libxgboost = 0
diff --git a/tests/ci_build/Dockerfile.cpu b/tests/ci_build/Dockerfile.cpu
index 411319b26fcd..7543668d64f5 100644
--- a/tests/ci_build/Dockerfile.cpu
+++ b/tests/ci_build/Dockerfile.cpu
@@ -7,7 +7,7 @@ SHELL ["/bin/bash", "-c"]  # Use Bash as shell
 # Install all basic requirements
 RUN \
     apt-get update && \
-    apt-get install -y tar unzip wget git build-essential doxygen graphviz llvm libasan2 libidn11 liblz4-dev ninja-build && \
+    apt-get install -y tar unzip wget git build-essential doxygen graphviz llvm libasan2 libidn11 ninja-build && \
     # CMake
     wget -nv -nc https://cmake.org/files/v3.14/cmake-3.14.0-Linux-x86_64.sh --no-check-certificate && \
     bash cmake-3.14.0-Linux-x86_64.sh --skip-license --prefix=/usr && \
diff --git a/tests/ci_build/Dockerfile.gpu_build b/tests/ci_build/Dockerfile.gpu_build
index 15b5675aa1ad..8f719d2531ed 100644
--- a/tests/ci_build/Dockerfile.gpu_build
+++ b/tests/ci_build/Dockerfile.gpu_build
@@ -9,7 +9,7 @@ SHELL ["/bin/bash", "-c"]  # Use Bash as shell
 # Install all basic requirements
 RUN \
     apt-get update && \
-    apt-get install -y tar unzip wget bzip2 libgomp1 git build-essential doxygen graphviz llvm libasan2 libidn11 liblz4-dev ninja-build && \
+    apt-get install -y tar unzip wget bzip2 libgomp1 git build-essential doxygen graphviz llvm libasan2 libidn11 ninja-build && \
     # CMake
     wget -nv -nc https://cmake.org/files/v3.14/cmake-3.14.0-Linux-x86_64.sh --no-check-certificate && \
     bash cmake-3.14.0-Linux-x86_64.sh --skip-license --prefix=/usr && \
diff --git a/tests/travis/setup.sh b/tests/travis/setup.sh
index 3ce285ed2eb8..42e942d39a20 100755
--- a/tests/travis/setup.sh
+++ b/tests/travis/setup.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
 # https://travis-ci.community/t/macos-build-fails-because-of-homebrew-bundle-unknown-command/7296/27
-brew install cmake libomp lz4
+brew install cmake libomp
 
 if [ ${TASK} == "python_test" ] || [ ${TASK} == "python_sdist_test" ]; then
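For reference, external-memory page formats are resolved by name through the registry declared in src/data/sparse_page_writer.h, so a compressed format like the one deleted above can still be maintained out of tree. Below is a minimal sketch of that registration pattern; it is illustrative only: the class name and the format name "raw_example" are hypothetical, the page is stored uncompressed, the column-subset Read overload from the deleted file is omitted, and it assumes the same SparsePageFormat<SparsePage> interface that the deleted code overrides.

#include <xgboost/data.h>
#include "../../src/data/sparse_page_writer.h"

namespace xgboost {
namespace data {

// Hypothetical pass-through format: serializes the page's offset and
// data vectors verbatim; a real plugin would encode/decode them here.
class SparsePageRawFormatExample : public SparsePageFormat<SparsePage> {
 public:
  bool Read(SparsePage* page, dmlc::SeekStream* fi) override {
    auto& offset_vec = page->offset.HostVector();
    auto& data_vec = page->data.HostVector();
    if (!fi->Read(&offset_vec)) return false;
    data_vec.resize(offset_vec.back());
    size_t nbytes = data_vec.size() * sizeof(Entry);
    return fi->Read(dmlc::BeginPtr(data_vec), nbytes) == nbytes;
  }
  void Write(const SparsePage& page, dmlc::Stream* fo) override {
    const auto& offset_vec = page.offset.HostVector();
    const auto& data_vec = page.data.HostVector();
    fo->Write(offset_vec);
    fo->Write(dmlc::BeginPtr(data_vec), data_vec.size() * sizeof(Entry));
  }
};

XGBOOST_REGISTER_SPARSE_PAGE_FORMAT(raw_example)
.describe("Hypothetical uncompressed page format, for illustration only.")
.set_body([]() {
    return new SparsePageRawFormatExample();
  });

}  // namespace data
}  // namespace xgboost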