Skip to content

Commit

Permalink
Export Python Interface for external memory. (#7070)
Browse files Browse the repository at this point in the history
* Add Python iterator interface.
* Add tests.
* Add demo.
* Add documents.
* Handle empty dataset.
  • Loading branch information
trivialfis committed Jul 22, 2021
1 parent e64ee65 commit e608836
Show file tree
Hide file tree
Showing 34 changed files with 961 additions and 200 deletions.
16 changes: 10 additions & 6 deletions .github/workflows/main.yml
Expand Up @@ -95,36 +95,40 @@ jobs:
cd build
cmake .. -DBUILD_STATIC_LIB=ON -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -GNinja
ninja -v install
cd -
- name: Build and run C API demo with static
shell: bash -l {0}
run: |
pushd .
cd demo/c-api/
mkdir build
cd build
cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX
ninja -v
ctest
cd ..
./build/api-demo
rm -rf ./build
cd ../..
popd
- name: Build and install XGBoost shared library
shell: bash -l {0}
run: |
cd build
cmake .. -DBUILD_STATIC_LIB=OFF -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -GNinja
ninja -v install
cd -
- name: Build and run C API demo with shared
shell: bash -l {0}
run: |
pushd .
cd demo/c-api/
mkdir build
cd build
cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX
ninja -v
cd ..
./build/api-demo
cd ../../
./tests/ci_build/verify_link.sh ./demo/c-api/build/api-demo
ctest
popd
./tests/ci_build/verify_link.sh ./demo/c-api/build/basic/api-demo
./tests/ci_build/verify_link.sh ./demo/c-api/build/external-memory/external-memory-demo
lint:
runs-on: ubuntu-latest
Expand Down
3 changes: 3 additions & 0 deletions Makefile
Expand Up @@ -92,7 +92,10 @@ endif
# Type-check the Python sources with mypy, run from inside python-package/.
# The explicitly listed files are chained with `&&`, so the first failing check
# aborts the recipe with exit status 1; the final whole-package run is
# informational only because its failure is masked by `|| true`.
mypy:
cd python-package; \
mypy ./xgboost/dask.py && \
mypy ../demo/guide-python/external_memory.py && \
mypy ../tests/python-gpu/test_gpu_with_dask.py && \
mypy ../tests/python/test_data_iterator.py && \
mypy ../tests/python-gpu/test_gpu_data_iterator.py && \
mypy ./xgboost/sklearn.py || exit 1; \
mypy . || true ;

Expand Down
25 changes: 14 additions & 11 deletions demo/c-api/CMakeLists.txt
@@ -1,14 +1,17 @@
cmake_minimum_required(VERSION 3.13)
# Umbrella project for the XGBoost C API demos: it only aggregates the demo
# subdirectories and registers their executables as CTest tests.
project(xgboost-c-examples)

# Each demo defines its own executable target (api-demo, external-memory-demo)
# in its own subdirectory; no target is built directly at this level.
add_subdirectory(basic)
add_subdirectory(external-memory)

enable_testing()
# Both tests are launched from the top-level build directory of this project.
add_test(
  NAME test_xgboost_demo_c_basic
  COMMAND api-demo
  WORKING_DIRECTORY "${xgboost-c-examples_BINARY_DIR}"
)
add_test(
  NAME test_xgboost_demo_c_external_memory
  COMMAND external-memory-demo
  WORKING_DIRECTORY "${xgboost-c-examples_BINARY_DIR}"
)
13 changes: 13 additions & 0 deletions demo/c-api/basic/CMakeLists.txt
@@ -0,0 +1,13 @@
# Declared before project() so policy defaults are set even when this demo is
# configured on its own, matching the external-memory demo.
cmake_minimum_required(VERSION 3.13)
project(api-demo LANGUAGES C VERSION 0.0.1)
find_package(xgboost REQUIRED)

# xgboost is built as static libraries, all cxx dependencies need to be linked into the
# executable.
if(XGBOOST_BUILD_STATIC_LIB)
  enable_language(CXX)
  # find again for those cxx libraries.
  find_package(xgboost REQUIRED)
endif()

# The basic C API demo executable.
add_executable(api-demo c-api-demo.c)
target_link_libraries(api-demo PRIVATE xgboost::xgboost)
File renamed without changes.
File renamed without changes.
4 changes: 2 additions & 2 deletions demo/c-api/c-api-demo.c → demo/c-api/basic/c-api-demo.c
Expand Up @@ -24,8 +24,8 @@ int main(int argc, char** argv) {

// load the data
DMatrixHandle dtrain, dtest;
safe_xgboost(XGDMatrixCreateFromFile("../data/agaricus.txt.train", silent, &dtrain));
safe_xgboost(XGDMatrixCreateFromFile("../data/agaricus.txt.test", silent, &dtest));
safe_xgboost(XGDMatrixCreateFromFile("../../data/agaricus.txt.train", silent, &dtrain));
safe_xgboost(XGDMatrixCreateFromFile("../../data/agaricus.txt.test", silent, &dtest));

// create the booster
BoosterHandle booster;
Expand Down
7 changes: 7 additions & 0 deletions demo/c-api/external-memory/CMakeLists.txt
@@ -0,0 +1,7 @@
cmake_minimum_required(VERSION 3.13)
project(external-memory-demo LANGUAGES C VERSION 0.0.1)

find_package(xgboost REQUIRED)

# When xgboost is built as a static library, its C++ dependencies have to be
# linked into the executable: enable CXX and find the package again so those
# libraries are picked up (same handling as the basic demo).
if(XGBOOST_BUILD_STATIC_LIB)
  enable_language(CXX)
  find_package(xgboost REQUIRED)
endif()

# Demo of feeding XGBoost through a user-defined data iterator.
add_executable(external-memory-demo external_memory.c)
target_link_libraries(external-memory-demo PRIVATE xgboost::xgboost)
16 changes: 16 additions & 0 deletions demo/c-api/external-memory/README.md
@@ -0,0 +1,16 @@
Defining a Custom Data Iterator to Load Data from External Memory
=================================================================

A simple demo of using a custom data iterator with XGBoost. The feature is still
**experimental** and not ready for production use. If you are not familiar with the C
API, please read its introduction in our tutorials and visit the basic demo first.

Defining Data Iterator
----------------------

In the example, we define a custom data iterator with 2 methods: `reset` and `next`. The
`next` method passes data into XGBoost and tells XGBoost whether the iterator has reached
its end, and the `reset` method resets the iteration. One important detail when using the
C API for a data iterator is that users must keep the data passed in by the `next` method
in memory until the next iteration or until `reset` is called. The external memory
DMatrix is not limited to training; it is also valid for other features like prediction.
179 changes: 179 additions & 0 deletions demo/c-api/external-memory/external_memory.c
@@ -0,0 +1,179 @@
/*!
* Copyright 2021 XGBoost contributors
*
* \brief A simple example of using xgboost data callback API.
*/

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <xgboost/c_api.h>

/*
 * Evaluate an XGBoost C API call once. If it returns non-zero, print the
 * source location, the failing expression and XGBGetLastError() to stderr and
 * terminate the process with exit(1).
 *
 * The body is wrapped in do { ... } while (0) so that `safe_xgboost(call);`
 * is a single statement and stays correct inside an unbraced if/else.
 */
#define safe_xgboost(err)                                                     \
  do {                                                                        \
    if ((err) != 0) {                                                         \
      fprintf(stderr, "%s:%d: error in %s: %s\n", __FILE__, __LINE__, #err,   \
              XGBGetLastError());                                             \
      exit(1);                                                                \
    }                                                                         \
  } while (0)

/* Number of batches the demo iterator yields (passed to DataIterator_Init in main). */
#define N_BATCHS 32
/* Number of elements in each batch (passed to DataIterator_Init in main). */
#define BATCH_LEN 512

/* Shorthands. */
typedef DMatrixHandle DMatrix;
typedef BoosterHandle Booster;

/*
 * State of the demo data iterator: the batches it owns, the current position,
 * and the handles/buffers used to hand one batch at a time to XGBoost.
 *
 * The struct tag is `DataIter` rather than `_DataIter`: identifiers that start
 * with an underscore followed by an uppercase letter are reserved in C.
 */
typedef struct DataIter {
  /* Data of each batch: n arrays, data[i] holds lengths[i] floats. */
  float **data;
  /* Labels of each batch: n arrays, labels[i] holds lengths[i] floats. */
  float **labels;
  /* Length of each batch. */
  size_t *lengths;
  /* Total number of batches. */
  size_t n;
  /* Current iteration, i.e. index of the next batch to hand out. */
  size_t cur_it;

  /* Private fields */
  /* Proxy DMatrix handle through which each batch is passed to XGBoost. */
  DMatrix _proxy;
  /* Buffer for the JSON array-interface string describing the current batch. */
  char _array[128];
} DataIter;

/*
 * Abort the program when an allocation failed: if `ptr` is NULL, print the
 * source location to stderr and terminate the process with exit(1).
 *
 * The body is wrapped in do { ... } while (0) so that `safe_malloc(p);` is a
 * single statement and stays correct inside an unbraced if/else.
 */
#define safe_malloc(ptr)                                                      \
  do {                                                                        \
    if ((ptr) == NULL) {                                                      \
      fprintf(stderr, "%s:%d: Failed to allocate memory.\n", __FILE__,        \
              __LINE__);                                                      \
      exit(1);                                                                \
    }                                                                         \
  } while (0)

/**
 * Fill an iterator with randomly generated batches. Real applications would
 * load each batch from external memory instead; the random data only serves
 * to show how the iterator is wired into XGBoost.
 *
 * Any failed allocation or failed XGBoost call terminates the process.
 *
 * \param self       Iterator to initialize.
 * \param batch_size Number of elements for each batch. The demo here is only using 1
 *                   column.
 * \param n_batches  Number of batches.
 */
void DataIterator_Init(DataIter *self, size_t batch_size, size_t n_batches) {
  size_t batch = 0;

  self->n = n_batches;

  /* Every batch has the same length in this demo. */
  self->lengths = (size_t *)malloc(n_batches * sizeof(*self->lengths));
  safe_malloc(self->lengths);
  while (batch < n_batches) {
    self->lengths[batch++] = batch_size;
  }

  /* Tables holding one data pointer and one label pointer per batch. */
  self->data = (float **)malloc(n_batches * sizeof(*self->data));
  safe_malloc(self->data);
  self->labels = (float **)malloc(n_batches * sizeof(*self->labels));
  safe_malloc(self->labels);

  /* Generate some random data: features first, then labels, batch by batch. */
  for (batch = 0; batch < n_batches; ++batch) {
    const size_t len = self->lengths[batch];
    float *values;
    float *targets;
    size_t k;

    values = (float *)malloc(len * sizeof(*values));
    safe_malloc(values);
    for (k = 0; k < len; ++k) {
      values[k] = (float)rand() / (float)(RAND_MAX);
    }
    self->data[batch] = values;

    targets = (float *)malloc(len * sizeof(*targets));
    safe_malloc(targets);
    for (k = 0; k < len; ++k) {
      targets[k] = (float)rand() / (float)(RAND_MAX);
    }
    self->labels[batch] = targets;
  }

  /* Start at the first batch and create the proxy used to pass batches on. */
  self->cur_it = 0;
  safe_xgboost(XGProxyDMatrixCreate(&self->_proxy));
}

/**
 * Release everything owned by the iterator: each batch's data and labels, the
 * three per-batch tables (data, labels, lengths) and the proxy DMatrix.
 *
 * \param self Iterator previously set up with DataIterator_Init.
 */
void DataIterator_Free(DataIter *self) {
  for (size_t i = 0; i < self->n; ++i) {
    free(self->data[i]);
    free(self->labels[i]);
  }
  free(self->data);
  /* The table of label pointers is allocated separately and must be freed too. */
  free(self->labels);
  free(self->lengths);
  safe_xgboost(XGDMatrixFree(self->_proxy));
}

/**
 * Callback handing the next batch to XGBoost through the proxy DMatrix.
 *
 * \param handle The DataIter registered with XGDMatrixCreateFromCallback.
 *
 * \return 1 when a batch was passed on, 0 when all batches have been consumed
 *         (the position is rewound to the first batch in that case).
 */
int DataIterator_Next(DataIterHandle handle) {
  DataIter *self = (DataIter *)(handle);
  if (self->cur_it == self->n) {
    self->cur_it = 0;
    return 0; /* At end */
  }

  /* A JSON string encoding array interface (standard from numpy). The address
   * and the length are size_t values, so they are formatted with %zu; %lu would
   * be the wrong width on platforms where unsigned long is narrower than
   * size_t. snprintf bounds the write and always NUL-terminates the buffer. */
  int written = snprintf(self->_array, sizeof(self->_array),
                         "{\"data\": [%zu, false], \"shape\":[%zu, 1], \"typestr\": "
                         "\"<f4\", \"version\": 3}",
                         (size_t)self->data[self->cur_it],
                         self->lengths[self->cur_it]);
  if (written < 0 || (size_t)written >= sizeof(self->_array)) {
    fprintf(stderr, "%s:%d: Failed to encode the array interface.\n", __FILE__,
            __LINE__);
    exit(1);
  }

  safe_xgboost(XGProxyDMatrixSetDataDense(self->_proxy, self->_array));
  /* The data passed in the iterator must remain valid (not being freed until the next
   * iteration or reset) */
  safe_xgboost(XGDMatrixSetDenseInfo(self->_proxy, "label",
                                     self->labels[self->cur_it],
                                     self->lengths[self->cur_it], 1));
  self->cur_it++;
  return 1; /* Continue. */
}

/**
 * Callback rewinding the iterator so the next call to DataIterator_Next starts
 * again from the first batch.
 *
 * \param handle The DataIter registered with XGDMatrixCreateFromCallback.
 */
void DataIterator_Reset(DataIterHandle handle) {
  ((DataIter *)handle)->cur_it = 0;
}

/**
 * Fit a regression booster on the given DMatrix for 10 rounds, printing the
 * evaluation result on the training data after every round, then write the
 * model to "model.json". Any failing XGBoost call terminates the process.
 *
 * \param Xy Training data (features and labels).
 */
void TrainModel(DMatrix Xy) {
  /* The training matrix doubles as the only evaluation set, named "train". */
  DMatrix cache[] = {Xy};
  const char *validation_names[1] = {"train"};
  const char *validation_result = NULL;
  const size_t total_rounds = 10;
  Booster booster;
  size_t i = 0;

  /* Set up the booster; approx is used for external memory training. */
  safe_xgboost(XGBoosterCreate(cache, 1, &booster));
  safe_xgboost(XGBoosterSetParam(booster, "tree_method", "approx"));
  safe_xgboost(XGBoosterSetParam(booster, "objective", "reg:squarederror"));

  /* One boosting update followed by one evaluation per round. */
  while (i < total_rounds) {
    safe_xgboost(XGBoosterUpdateOneIter(booster, i, Xy));
    safe_xgboost(XGBoosterEvalOneIter(booster, i, cache, validation_names, 1,
                                      &validation_result));
    printf("%s\n", validation_result);
    ++i;
  }

  /* Persist the model as JSON, then release the booster. */
  safe_xgboost(XGBoosterSaveModel(booster, "model.json"));
  safe_xgboost(XGBoosterFree(booster));
}

/**
 * Demo entry point: build the random-data iterator, wrap it in a DMatrix via
 * the callback API, train a model on it and release all resources.
 */
int main(void) {
  /* Settings for the DMatrix built from the iterator. During training, some
   * cache files with the prefix "cache-" will be generated in current
   * directory */
  const char *config = "{\"missing\": NaN, \"cache_prefix\": \"cache\"}";
  DataIter iter;
  DMatrix Xy;

  DataIterator_Init(&iter, BATCH_LEN, N_BATCHS);

  /* XGBoost pulls the batches through the reset/next callbacks. */
  safe_xgboost(XGDMatrixCreateFromCallback(
      &iter, iter._proxy, DataIterator_Reset, DataIterator_Next, config, &Xy));

  TrainModel(Xy);

  /* Release the DMatrix before the iterator that backs it. */
  safe_xgboost(XGDMatrixFree(Xy));
  DataIterator_Free(&iter);

  return 0;
}

0 comments on commit e608836

Please sign in to comment.