Skip to content

Commit

Permalink
Expose external memory DMatrix to Python.
Browse files Browse the repository at this point in the history
  • Loading branch information
trivialfis committed Jul 16, 2021
1 parent bd1f3a3 commit 2fbdab5
Show file tree
Hide file tree
Showing 25 changed files with 813 additions and 142 deletions.
16 changes: 10 additions & 6 deletions .github/workflows/main.yml
Expand Up @@ -95,36 +95,40 @@ jobs:
cd build
cmake .. -DBUILD_STATIC_LIB=ON -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -GNinja
ninja -v install
cd -
- name: Build and run C API demo with static
shell: bash -l {0}
run: |
pushd .
cd demo/c-api/
mkdir build
cd build
cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX
ninja -v
ctest
cd ..
./build/api-demo
rm -rf ./build
cd ../..
popd
- name: Build and install XGBoost shared library
shell: bash -l {0}
run: |
cd build
cmake .. -DBUILD_STATIC_LIB=OFF -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -GNinja
ninja -v install
cd -
- name: Build and run C API demo with shared
shell: bash -l {0}
run: |
pushd .
cd demo/c-api/
mkdir build
cd build
cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX
ninja -v
cd ..
./build/api-demo
cd ../../
./tests/ci_build/verify_link.sh ./demo/c-api/build/api-demo
ctest
popd
./tests/ci_build/verify_link.sh ./demo/c-api/build/basic/api-demo
./tests/ci_build/verify_link.sh ./demo/c-api/build/external-memory/external-memory-demo
lint:
runs-on: ubuntu-latest
Expand Down
3 changes: 3 additions & 0 deletions Makefile
Expand Up @@ -92,7 +92,10 @@ endif
mypy:
cd python-package; \
mypy ./xgboost/dask.py && \
mypy ../demo/guide-python/external_memory.py && \
mypy ../tests/python-gpu/test_gpu_with_dask.py && \
mypy ../tests/python/test_data_iterator.py && \
mypy ../tests/python-gpu/test_gpu_data_iterator.py && \
mypy ./xgboost/sklearn.py || exit 1; \
mypy . || true ;

Expand Down
25 changes: 14 additions & 11 deletions demo/c-api/CMakeLists.txt
@@ -1,14 +1,17 @@
cmake_minimum_required(VERSION 3.13)
project(api-demo LANGUAGES C VERSION 0.0.1)
find_package(xgboost REQUIRED)
project(xgboost-c-examples)

# xgboost is built as static libraries, all cxx dependencies need to be linked into the
# executable.
if (XGBOOST_BUILD_STATIC_LIB)
enable_language(CXX)
# find again for those cxx libraries.
find_package(xgboost REQUIRED)
endif(XGBOOST_BUILD_STATIC_LIB)
add_subdirectory(basic)
add_subdirectory(external-memory)

add_executable(api-demo c-api-demo.c)
target_link_libraries(api-demo PRIVATE xgboost::xgboost)
enable_testing()
add_test(
NAME test_xgboost_demo_c_basic
COMMAND api-demo
WORKING_DIRECTORY ${xgboost-c-examples_BINARY_DIR}
)
add_test(
NAME test_xgboost_demo_c_external_memory
COMMAND external-memory-demo
WORKING_DIRECTORY ${xgboost-c-examples_BINARY_DIR}
)
13 changes: 13 additions & 0 deletions demo/c-api/basic/CMakeLists.txt
@@ -0,0 +1,13 @@
project(api-demo LANGUAGES C VERSION 0.0.1)
find_package(xgboost REQUIRED)

# xgboost is built as static libraries, all cxx dependencies need to be linked into the
# executable.
if (XGBOOST_BUILD_STATIC_LIB)
enable_language(CXX)
# find again for those cxx libraries.
find_package(xgboost REQUIRED)
endif(XGBOOST_BUILD_STATIC_LIB)

add_executable(api-demo c-api-demo.c)
target_link_libraries(api-demo PRIVATE xgboost::xgboost)
File renamed without changes.
File renamed without changes.
4 changes: 2 additions & 2 deletions demo/c-api/c-api-demo.c → demo/c-api/basic/c-api-demo.c
Expand Up @@ -24,8 +24,8 @@ int main(int argc, char** argv) {

// load the data
DMatrixHandle dtrain, dtest;
safe_xgboost(XGDMatrixCreateFromFile("../data/agaricus.txt.train", silent, &dtrain));
safe_xgboost(XGDMatrixCreateFromFile("../data/agaricus.txt.test", silent, &dtest));
safe_xgboost(XGDMatrixCreateFromFile("../../data/agaricus.txt.train", silent, &dtrain));
safe_xgboost(XGDMatrixCreateFromFile("../../data/agaricus.txt.test", silent, &dtest));

// create the booster
BoosterHandle booster;
Expand Down
7 changes: 7 additions & 0 deletions demo/c-api/external-memory/CMakeLists.txt
@@ -0,0 +1,7 @@
cmake_minimum_required(VERSION 3.13)
project(external-memory-demo LANGUAGES C VERSION 0.0.1)

find_package(xgboost REQUIRED)

add_executable(external-memory-demo external_memory.c)
target_link_libraries(external-memory-demo PRIVATE xgboost::xgboost)
10 changes: 10 additions & 0 deletions demo/c-api/external-memory/README.md
@@ -0,0 +1,10 @@
Data Callback
=============

A simple demo for using custom data iterator with XGBoost. The primary function for this
is external-memory training with user provided data loaders. In the example, we have
defined a custom data iterator with 2 methods: `reset` and `next`. The `next` method
passes data into XGBoost and tells XGBoost whether the iterator has reached its end.
During training, XGBoost will generate some caches for internal data structures in current
directory, which can be changed by `cache_prefix` parameter during construction of
`DMatrix`.
179 changes: 179 additions & 0 deletions demo/c-api/external-memory/external_memory.c
@@ -0,0 +1,179 @@
/*!
* Copyright 2021 XGBoost contributors
*
* \brief A simple example of using xgboost data callback API.
*/

#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <xgboost/c_api.h>

#define safe_xgboost(err) \
if ((err) != 0) { \
fprintf(stderr, "%s:%d: error in %s: %s\n", __FILE__, __LINE__, #err, \
XGBGetLastError()); \
exit(1); \
}

#define N_BATCHS 32
#define BATCH_LEN 512

/* Shorthands. */
typedef DMatrixHandle DMatrix;
typedef BoosterHandle Booster;

typedef struct _DataIter {
/* Data of each batch. */
float **data;
/* Labels of each batch */
float **labels;
/* Length of each batch. */
size_t *lengths;
/* Total number of batches. */
size_t n;
/* Current iteration. */
size_t cur_it;

/* Private fields */
DMatrix _proxy;
char _array[128];
} DataIter;

#define safe_malloc(ptr) \
if ((ptr) == NULL) { \
fprintf(stderr, "%s:%d: Failed to allocate memory.\n", __FILE__, \
__LINE__); \
exit(1); \
}

/**
* Initialize with random data for demo. In practice the data should be loaded
* from external memory. We jsut demonstrate how to use the iterator in
* XGBoost.
*
* \param batch_size Number of elements for each batch. The demo here is only using 1
* column.
* \param n_batches Number of batches.
*/
void DataIterator_Init(DataIter *self, size_t batch_size, size_t n_batches) {
self->n = n_batches;

self->lengths = (size_t *)malloc(self->n * sizeof(size_t));
safe_malloc(self->lengths);
for (size_t i = 0; i < self->n; ++i) {
self->lengths[i] = batch_size;
}

self->data = (float **)malloc(self->n * sizeof(float *));
safe_malloc(self->data);
self->labels = (float **)malloc(self->n * sizeof(float *));
safe_malloc(self->labels);

/* Generate some random data. */
for (size_t i = 0; i < self->n; ++i) {
self->data[i] = (float *)malloc(self->lengths[i] * sizeof(float));
safe_malloc(self->data[i]);
for (size_t j = 0; j < self->lengths[i]; ++j) {
float x = (float)rand() / (float)(RAND_MAX);
self->data[i][j] = x;
}

self->labels[i] = (float *)malloc(self->lengths[i] * sizeof(float));
safe_malloc(self->labels[i]);
for (size_t j = 0; j < self->lengths[i]; ++j) {
float y = (float)rand() / (float)(RAND_MAX);
self->labels[i][j] = y;
}
}

self->cur_it = 0;
safe_xgboost(XGProxyDMatrixCreate(&self->_proxy));
}

void DataIterator_Free(DataIter *self) {
for (size_t i = 0; i < self->n; ++i) {
free(self->data[i]);
free(self->labels[i]);
}
free(self->data);
free(self->lengths);
safe_xgboost(XGDMatrixFree(self->_proxy));
};

int DataIterator_Next(DataIterHandle handle) {
DataIter *self = (DataIter *)(handle);
if (self->cur_it == self->n) {
self->cur_it = 0;
return 0; /* At end */
}

/* A JSON string encoding array interface (standard from numpy). */
char array[] = "{\"data\": [%lu, false], \"shape\":[%lu, 1], \"typestr\": "
"\"<f4\", \"version\": 3}";
memset(self->_array, '\0', sizeof(self->_array));
sprintf(self->_array, array, (size_t)self->data[self->cur_it],
self->lengths[self->cur_it]);

safe_xgboost(XGProxyDMatrixSetDataDense(self->_proxy, self->_array));
/* The data passed in the iterator must remain valid (not being freed until the next
* iteration or reset) */
safe_xgboost(XGDMatrixSetDenseInfo(self->_proxy, "label",
self->labels[self->cur_it],
self->lengths[self->cur_it], 1));
self->cur_it++;
return 1; /* Continue. */
}

void DataIterator_Reset(DataIterHandle handle) {
DataIter *self = (DataIter *)(handle);
self->cur_it = 0;
}

/**
* Train a regression model and save it into JSON model file.
*/
void TrainModel(DMatrix Xy) {
/* Create booster for training. */
Booster booster;
DMatrix cache[] = {Xy};
safe_xgboost(XGBoosterCreate(cache, 1, &booster));
/* Use approx for external memory training. */
safe_xgboost(XGBoosterSetParam(booster, "tree_method", "approx"));
safe_xgboost(XGBoosterSetParam(booster, "objective", "reg:squarederror"));

/* Start training. */
const char *validation_names[1] = {"train"};
const char *validation_result = NULL;
size_t n_rounds = 10;
for (size_t i = 0; i < n_rounds; ++i) {
safe_xgboost(XGBoosterUpdateOneIter(booster, i, Xy));
safe_xgboost(XGBoosterEvalOneIter(booster, i, cache, validation_names, 1,
&validation_result));
printf("%s\n", validation_result);
}

/* Save the model to a JSON file. */
safe_xgboost(XGBoosterSaveModel(booster, "model.json"));

safe_xgboost(XGBoosterFree(booster));
}

int main() {
DataIter iter;
DataIterator_Init(&iter, BATCH_LEN, N_BATCHS);

/* Create DMatrix from iterator. During training, some cache files with the
* prefix "cache-" will be generated in current directory */
char config[] = "{\"missing\": NaN, \"cache_prefix\": \"cache\"}";
DMatrix Xy;
safe_xgboost(XGDMatrixCreateFromCallback(
&iter, iter._proxy, DataIterator_Reset, DataIterator_Next, config, &Xy));

TrainModel(Xy);

safe_xgboost(XGDMatrixFree(Xy));

DataIterator_Free(&iter);
return 0;
}

0 comments on commit 2fbdab5

Please sign in to comment.