Improve external memory demo. #7320

Merged
merged 1 commit on Oct 17, 2021
demo/guide-python/external_memory.py: 37 changes (15 additions, 22 deletions)
@@ -8,23 +8,24 @@
 import os
 import xgboost
 from typing import Callable, List, Tuple
+from sklearn.datasets import make_regression
 import tempfile
 import numpy as np
 
 
 def make_batches(
-    n_samples_per_batch: int, n_features: int, n_batches: int
-) -> Tuple[List[np.ndarray], List[np.ndarray]]:
-    """Generate random batches."""
-    X = []
-    y = []
+    n_samples_per_batch: int, n_features: int, n_batches: int, tmpdir: str,
+) -> List[Tuple[str, str]]:
+    files: List[Tuple[str, str]] = []
     rng = np.random.RandomState(1994)
     for i in range(n_batches):
-        _X = rng.randn(n_samples_per_batch, n_features)
-        _y = rng.randn(n_samples_per_batch)
-        X.append(_X)
-        y.append(_y)
-    return X, y
+        X, y = make_regression(n_samples_per_batch, n_features, random_state=rng)
+        X_path = os.path.join(tmpdir, "X-" + str(i) + ".npy")
+        y_path = os.path.join(tmpdir, "y-" + str(i) + ".npy")
+        np.save(X_path, X)
+        np.save(y_path, y)
+        files.append((X_path, y_path))
+    return files
 
 
 class Iterator(xgboost.DataIter):
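
A note on the hunk above: batch generation now uses scikit-learn's `make_regression` and stores each batch as binary `.npy` files via `np.save`, replacing the text files previously written with `np.savetxt`. A minimal sketch of that round trip, assuming only NumPy and scikit-learn; the file names and sizes here are illustrative, not taken from the demo:

import os
import tempfile

import numpy as np
from sklearn.datasets import make_regression

# Generate one synthetic regression batch, mirroring the updated make_batches().
rng = np.random.RandomState(1994)
X, y = make_regression(1024, 17, random_state=rng)

with tempfile.TemporaryDirectory() as tmpdir:
    # Binary .npy files preserve dtype and shape exactly and load much faster
    # than the text files written with np.savetxt.
    X_path = os.path.join(tmpdir, "X-0.npy")
    y_path = os.path.join(tmpdir, "y-0.npy")
    np.save(X_path, X)
    np.save(y_path, y)

    X_back, y_back = np.load(X_path), np.load(y_path)
    assert X_back.shape == (1024, 17) and y_back.shape == (1024,)
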
@@ -38,8 +39,8 @@ def __init__(self, file_paths: List[Tuple[str, str]]):
 
     def load_file(self) -> Tuple[np.ndarray, np.ndarray]:
         X_path, y_path = self._file_paths[self._it]
-        X = np.loadtxt(X_path)
-        y = np.loadtxt(y_path)
+        X = np.load(X_path)
+        y = np.load(y_path)
         assert X.shape[0] == y.shape[0]
         return X, y
 
@@ -66,15 +67,7 @@ def reset(self) -> None:
 
 def main(tmpdir: str) -> xgboost.Booster:
     # generate some random data for demo
-    batches = make_batches(1024, 17, 31)
-    files = []
-    for i, (X, y) in enumerate(zip(*batches)):
-        X_path = os.path.join(tmpdir, "X-" + str(i) + ".txt")
-        np.savetxt(X_path, X)
-        y_path = os.path.join(tmpdir, "y-" + str(i) + ".txt")
-        np.savetxt(y_path, y)
-        files.append((X_path, y_path))
-
+    files = make_batches(1024, 17, 31, tmpdir)
     it = Iterator(files)
     # For non-data arguments, specify it here once instead of passing them by the `next`
     # method.
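
Regarding the `# For non-data arguments` comment above: the iterator only feeds data and labels batch by batch through `next`, while DMatrix-wide settings are given once when the `DMatrix` is constructed from the iterator. A hedged sketch of what that construction typically looks like; the keyword value is an assumption, not part of this diff:

# Build the external-memory DMatrix from the DataIter; non-data arguments such
# as `missing` are specified once here rather than on every `next` call.
Xy = xgboost.DMatrix(it, missing=np.nan)
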
@@ -83,7 +76,7 @@ def main(tmpdir: str) -> xgboost.Booster:
 
     # Other tree methods including ``hist`` and ``gpu_hist`` also work, but have some
     # caveats. This is still an experimental feature.
-    booster = xgboost.train({"tree_method": "approx"}, Xy)
+    booster = xgboost.train({"tree_method": "approx"}, Xy, evals=[(Xy, "Train")])
     return booster
 
 
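For readers following the demo from this diff alone, here is a self-contained sketch of the `xgboost.DataIter` pattern the demo builds on, driven end to end under a temporary directory. The class and file layout are illustrative (names such as `NpyIterator` and the `./cache` prefix are assumptions, and this is not a copy of the demo's collapsed `Iterator`); in the 1.5-era API, `next` pushes one batch through the `input_data` callback and returns 1, or returns 0 when the data is exhausted, and `reset` rewinds the iterator:

import os
import tempfile
from typing import Callable, List, Tuple

import numpy as np
import xgboost
from sklearn.datasets import make_regression


class NpyIterator(xgboost.DataIter):
    """Illustrative external-memory iterator over (X, y) pairs of .npy files."""

    def __init__(self, file_paths: List[Tuple[str, str]]) -> None:
        self._file_paths = file_paths
        self._it = 0
        # cache_prefix tells XGBoost where to place its external-memory cache;
        # here the cache files land in the current working directory.
        super().__init__(cache_prefix=os.path.join(".", "cache"))

    def next(self, input_data: Callable) -> int:
        if self._it == len(self._file_paths):
            return 0  # signal that all batches have been consumed
        X_path, y_path = self._file_paths[self._it]
        input_data(data=np.load(X_path), label=np.load(y_path))
        self._it += 1
        return 1  # more batches remain

    def reset(self) -> None:
        self._it = 0


if __name__ == "__main__":
    with tempfile.TemporaryDirectory() as tmpdir:
        # Write a handful of small batches the same way the updated demo does.
        files = []
        for i in range(4):
            X, y = make_regression(256, 17, random_state=i)
            X_path = os.path.join(tmpdir, "X-" + str(i) + ".npy")
            y_path = os.path.join(tmpdir, "y-" + str(i) + ".npy")
            np.save(X_path, X)
            np.save(y_path, y)
            files.append((X_path, y_path))

        it = NpyIterator(files)
        Xy = xgboost.DMatrix(it, missing=np.nan)
        booster = xgboost.train({"tree_method": "approx"}, Xy, evals=[(Xy, "Train")])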