
Replace boston in ensemble test_forest #16927

Merged: 11 commits, May 20, 2020

40 changes: 20 additions & 20 deletions sklearn/ensemble/tests/test_forest.py
@@ -72,12 +72,12 @@
 iris.data = iris.data[perm]
 iris.target = iris.target[perm]
 
-# also load the boston dataset
+# also load the diabetes dataset
 # and randomly permute it
-boston = datasets.load_boston()
-perm = rng.permutation(boston.target.size)
-boston.data = boston.data[perm]
-boston.target = boston.target[perm]
+diabetes = datasets.load_diabetes()
+perm = rng.permutation(diabetes.target.size)
+diabetes.data = diabetes.data[perm]
+diabetes.target = diabetes.target[perm]
 
 # also make a hastie_10_2 dataset
 hastie_X, hastie_y = datasets.make_hastie_10_2(n_samples=20, random_state=1)
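
As context for the swap above, a quick look at the replacement dataset (an editorial illustration, not part of the PR diff):

from sklearn import datasets

diabetes = datasets.load_diabetes()
print(diabetes.data.shape)    # (442, 10): ten baseline features
print(diabetes.target.shape)  # (442,): continuous disease-progression target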
@@ -159,29 +159,29 @@ def test_iris(name, criterion):
     check_iris_criterion(name, criterion)
 
 
-def check_boston_criterion(name, criterion):
-    # Check consistency on dataset boston house prices.
+def check_diabetes_criterion(name, criterion):
+    # Check consistency on diabetes dataset.
     ForestRegressor = FOREST_REGRESSORS[name]
 
     clf = ForestRegressor(n_estimators=5, criterion=criterion,
                           random_state=1)
-    clf.fit(boston.data, boston.target)
-    score = clf.score(boston.data, boston.target)
-    assert score > 0.94, ("Failed with max_features=None, criterion %s "
+    clf.fit(diabetes.data, diabetes.target)
+    score = clf.score(diabetes.data, diabetes.target)
+    assert score > 0.86, ("Failed with max_features=None, criterion %s "
                           "and score = %f" % (criterion, score))
 
     clf = ForestRegressor(n_estimators=5, criterion=criterion,
                           max_features=6, random_state=1)
-    clf.fit(boston.data, boston.target)
-    score = clf.score(boston.data, boston.target)
-    assert score > 0.95, ("Failed with max_features=6, criterion %s "
+    clf.fit(diabetes.data, diabetes.target)
+    score = clf.score(diabetes.data, diabetes.target)
+    assert score > 0.86, ("Failed with max_features=6, criterion %s "
                           "and score = %f" % (criterion, score))
 
 
 @pytest.mark.parametrize('name', FOREST_REGRESSORS)
 @pytest.mark.parametrize('criterion', ("mse", "mae", "friedman_mse"))
-def test_boston(name, criterion):
-    check_boston_criterion(name, criterion)
+def test_diabetes(name, criterion):
+    check_diabetes_criterion(name, criterion)
 
 
 def check_regressor_attributes(name):
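
A note on the lowered thresholds (0.94 and 0.95 for Boston versus 0.86 here): score() on a forest regressor returns the R^2 of the prediction, and a small forest fits the diabetes target less tightly than it fit Boston. A minimal sketch of the kind of check being asserted, assuming RandomForestRegressor stands in for the parametrized FOREST_REGRESSORS:

from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor

X, y = load_diabetes(return_X_y=True)
reg = RandomForestRegressor(n_estimators=5, random_state=1).fit(X, y)
# Training-set R^2, the quantity the assertions above bound from below.
print(reg.score(X, y))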
@@ -389,7 +389,7 @@ def check_oob_score(name, X, y, n_estimators=20):
         assert abs(test_score - est.oob_score_) < 0.1

Review comment (Member), on the line above:
Indeed we are already doing this for the classification. I think it makes sense to do that for the regression.
We only require a comment to mention that in the first case, this is a diff between accuracies and in the second one a diff between R2.

     else:
         assert test_score > est.oob_score_
-        assert est.oob_score_ > .8
+        assert est.oob_score_ > .32
 
     # Check warning if not enough estimators
     with np.errstate(divide="ignore", invalid="ignore"):
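
One way the clarifying comment requested by the reviewer could read, sketched against the structure of check_oob_score (hypothetical wording and helper name, not the merged code):

def _oob_consistency_sketch(est, test_score, is_classifier):
    if is_classifier:
        # Both quantities are accuracies, so this bounds a difference of accuracies.
        assert abs(test_score - est.oob_score_) < 0.1
    else:
        # Both quantities are R^2 scores; .32 is the OOB R^2 floor on diabetes.
        assert test_score > est.oob_score_
        assert est.oob_score_ > .32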
@@ -411,10 +411,10 @@ def test_oob_score_classifiers(name):
 
 @pytest.mark.parametrize('name', FOREST_REGRESSORS)
 def test_oob_score_regressors(name):
-    check_oob_score(name, boston.data, boston.target, 50)
+    check_oob_score(name, diabetes.data, diabetes.target, 50)
 
     # csc matrix
-    check_oob_score(name, csc_matrix(boston.data), boston.target, 50)
+    check_oob_score(name, csc_matrix(diabetes.data), diabetes.target, 50)
 
 
 def check_oob_score_raise_error(name):
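
For reference, an out-of-bag check of the same flavor as check_oob_score, on sparse input (a sketch assuming RandomForestRegressor and the 50 trees used above):

from scipy.sparse import csc_matrix
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor

X, y = load_diabetes(return_X_y=True)
est = RandomForestRegressor(n_estimators=50, oob_score=True, random_state=0)
est.fit(csc_matrix(X), y)
# For regressors oob_score_ is an out-of-bag R^2; the test requires > .32.
print(est.oob_score_)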
@@ -477,7 +477,7 @@ def test_parallel(name):
     if name in FOREST_CLASSIFIERS:
         ds = iris
     elif name in FOREST_REGRESSORS:
-        ds = boston
+        ds = diabetes
 
     check_parallel(name, ds.data, ds.target)
 
@@ -502,7 +502,7 @@ def test_pickle(name):
     if name in FOREST_CLASSIFIERS:
         ds = iris
     elif name in FOREST_REGRESSORS:
-        ds = boston
+        ds = diabetes
 
     check_pickle(name, ds.data[::2], ds.target[::2])
 
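
Finally, a sketch of the pickle round-trip that check_pickle performs, on the same half of the diabetes data the test passes in (illustrative, assuming RandomForestRegressor; not the test's own code):

import pickle

from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor

X, y = load_diabetes(return_X_y=True)
est = RandomForestRegressor(n_estimators=10, random_state=1).fit(X[::2], y[::2])
restored = pickle.loads(pickle.dumps(est))
# A round-tripped forest should reproduce the original training score exactly.
assert restored.score(X[::2], y[::2]) == est.score(X[::2], y[::2])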