MNT/TST Replace boston by synthetic dataset in ensemble test_forest (s…

…cikit-learn#16927)
jayzed82 · Oct 22, 2020 · 9399227 · 9399227
1 parent 42e5713
commit 9399227
Showing 1 changed file with 26 additions and 27 deletions.
diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py
@@ -72,12 +72,9 @@
 iris.data = iris.data[perm]
 iris.target = iris.target[perm]
 
-# also load the boston dataset
-# and randomly permute it
-boston = datasets.load_boston()
-perm = rng.permutation(boston.target.size)
-boston.data = boston.data[perm]
-boston.target = boston.target[perm]
+# Make regression dataset
+X_reg, y_reg = datasets.make_regression(n_samples=500, n_features=10,
+                                        random_state=1)
 
 # also make a hastie_10_2 dataset
 hastie_X, hastie_y = datasets.make_hastie_10_2(n_samples=20, random_state=1)
@@ -159,29 +156,29 @@ def test_iris(name, criterion):
     check_iris_criterion(name, criterion)
 
 
-def check_boston_criterion(name, criterion):
-    # Check consistency on dataset boston house prices.
+def check_regression_criterion(name, criterion):
+    # Check consistency on regression dataset.
     ForestRegressor = FOREST_REGRESSORS[name]
 
     reg = ForestRegressor(n_estimators=5, criterion=criterion,
                           random_state=1)
-    reg.fit(boston.data, boston.target)
-    score = reg.score(boston.data, boston.target)
-    assert score > 0.94, ("Failed with max_features=None, criterion %s "
+    reg.fit(X_reg, y_reg)
+    score = reg.score(X_reg, y_reg)
+    assert score > 0.93, ("Failed with max_features=None, criterion %s "
                           "and score = %f" % (criterion, score))
 
     reg = ForestRegressor(n_estimators=5, criterion=criterion,
                           max_features=6, random_state=1)
-    reg.fit(boston.data, boston.target)
-    score = reg.score(boston.data, boston.target)
-    assert score > 0.95, ("Failed with max_features=6, criterion %s "
+    reg.fit(X_reg, y_reg)
+    score = reg.score(X_reg, y_reg)
+    assert score > 0.92, ("Failed with max_features=6, criterion %s "
                           "and score = %f" % (criterion, score))
 
 
 @pytest.mark.parametrize('name', FOREST_REGRESSORS)
 @pytest.mark.parametrize('criterion', ("mse", "mae", "friedman_mse"))
-def test_boston(name, criterion):
-    check_boston_criterion(name, criterion)
+def test_regression(name, criterion):
+    check_regression_criterion(name, criterion)
 
 
 def check_regressor_attributes(name):
@@ -384,12 +381,10 @@ def check_oob_score(name, X, y, n_estimators=20):
     n_samples = X.shape[0]
     est.fit(X[:n_samples // 2, :], y[:n_samples // 2])
     test_score = est.score(X[n_samples // 2:, :], y[n_samples // 2:])
-
     if name in FOREST_CLASSIFIERS:
         assert abs(test_score - est.oob_score_) < 0.1
     else:
-        assert test_score > est.oob_score_
-        assert est.oob_score_ > .8
+        assert abs(test_score - est.oob_score_) < 0.1
 
     # Check warning if not enough estimators
     with np.errstate(divide="ignore", invalid="ignore"):
@@ -411,10 +406,10 @@ def test_oob_score_classifiers(name):
 
 @pytest.mark.parametrize('name', FOREST_REGRESSORS)
 def test_oob_score_regressors(name):
-    check_oob_score(name, boston.data, boston.target, 50)
+    check_oob_score(name, X_reg, y_reg, 50)
 
     # csc matrix
-    check_oob_score(name, csc_matrix(boston.data), boston.target, 50)
+    check_oob_score(name, csc_matrix(X_reg), y_reg, 50)
 
 
 def check_oob_score_raise_error(name):
@@ -475,11 +470,13 @@ def check_parallel(name, X, y):
 @pytest.mark.parametrize('name', FOREST_CLASSIFIERS_REGRESSORS)
 def test_parallel(name):
     if name in FOREST_CLASSIFIERS:
-        ds = iris
+        X = iris.data
+        y = iris.target
     elif name in FOREST_REGRESSORS:
-        ds = boston
+        X = X_reg
+        y = y_reg
 
-    check_parallel(name, ds.data, ds.target)
+    check_parallel(name, X, y)
 
 
 def check_pickle(name, X, y):
@@ -500,11 +497,13 @@ def check_pickle(name, X, y):
 @pytest.mark.parametrize('name', FOREST_CLASSIFIERS_REGRESSORS)
 def test_pickle(name):
     if name in FOREST_CLASSIFIERS:
-        ds = iris
+        X = iris.data
+        y = iris.target
     elif name in FOREST_REGRESSORS:
-        ds = boston
+        X = X_reg
+        y = y_reg
 
-    check_pickle(name, ds.data[::2], ds.target[::2])
+    check_pickle(name, X[::2], y[::2])
 
 
 def check_multioutput(name):