reconnect moved files to git repo

root
2025-08-01 04:33:03 -04:00
commit 5d3c35492d
23190 changed files with 4750716 additions and 0 deletions


@@ -0,0 +1,976 @@
"""
Testing for the bagging ensemble module (sklearn.ensemble.bagging).
"""
# Author: Gilles Louppe
# License: BSD 3 clause
from itertools import cycle, product
import joblib
import numpy as np
import pytest
import sklearn
from sklearn.base import BaseEstimator
from sklearn.datasets import load_diabetes, load_iris, make_hastie_10_2
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.ensemble import (
AdaBoostClassifier,
AdaBoostRegressor,
BaggingClassifier,
BaggingRegressor,
HistGradientBoostingClassifier,
HistGradientBoostingRegressor,
RandomForestClassifier,
RandomForestRegressor,
)
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.model_selection import GridSearchCV, ParameterGrid, train_test_split
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, scale
from sklearn.random_projection import SparseRandomProjection
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.utils import check_random_state
from sklearn.utils._testing import assert_array_almost_equal, assert_array_equal
from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS
rng = check_random_state(0)
# load the iris dataset
# and randomly permute it
iris = load_iris()
perm = rng.permutation(iris.target.size)
iris.data = iris.data[perm]
iris.target = iris.target[perm]
# also load the diabetes dataset
# and randomly permute it
diabetes = load_diabetes()
perm = rng.permutation(diabetes.target.size)
diabetes.data = diabetes.data[perm]
diabetes.target = diabetes.target[perm]
def test_classification():
# Check classification for various parameter settings.
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(
iris.data, iris.target, random_state=rng
)
grid = ParameterGrid(
{
"max_samples": [0.5, 1.0],
"max_features": [1, 4],
"bootstrap": [True, False],
"bootstrap_features": [True, False],
}
)
estimators = [
None,
DummyClassifier(),
Perceptron(max_iter=20),
DecisionTreeClassifier(max_depth=2),
KNeighborsClassifier(),
SVC(),
]
# Try different parameter settings with different base classifiers without
# doing the full cartesian product to keep the test durations low.
for params, estimator in zip(grid, cycle(estimators)):
BaggingClassifier(
estimator=estimator,
random_state=rng,
n_estimators=2,
**params,
).fit(X_train, y_train).predict(X_test)
@pytest.mark.parametrize(
"sparse_container, params, method",
product(
CSR_CONTAINERS + CSC_CONTAINERS,
[
{
"max_samples": 0.5,
"max_features": 2,
"bootstrap": True,
"bootstrap_features": True,
},
{
"max_samples": 1.0,
"max_features": 4,
"bootstrap": True,
"bootstrap_features": True,
},
{"max_features": 2, "bootstrap": False, "bootstrap_features": True},
{"max_samples": 0.5, "bootstrap": True, "bootstrap_features": False},
],
["predict", "predict_proba", "predict_log_proba", "decision_function"],
),
)
def test_sparse_classification(sparse_container, params, method):
# Check classification for various parameter settings on sparse input.
class CustomSVC(SVC):
"""SVC variant that records the nature of the training set"""
def fit(self, X, y):
super().fit(X, y)
self.data_type_ = type(X)
return self
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(
scale(iris.data), iris.target, random_state=rng
)
X_train_sparse = sparse_container(X_train)
X_test_sparse = sparse_container(X_test)
# Trained on sparse format
sparse_classifier = BaggingClassifier(
estimator=CustomSVC(kernel="linear", decision_function_shape="ovr"),
random_state=1,
**params,
).fit(X_train_sparse, y_train)
sparse_results = getattr(sparse_classifier, method)(X_test_sparse)
# Trained on dense format
dense_classifier = BaggingClassifier(
estimator=CustomSVC(kernel="linear", decision_function_shape="ovr"),
random_state=1,
**params,
).fit(X_train, y_train)
dense_results = getattr(dense_classifier, method)(X_test)
assert_array_almost_equal(sparse_results, dense_results)
sparse_type = type(X_train_sparse)
types = [i.data_type_ for i in sparse_classifier.estimators_]
assert all([t == sparse_type for t in types])
def test_regression():
# Check regression for various parameter settings.
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(
diabetes.data[:50], diabetes.target[:50], random_state=rng
)
grid = ParameterGrid(
{
"max_samples": [0.5, 1.0],
"max_features": [0.5, 1.0],
"bootstrap": [True, False],
"bootstrap_features": [True, False],
}
)
for estimator in [
None,
DummyRegressor(),
DecisionTreeRegressor(),
KNeighborsRegressor(),
SVR(),
]:
for params in grid:
BaggingRegressor(estimator=estimator, random_state=rng, **params).fit(
X_train, y_train
).predict(X_test)
@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS)
def test_sparse_regression(sparse_container):
# Check regression for various parameter settings on sparse input.
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(
diabetes.data[:50], diabetes.target[:50], random_state=rng
)
class CustomSVR(SVR):
"""SVC variant that records the nature of the training set"""
def fit(self, X, y):
super().fit(X, y)
self.data_type_ = type(X)
return self
parameter_sets = [
{
"max_samples": 0.5,
"max_features": 2,
"bootstrap": True,
"bootstrap_features": True,
},
{
"max_samples": 1.0,
"max_features": 4,
"bootstrap": True,
"bootstrap_features": True,
},
{"max_features": 2, "bootstrap": False, "bootstrap_features": True},
{"max_samples": 0.5, "bootstrap": True, "bootstrap_features": False},
]
X_train_sparse = sparse_container(X_train)
X_test_sparse = sparse_container(X_test)
for params in parameter_sets:
# Trained on sparse format
sparse_classifier = BaggingRegressor(
estimator=CustomSVR(), random_state=1, **params
).fit(X_train_sparse, y_train)
sparse_results = sparse_classifier.predict(X_test_sparse)
# Trained on dense format
dense_results = (
BaggingRegressor(estimator=CustomSVR(), random_state=1, **params)
.fit(X_train, y_train)
.predict(X_test)
)
sparse_type = type(X_train_sparse)
types = [i.data_type_ for i in sparse_classifier.estimators_]
assert_array_almost_equal(sparse_results, dense_results)
assert all([t == sparse_type for t in types])
assert_array_almost_equal(sparse_results, dense_results)
class DummySizeEstimator(BaseEstimator):
def fit(self, X, y):
self.training_size_ = X.shape[0]
self.training_hash_ = joblib.hash(X)
def predict(self, X):
return np.ones(X.shape[0])
def test_bootstrap_samples():
# Test that bootstrapping samples generates non-perfect base estimators.
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(
diabetes.data, diabetes.target, random_state=rng
)
estimator = DecisionTreeRegressor().fit(X_train, y_train)
# without bootstrap, all trees are perfect on the training set
ensemble = BaggingRegressor(
estimator=DecisionTreeRegressor(),
max_samples=1.0,
bootstrap=False,
random_state=rng,
).fit(X_train, y_train)
assert estimator.score(X_train, y_train) == ensemble.score(X_train, y_train)
# with bootstrap, trees are no longer perfect on the training set
ensemble = BaggingRegressor(
estimator=DecisionTreeRegressor(),
max_samples=1.0,
bootstrap=True,
random_state=rng,
).fit(X_train, y_train)
assert estimator.score(X_train, y_train) > ensemble.score(X_train, y_train)
# check that each sampling corresponds to a complete bootstrap resample.
# the size of each bootstrap sample should be the same as the input data but
# the data should be different (checked using the hash of the data).
ensemble = BaggingRegressor(estimator=DummySizeEstimator(), bootstrap=True).fit(
X_train, y_train
)
training_hash = []
for estimator in ensemble.estimators_:
assert estimator.training_size_ == X_train.shape[0]
training_hash.append(estimator.training_hash_)
assert len(set(training_hash)) == len(training_hash)
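# Illustrative sketch (not part of the original test suite): a bootstrap
# resample of size n drawn with replacement contains on average about
# 1 - (1 - 1/n)**n ~= 63% unique rows, which is why the bootstrapped trees
# above are no longer perfect on the full training set while the
# non-bootstrapped ensemble still is.
def _bootstrap_unique_fraction(n_samples, seed=0):
    # hypothetical helper, only meant to illustrate the effect
    indices = check_random_state(seed).randint(0, n_samples, n_samples)
    return np.unique(indices).shape[0] / n_samples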
def test_bootstrap_features():
# Test that bootstrapping features may generate duplicate features.
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(
diabetes.data, diabetes.target, random_state=rng
)
ensemble = BaggingRegressor(
estimator=DecisionTreeRegressor(),
max_features=1.0,
bootstrap_features=False,
random_state=rng,
).fit(X_train, y_train)
for features in ensemble.estimators_features_:
assert diabetes.data.shape[1] == np.unique(features).shape[0]
ensemble = BaggingRegressor(
estimator=DecisionTreeRegressor(),
max_features=1.0,
bootstrap_features=True,
random_state=rng,
).fit(X_train, y_train)
for features in ensemble.estimators_features_:
assert diabetes.data.shape[1] > np.unique(features).shape[0]
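# Illustrative note (assumption, not part of the original tests): with
# bootstrap_features=True the d feature indices are drawn with replacement
# from d candidates, so at least one duplicate appears with probability
# 1 - d! / d**d (about 0.9996 for the d=10 diabetes features), which is why
# np.unique(features) above has strictly fewer entries than the input has
# columns.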
def test_probability():
# Predict probabilities.
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(
iris.data, iris.target, random_state=rng
)
with np.errstate(divide="ignore", invalid="ignore"):
# Normal case
ensemble = BaggingClassifier(
estimator=DecisionTreeClassifier(), random_state=rng
).fit(X_train, y_train)
assert_array_almost_equal(
np.sum(ensemble.predict_proba(X_test), axis=1), np.ones(len(X_test))
)
assert_array_almost_equal(
ensemble.predict_proba(X_test), np.exp(ensemble.predict_log_proba(X_test))
)
# Degenerate case, where some classes are missing
ensemble = BaggingClassifier(
estimator=LogisticRegression(), random_state=rng, max_samples=5
).fit(X_train, y_train)
assert_array_almost_equal(
np.sum(ensemble.predict_proba(X_test), axis=1), np.ones(len(X_test))
)
assert_array_almost_equal(
ensemble.predict_proba(X_test), np.exp(ensemble.predict_log_proba(X_test))
)
def test_oob_score_classification():
# Check that oob prediction is a good estimation of the generalization
# error.
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(
iris.data, iris.target, random_state=rng
)
for estimator in [DecisionTreeClassifier(), SVC()]:
clf = BaggingClassifier(
estimator=estimator,
n_estimators=100,
bootstrap=True,
oob_score=True,
random_state=rng,
).fit(X_train, y_train)
test_score = clf.score(X_test, y_test)
assert abs(test_score - clf.oob_score_) < 0.1
# Test with few estimators
warn_msg = (
"Some inputs do not have OOB scores. This probably means too few "
"estimators were used to compute any reliable oob estimates."
)
with pytest.warns(UserWarning, match=warn_msg):
clf = BaggingClassifier(
estimator=estimator,
n_estimators=1,
bootstrap=True,
oob_score=True,
random_state=rng,
)
clf.fit(X_train, y_train)
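# Illustrative note (assumption, not part of the original tests): each
# bootstrapped base estimator leaves out on average (1 - 1/n)**n ~= 1/e
# ~= 36.8% of the training rows; these out-of-bag rows supply the held-out
# predictions behind ``oob_score_``, which is why it tracks the test score
# above. With a single estimator, the ~63% of rows that are in-bag get no
# OOB prediction at all, hence the UserWarning checked above.
_expected_oob_fraction = (1 - 1 / 100) ** 100  # ~0.366 for n=100, close to 1/e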
def test_oob_score_regression():
# Check that oob prediction is a good estimation of the generalization
# error.
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(
diabetes.data, diabetes.target, random_state=rng
)
clf = BaggingRegressor(
estimator=DecisionTreeRegressor(),
n_estimators=50,
bootstrap=True,
oob_score=True,
random_state=rng,
).fit(X_train, y_train)
test_score = clf.score(X_test, y_test)
assert abs(test_score - clf.oob_score_) < 0.1
# Test with few estimators
warn_msg = (
"Some inputs do not have OOB scores. This probably means too few "
"estimators were used to compute any reliable oob estimates."
)
with pytest.warns(UserWarning, match=warn_msg):
regr = BaggingRegressor(
estimator=DecisionTreeRegressor(),
n_estimators=1,
bootstrap=True,
oob_score=True,
random_state=rng,
)
regr.fit(X_train, y_train)
def test_single_estimator():
# Check singleton ensembles.
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(
diabetes.data, diabetes.target, random_state=rng
)
clf1 = BaggingRegressor(
estimator=KNeighborsRegressor(),
n_estimators=1,
bootstrap=False,
bootstrap_features=False,
random_state=rng,
).fit(X_train, y_train)
clf2 = KNeighborsRegressor().fit(X_train, y_train)
assert_array_almost_equal(clf1.predict(X_test), clf2.predict(X_test))
def test_error():
# Check that the bagging classifier does not expose decision_function when
# the base estimator does not implement it.
X, y = iris.data, iris.target
base = DecisionTreeClassifier()
assert not hasattr(BaggingClassifier(base).fit(X, y), "decision_function")
def test_parallel_classification():
# Check parallel classification.
X_train, X_test, y_train, y_test = train_test_split(
iris.data, iris.target, random_state=0
)
ensemble = BaggingClassifier(
DecisionTreeClassifier(), n_jobs=3, random_state=0
).fit(X_train, y_train)
# predict_proba
y1 = ensemble.predict_proba(X_test)
ensemble.set_params(n_jobs=1)
y2 = ensemble.predict_proba(X_test)
assert_array_almost_equal(y1, y2)
ensemble = BaggingClassifier(
DecisionTreeClassifier(), n_jobs=1, random_state=0
).fit(X_train, y_train)
y3 = ensemble.predict_proba(X_test)
assert_array_almost_equal(y1, y3)
# decision_function
ensemble = BaggingClassifier(
SVC(decision_function_shape="ovr"), n_jobs=3, random_state=0
).fit(X_train, y_train)
decisions1 = ensemble.decision_function(X_test)
ensemble.set_params(n_jobs=1)
decisions2 = ensemble.decision_function(X_test)
assert_array_almost_equal(decisions1, decisions2)
ensemble = BaggingClassifier(
SVC(decision_function_shape="ovr"), n_jobs=1, random_state=0
).fit(X_train, y_train)
decisions3 = ensemble.decision_function(X_test)
assert_array_almost_equal(decisions1, decisions3)
def test_parallel_regression():
# Check parallel regression.
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(
diabetes.data, diabetes.target, random_state=rng
)
ensemble = BaggingRegressor(DecisionTreeRegressor(), n_jobs=3, random_state=0).fit(
X_train, y_train
)
ensemble.set_params(n_jobs=1)
y1 = ensemble.predict(X_test)
ensemble.set_params(n_jobs=2)
y2 = ensemble.predict(X_test)
assert_array_almost_equal(y1, y2)
ensemble = BaggingRegressor(DecisionTreeRegressor(), n_jobs=1, random_state=0).fit(
X_train, y_train
)
y3 = ensemble.predict(X_test)
assert_array_almost_equal(y1, y3)
def test_gridsearch():
# Check that bagging ensembles can be grid-searched.
# Transform iris into a binary classification task
X, y = iris.data, iris.target
y[y == 2] = 1
# Grid search with scoring based on decision_function
parameters = {"n_estimators": (1, 2), "estimator__C": (1, 2)}
GridSearchCV(BaggingClassifier(SVC()), parameters, scoring="roc_auc").fit(X, y)
def test_estimator():
# Check estimator and its default values.
rng = check_random_state(0)
# Classification
X_train, X_test, y_train, y_test = train_test_split(
iris.data, iris.target, random_state=rng
)
ensemble = BaggingClassifier(None, n_jobs=3, random_state=0).fit(X_train, y_train)
assert isinstance(ensemble.estimator_, DecisionTreeClassifier)
ensemble = BaggingClassifier(
DecisionTreeClassifier(), n_jobs=3, random_state=0
).fit(X_train, y_train)
assert isinstance(ensemble.estimator_, DecisionTreeClassifier)
ensemble = BaggingClassifier(Perceptron(), n_jobs=3, random_state=0).fit(
X_train, y_train
)
assert isinstance(ensemble.estimator_, Perceptron)
# Regression
X_train, X_test, y_train, y_test = train_test_split(
diabetes.data, diabetes.target, random_state=rng
)
ensemble = BaggingRegressor(None, n_jobs=3, random_state=0).fit(X_train, y_train)
assert isinstance(ensemble.estimator_, DecisionTreeRegressor)
ensemble = BaggingRegressor(DecisionTreeRegressor(), n_jobs=3, random_state=0).fit(
X_train, y_train
)
assert isinstance(ensemble.estimator_, DecisionTreeRegressor)
ensemble = BaggingRegressor(SVR(), n_jobs=3, random_state=0).fit(X_train, y_train)
assert isinstance(ensemble.estimator_, SVR)
def test_bagging_with_pipeline():
estimator = BaggingClassifier(
make_pipeline(SelectKBest(k=1), DecisionTreeClassifier()), max_features=2
)
estimator.fit(iris.data, iris.target)
assert isinstance(estimator[0].steps[-1][1].random_state, int)
class DummyZeroEstimator(BaseEstimator):
def fit(self, X, y):
self.classes_ = np.unique(y)
return self
def predict(self, X):
return self.classes_[np.zeros(X.shape[0], dtype=int)]
def test_bagging_sample_weight_unsupported_but_passed():
estimator = BaggingClassifier(DummyZeroEstimator())
rng = check_random_state(0)
estimator.fit(iris.data, iris.target).predict(iris.data)
with pytest.raises(ValueError):
estimator.fit(
iris.data,
iris.target,
sample_weight=rng.randint(10, size=(iris.data.shape[0])),
)
def test_warm_start(random_state=42):
# Test if fitting incrementally with warm start gives a forest of the
# right size and the same results as a normal fit.
X, y = make_hastie_10_2(n_samples=20, random_state=1)
clf_ws = None
for n_estimators in [5, 10]:
if clf_ws is None:
clf_ws = BaggingClassifier(
n_estimators=n_estimators, random_state=random_state, warm_start=True
)
else:
clf_ws.set_params(n_estimators=n_estimators)
clf_ws.fit(X, y)
assert len(clf_ws) == n_estimators
clf_no_ws = BaggingClassifier(
n_estimators=10, random_state=random_state, warm_start=False
)
clf_no_ws.fit(X, y)
assert set([tree.random_state for tree in clf_ws]) == set(
[tree.random_state for tree in clf_no_ws]
)
def test_warm_start_smaller_n_estimators():
# Test that a warm-started second fit with a smaller n_estimators raises an error.
X, y = make_hastie_10_2(n_samples=20, random_state=1)
clf = BaggingClassifier(n_estimators=5, warm_start=True)
clf.fit(X, y)
clf.set_params(n_estimators=4)
with pytest.raises(ValueError):
clf.fit(X, y)
def test_warm_start_equal_n_estimators():
# Test that nothing happens when fitting without increasing n_estimators
X, y = make_hastie_10_2(n_samples=20, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)
clf = BaggingClassifier(n_estimators=5, warm_start=True, random_state=83)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# modify X to nonsense values, this should not change anything
X_train += 1.0
warn_msg = "Warm-start fitting without increasing n_estimators does not"
with pytest.warns(UserWarning, match=warn_msg):
clf.fit(X_train, y_train)
assert_array_equal(y_pred, clf.predict(X_test))
def test_warm_start_equivalence():
# warm started classifier with 5+5 estimators should be equivalent to
# one classifier with 10 estimators
X, y = make_hastie_10_2(n_samples=20, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)
clf_ws = BaggingClassifier(n_estimators=5, warm_start=True, random_state=3141)
clf_ws.fit(X_train, y_train)
clf_ws.set_params(n_estimators=10)
clf_ws.fit(X_train, y_train)
y1 = clf_ws.predict(X_test)
clf = BaggingClassifier(n_estimators=10, warm_start=False, random_state=3141)
clf.fit(X_train, y_train)
y2 = clf.predict(X_test)
assert_array_almost_equal(y1, y2)
def test_warm_start_with_oob_score_fails():
# Check that using oob_score and warm_start simultaneously fails
X, y = make_hastie_10_2(n_samples=20, random_state=1)
clf = BaggingClassifier(n_estimators=5, warm_start=True, oob_score=True)
with pytest.raises(ValueError):
clf.fit(X, y)
def test_oob_score_removed_on_warm_start():
X, y = make_hastie_10_2(n_samples=100, random_state=1)
clf = BaggingClassifier(n_estimators=5, oob_score=True)
clf.fit(X, y)
clf.set_params(warm_start=True, oob_score=False, n_estimators=10)
clf.fit(X, y)
with pytest.raises(AttributeError):
getattr(clf, "oob_score_")
def test_oob_score_consistency():
# Make sure OOB scores are identical when random_state, estimator, and
# training data are fixed and fitting is done twice
X, y = make_hastie_10_2(n_samples=200, random_state=1)
bagging = BaggingClassifier(
KNeighborsClassifier(),
max_samples=0.5,
max_features=0.5,
oob_score=True,
random_state=1,
)
assert bagging.fit(X, y).oob_score_ == bagging.fit(X, y).oob_score_
def test_estimators_samples():
# Check that format of estimators_samples_ is correct and that results
# generated at fit time can be identically reproduced at a later time
# using data saved in object attributes.
X, y = make_hastie_10_2(n_samples=200, random_state=1)
bagging = BaggingClassifier(
LogisticRegression(),
max_samples=0.5,
max_features=0.5,
random_state=1,
bootstrap=False,
)
bagging.fit(X, y)
# Get relevant attributes
estimators_samples = bagging.estimators_samples_
estimators_features = bagging.estimators_features_
estimators = bagging.estimators_
# Test for correct formatting
assert len(estimators_samples) == len(estimators)
assert len(estimators_samples[0]) == len(X) // 2
assert estimators_samples[0].dtype.kind == "i"
# Re-fit single estimator to test for consistent sampling
estimator_index = 0
estimator_samples = estimators_samples[estimator_index]
estimator_features = estimators_features[estimator_index]
estimator = estimators[estimator_index]
X_train = (X[estimator_samples])[:, estimator_features]
y_train = y[estimator_samples]
orig_coefs = estimator.coef_
estimator.fit(X_train, y_train)
new_coefs = estimator.coef_
assert_array_almost_equal(orig_coefs, new_coefs)
def test_estimators_samples_deterministic():
# This test is a regression test to check that with a random step
# (e.g. SparseRandomProjection) and a given random state, the results
# generated at fit time can be identically reproduced at a later time using
# data saved in object attributes. Check issue #9524 for full discussion.
iris = load_iris()
X, y = iris.data, iris.target
base_pipeline = make_pipeline(
SparseRandomProjection(n_components=2), LogisticRegression()
)
clf = BaggingClassifier(estimator=base_pipeline, max_samples=0.5, random_state=0)
clf.fit(X, y)
pipeline_estimator_coef = clf.estimators_[0].steps[-1][1].coef_.copy()
estimator = clf.estimators_[0]
estimator_sample = clf.estimators_samples_[0]
estimator_feature = clf.estimators_features_[0]
X_train = (X[estimator_sample])[:, estimator_feature]
y_train = y[estimator_sample]
estimator.fit(X_train, y_train)
assert_array_equal(estimator.steps[-1][1].coef_, pipeline_estimator_coef)
def test_max_samples_consistency():
# Make sure validated max_samples and original max_samples are identical
# when a valid integer max_samples is supplied by the user
max_samples = 100
X, y = make_hastie_10_2(n_samples=2 * max_samples, random_state=1)
bagging = BaggingClassifier(
KNeighborsClassifier(),
max_samples=max_samples,
max_features=0.5,
random_state=1,
)
bagging.fit(X, y)
assert bagging._max_samples == max_samples
def test_set_oob_score_label_encoding():
# Make sure the oob_score doesn't change when the labels change
# See: https://github.com/scikit-learn/scikit-learn/issues/8933
random_state = 5
X = [[-1], [0], [1]] * 5
Y1 = ["A", "B", "C"] * 5
Y2 = [-1, 0, 1] * 5
Y3 = [0, 1, 2] * 5
x1 = (
BaggingClassifier(oob_score=True, random_state=random_state)
.fit(X, Y1)
.oob_score_
)
x2 = (
BaggingClassifier(oob_score=True, random_state=random_state)
.fit(X, Y2)
.oob_score_
)
x3 = (
BaggingClassifier(oob_score=True, random_state=random_state)
.fit(X, Y3)
.oob_score_
)
assert [x1, x2] == [x3, x3]
def replace(X):
X = X.astype("float", copy=True)
X[~np.isfinite(X)] = 0
return X
def test_bagging_regressor_with_missing_inputs():
# Check that BaggingRegressor can accept X with missing/infinite data
X = np.array(
[
[1, 3, 5],
[2, None, 6],
[2, np.nan, 6],
[2, np.inf, 6],
[2, -np.inf, 6],
]
)
y_values = [
np.array([2, 3, 3, 3, 3]),
np.array(
[
[2, 1, 9],
[3, 6, 8],
[3, 6, 8],
[3, 6, 8],
[3, 6, 8],
]
),
]
for y in y_values:
regressor = DecisionTreeRegressor()
pipeline = make_pipeline(FunctionTransformer(replace), regressor)
pipeline.fit(X, y).predict(X)
bagging_regressor = BaggingRegressor(pipeline)
y_hat = bagging_regressor.fit(X, y).predict(X)
assert y.shape == y_hat.shape
# Verify that exceptions can be raised by wrapper regressor
regressor = DecisionTreeRegressor()
pipeline = make_pipeline(regressor)
with pytest.raises(ValueError):
pipeline.fit(X, y)
bagging_regressor = BaggingRegressor(pipeline)
with pytest.raises(ValueError):
bagging_regressor.fit(X, y)
def test_bagging_classifier_with_missing_inputs():
# Check that BaggingClassifier can accept X with missing/infinite data
X = np.array(
[
[1, 3, 5],
[2, None, 6],
[2, np.nan, 6],
[2, np.inf, 6],
[2, -np.inf, 6],
]
)
y = np.array([3, 6, 6, 6, 6])
classifier = DecisionTreeClassifier()
pipeline = make_pipeline(FunctionTransformer(replace), classifier)
pipeline.fit(X, y).predict(X)
bagging_classifier = BaggingClassifier(pipeline)
bagging_classifier.fit(X, y)
y_hat = bagging_classifier.predict(X)
assert y.shape == y_hat.shape
bagging_classifier.predict_log_proba(X)
bagging_classifier.predict_proba(X)
# Verify that exceptions can be raised by wrapper classifier
classifier = DecisionTreeClassifier()
pipeline = make_pipeline(classifier)
with pytest.raises(ValueError):
pipeline.fit(X, y)
bagging_classifier = BaggingClassifier(pipeline)
with pytest.raises(ValueError):
bagging_classifier.fit(X, y)
def test_bagging_small_max_features():
# Check that Bagging estimator can accept low fractional max_features
X = np.array([[1, 2], [3, 4]])
y = np.array([1, 0])
bagging = BaggingClassifier(LogisticRegression(), max_features=0.3, random_state=1)
bagging.fit(X, y)
def test_bagging_get_estimators_indices():
# Check that Bagging estimator can generate sample indices properly
# Non-regression test for:
# https://github.com/scikit-learn/scikit-learn/issues/16436
rng = np.random.RandomState(0)
X = rng.randn(13, 4)
y = np.arange(13)
class MyEstimator(DecisionTreeRegressor):
"""An estimator which stores y indices information at fit."""
def fit(self, X, y):
self._sample_indices = y
clf = BaggingRegressor(estimator=MyEstimator(), n_estimators=1, random_state=0)
clf.fit(X, y)
assert_array_equal(clf.estimators_[0]._sample_indices, clf.estimators_samples_[0])
@pytest.mark.parametrize(
"bagging, expected_allow_nan",
[
(BaggingClassifier(HistGradientBoostingClassifier(max_iter=1)), True),
(BaggingRegressor(HistGradientBoostingRegressor(max_iter=1)), True),
(BaggingClassifier(LogisticRegression()), False),
(BaggingRegressor(SVR()), False),
],
)
def test_bagging_allow_nan_tag(bagging, expected_allow_nan):
"""Check that bagging inherits allow_nan tag."""
assert bagging._get_tags()["allow_nan"] == expected_allow_nan
@pytest.mark.parametrize(
"model",
[
BaggingClassifier(
estimator=RandomForestClassifier(n_estimators=1), n_estimators=1
),
BaggingRegressor(
estimator=RandomForestRegressor(n_estimators=1), n_estimators=1
),
],
)
def test_bagging_with_metadata_routing(model):
"""Make sure that metadata routing works with non-default estimator."""
with sklearn.config_context(enable_metadata_routing=True):
model.fit(iris.data, iris.target)
@pytest.mark.parametrize(
"model",
[
BaggingClassifier(
estimator=AdaBoostClassifier(n_estimators=1, algorithm="SAMME"),
n_estimators=1,
),
BaggingRegressor(estimator=AdaBoostRegressor(n_estimators=1), n_estimators=1),
],
)
def test_bagging_without_support_metadata_routing(model):
"""Make sure that we still can use an estimator that does not implement the
metadata routing."""
model.fit(iris.data, iris.target)


@@ -0,0 +1,109 @@
"""
Testing for the base module (sklearn.ensemble.base).
"""
# Authors: Gilles Louppe
# License: BSD 3 clause
from collections import OrderedDict
import numpy as np
from sklearn.datasets import load_iris
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble._base import _set_random_states
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Perceptron
from sklearn.pipeline import Pipeline
def test_base():
# Check BaseEnsemble methods.
ensemble = BaggingClassifier(
estimator=Perceptron(random_state=None), n_estimators=3
)
iris = load_iris()
ensemble.fit(iris.data, iris.target)
ensemble.estimators_ = [] # empty the list and create estimators manually
ensemble._make_estimator()
random_state = np.random.RandomState(3)
ensemble._make_estimator(random_state=random_state)
ensemble._make_estimator(random_state=random_state)
ensemble._make_estimator(append=False)
assert 3 == len(ensemble)
assert 3 == len(ensemble.estimators_)
assert isinstance(ensemble[0], Perceptron)
assert ensemble[0].random_state is None
assert isinstance(ensemble[1].random_state, int)
assert isinstance(ensemble[2].random_state, int)
assert ensemble[1].random_state != ensemble[2].random_state
np_int_ensemble = BaggingClassifier(
estimator=Perceptron(), n_estimators=np.int32(3)
)
np_int_ensemble.fit(iris.data, iris.target)
def test_set_random_states():
# Linear Discriminant Analysis doesn't have random state: smoke test
_set_random_states(LinearDiscriminantAnalysis(), random_state=17)
clf1 = Perceptron(random_state=None)
assert clf1.random_state is None
# check that random_state=None still sets an integer random_state
_set_random_states(clf1, None)
assert isinstance(clf1.random_state, int)
# check that a fixed random_state results in consistent initialisation
_set_random_states(clf1, 3)
assert isinstance(clf1.random_state, int)
clf2 = Perceptron(random_state=None)
_set_random_states(clf2, 3)
assert clf1.random_state == clf2.random_state
# nested random_state
def make_steps():
return [
("sel", SelectFromModel(Perceptron(random_state=None))),
("clf", Perceptron(random_state=None)),
]
est1 = Pipeline(make_steps())
_set_random_states(est1, 3)
assert isinstance(est1.steps[0][1].estimator.random_state, int)
assert isinstance(est1.steps[1][1].random_state, int)
assert (
est1.get_params()["sel__estimator__random_state"]
!= est1.get_params()["clf__random_state"]
)
# ensure multiple random_state parameters are invariant to get_params()
# iteration order
class AlphaParamPipeline(Pipeline):
def get_params(self, *args, **kwargs):
params = Pipeline.get_params(self, *args, **kwargs).items()
return OrderedDict(sorted(params))
class RevParamPipeline(Pipeline):
def get_params(self, *args, **kwargs):
params = Pipeline.get_params(self, *args, **kwargs).items()
return OrderedDict(sorted(params, reverse=True))
for cls in [AlphaParamPipeline, RevParamPipeline]:
est2 = cls(make_steps())
_set_random_states(est2, 3)
assert (
est1.get_params()["sel__estimator__random_state"]
== est2.get_params()["sel__estimator__random_state"]
)
assert (
est1.get_params()["clf__random_state"]
== est2.get_params()["clf__random_state"]
)


@@ -0,0 +1,262 @@
import numpy as np
import pytest
from sklearn.base import ClassifierMixin, clone, is_classifier
from sklearn.datasets import (
load_diabetes,
load_iris,
make_classification,
make_regression,
)
from sklearn.ensemble import (
RandomForestClassifier,
RandomForestRegressor,
StackingClassifier,
StackingRegressor,
VotingClassifier,
VotingRegressor,
)
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC, SVR, LinearSVC, LinearSVR
X, y = load_iris(return_X_y=True)
X_r, y_r = load_diabetes(return_X_y=True)
@pytest.mark.parametrize(
"X, y, estimator",
[
(
*make_classification(n_samples=10),
StackingClassifier(
estimators=[
("lr", LogisticRegression()),
("svm", LinearSVC()),
("rf", RandomForestClassifier(n_estimators=5, max_depth=3)),
],
cv=2,
),
),
(
*make_classification(n_samples=10),
VotingClassifier(
estimators=[
("lr", LogisticRegression()),
("svm", LinearSVC()),
("rf", RandomForestClassifier(n_estimators=5, max_depth=3)),
]
),
),
(
*make_regression(n_samples=10),
StackingRegressor(
estimators=[
("lr", LinearRegression()),
("svm", LinearSVR()),
("rf", RandomForestRegressor(n_estimators=5, max_depth=3)),
],
cv=2,
),
),
(
*make_regression(n_samples=10),
VotingRegressor(
estimators=[
("lr", LinearRegression()),
("svm", LinearSVR()),
("rf", RandomForestRegressor(n_estimators=5, max_depth=3)),
]
),
),
],
ids=[
"stacking-classifier",
"voting-classifier",
"stacking-regressor",
"voting-regressor",
],
)
def test_ensemble_heterogeneous_estimators_behavior(X, y, estimator):
# check that the behavior of `estimators`, `estimators_`,
# `named_estimators`, `named_estimators_` is consistent across all
# ensemble classes and when using `set_params()`.
# before fit
assert "svm" in estimator.named_estimators
assert estimator.named_estimators.svm is estimator.estimators[1][1]
assert estimator.named_estimators.svm is estimator.named_estimators["svm"]
# check fitted attributes
estimator.fit(X, y)
assert len(estimator.named_estimators) == 3
assert len(estimator.named_estimators_) == 3
assert sorted(list(estimator.named_estimators_.keys())) == sorted(
["lr", "svm", "rf"]
)
# check that set_params() does not add a new attribute
estimator_new_params = clone(estimator)
svm_estimator = SVC() if is_classifier(estimator) else SVR()
estimator_new_params.set_params(svm=svm_estimator).fit(X, y)
assert not hasattr(estimator_new_params, "svm")
assert (
estimator_new_params.named_estimators.lr.get_params()
== estimator.named_estimators.lr.get_params()
)
assert (
estimator_new_params.named_estimators.rf.get_params()
== estimator.named_estimators.rf.get_params()
)
# check the behavior when setting and dropping an estimator
estimator_dropped = clone(estimator)
estimator_dropped.set_params(svm="drop")
estimator_dropped.fit(X, y)
assert len(estimator_dropped.named_estimators) == 3
assert estimator_dropped.named_estimators.svm == "drop"
assert len(estimator_dropped.named_estimators_) == 3
assert sorted(list(estimator_dropped.named_estimators_.keys())) == sorted(
["lr", "svm", "rf"]
)
for sub_est in estimator_dropped.named_estimators_:
# check that the correspondence is correct
assert not isinstance(sub_est, type(estimator.named_estimators.svm))
# check that we can set the parameters of the underlying classifier
estimator.set_params(svm__C=10.0)
estimator.set_params(rf__max_depth=5)
assert (
estimator.get_params()["svm__C"]
== estimator.get_params()["svm"].get_params()["C"]
)
assert (
estimator.get_params()["rf__max_depth"]
== estimator.get_params()["rf"].get_params()["max_depth"]
)
@pytest.mark.parametrize(
"Ensemble",
[VotingClassifier, StackingRegressor, VotingRegressor],
)
def test_ensemble_heterogeneous_estimators_type(Ensemble):
# check that ensemble will fail during validation if the underlying
# estimators are not of the same type (i.e. classifier or regressor)
# StackingClassifier can have an underlying regressor so it's not checked
if issubclass(Ensemble, ClassifierMixin):
X, y = make_classification(n_samples=10)
estimators = [("lr", LinearRegression())]
ensemble_type = "classifier"
else:
X, y = make_regression(n_samples=10)
estimators = [("lr", LogisticRegression())]
ensemble_type = "regressor"
ensemble = Ensemble(estimators=estimators)
err_msg = "should be a {}".format(ensemble_type)
with pytest.raises(ValueError, match=err_msg):
ensemble.fit(X, y)
@pytest.mark.parametrize(
"X, y, Ensemble",
[
(*make_classification(n_samples=10), StackingClassifier),
(*make_classification(n_samples=10), VotingClassifier),
(*make_regression(n_samples=10), StackingRegressor),
(*make_regression(n_samples=10), VotingRegressor),
],
)
def test_ensemble_heterogeneous_estimators_name_validation(X, y, Ensemble):
# raise an error when the name contains dunder
if issubclass(Ensemble, ClassifierMixin):
estimators = [("lr__", LogisticRegression())]
else:
estimators = [("lr__", LinearRegression())]
ensemble = Ensemble(estimators=estimators)
err_msg = r"Estimator names must not contain __: got \['lr__'\]"
with pytest.raises(ValueError, match=err_msg):
ensemble.fit(X, y)
# raise an error when the name is not unique
if issubclass(Ensemble, ClassifierMixin):
estimators = [("lr", LogisticRegression()), ("lr", LogisticRegression())]
else:
estimators = [("lr", LinearRegression()), ("lr", LinearRegression())]
ensemble = Ensemble(estimators=estimators)
err_msg = r"Names provided are not unique: \['lr', 'lr'\]"
with pytest.raises(ValueError, match=err_msg):
ensemble.fit(X, y)
# raise an error when the name conflicts with the parameters
if issubclass(Ensemble, ClassifierMixin):
estimators = [("estimators", LogisticRegression())]
else:
estimators = [("estimators", LinearRegression())]
ensemble = Ensemble(estimators=estimators)
err_msg = "Estimator names conflict with constructor arguments"
with pytest.raises(ValueError, match=err_msg):
ensemble.fit(X, y)
@pytest.mark.parametrize(
"X, y, estimator",
[
(
*make_classification(n_samples=10),
StackingClassifier(estimators=[("lr", LogisticRegression())]),
),
(
*make_classification(n_samples=10),
VotingClassifier(estimators=[("lr", LogisticRegression())]),
),
(
*make_regression(n_samples=10),
StackingRegressor(estimators=[("lr", LinearRegression())]),
),
(
*make_regression(n_samples=10),
VotingRegressor(estimators=[("lr", LinearRegression())]),
),
],
ids=[
"stacking-classifier",
"voting-classifier",
"stacking-regressor",
"voting-regressor",
],
)
def test_ensemble_heterogeneous_estimators_all_dropped(X, y, estimator):
# check that we raise a consistent error when all estimators are
# dropped
estimator.set_params(lr="drop")
with pytest.raises(ValueError, match="All estimators are dropped."):
estimator.fit(X, y)
@pytest.mark.parametrize(
"Ensemble, Estimator, X, y",
[
(StackingClassifier, LogisticRegression, X, y),
(StackingRegressor, LinearRegression, X_r, y_r),
(VotingClassifier, LogisticRegression, X, y),
(VotingRegressor, LinearRegression, X_r, y_r),
],
)
# FIXME: we should move this test in `estimator_checks` once we are able
# to construct meta-estimator instances
def test_heterogeneous_ensemble_support_missing_values(Ensemble, Estimator, X, y):
# check that Voting and Stacking predictors delegate the missing values
# validation to the underlying estimator.
X = X.copy()
mask = np.random.choice([1, 0], X.shape, p=[0.1, 0.9]).astype(bool)
X[mask] = np.nan
pipe = make_pipeline(SimpleImputer(), Estimator())
ensemble = Ensemble(estimators=[("pipe1", pipe), ("pipe2", pipe)])
ensemble.fit(X, y).score(X, y)

File diff suppressed because it is too large


@@ -0,0 +1,363 @@
"""
Testing for Isolation Forest algorithm (sklearn.ensemble.iforest).
"""
# Authors: Nicolas Goix <nicolas.goix@telecom-paristech.fr>
# Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
# License: BSD 3 clause
import warnings
from unittest.mock import Mock, patch
import numpy as np
import pytest
from sklearn.datasets import load_diabetes, load_iris, make_classification
from sklearn.ensemble import IsolationForest
from sklearn.ensemble._iforest import _average_path_length
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import ParameterGrid, train_test_split
from sklearn.utils import check_random_state
from sklearn.utils._testing import (
assert_allclose,
assert_array_almost_equal,
assert_array_equal,
ignore_warnings,
)
from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS
# load iris & diabetes dataset
iris = load_iris()
diabetes = load_diabetes()
def test_iforest(global_random_seed):
"""Check Isolation Forest for various parameter settings."""
X_train = np.array([[0, 1], [1, 2]])
X_test = np.array([[2, 1], [1, 1]])
grid = ParameterGrid(
{"n_estimators": [3], "max_samples": [0.5, 1.0, 3], "bootstrap": [True, False]}
)
with ignore_warnings():
for params in grid:
IsolationForest(random_state=global_random_seed, **params).fit(
X_train
).predict(X_test)
@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS)
def test_iforest_sparse(global_random_seed, sparse_container):
"""Check IForest for various parameter settings on sparse input."""
rng = check_random_state(global_random_seed)
X_train, X_test = train_test_split(diabetes.data[:50], random_state=rng)
grid = ParameterGrid({"max_samples": [0.5, 1.0], "bootstrap": [True, False]})
X_train_sparse = sparse_container(X_train)
X_test_sparse = sparse_container(X_test)
for params in grid:
# Trained on sparse format
sparse_classifier = IsolationForest(
n_estimators=10, random_state=global_random_seed, **params
).fit(X_train_sparse)
sparse_results = sparse_classifier.predict(X_test_sparse)
# Trained on dense format
dense_classifier = IsolationForest(
n_estimators=10, random_state=global_random_seed, **params
).fit(X_train)
dense_results = dense_classifier.predict(X_test)
assert_array_equal(sparse_results, dense_results)
def test_iforest_error():
"""Test that it gives proper exception on deficient input."""
X = iris.data
# The dataset has fewer than 256 samples, so explicitly setting
# max_samples > n_samples should result in a warning. If not set
# explicitly, there should be no warning
warn_msg = "max_samples will be set to n_samples for estimation"
with pytest.warns(UserWarning, match=warn_msg):
IsolationForest(max_samples=1000).fit(X)
with warnings.catch_warnings():
warnings.simplefilter("error", UserWarning)
IsolationForest(max_samples="auto").fit(X)
with warnings.catch_warnings():
warnings.simplefilter("error", UserWarning)
IsolationForest(max_samples=np.int64(2)).fit(X)
# check that X_test must have the same number of features as X_train:
with pytest.raises(ValueError):
IsolationForest().fit(X).predict(X[:, 1:])
def test_recalculate_max_depth():
"""Check max_depth recalculation when max_samples is reset to n_samples"""
X = iris.data
clf = IsolationForest().fit(X)
for est in clf.estimators_:
assert est.max_depth == int(np.ceil(np.log2(X.shape[0])))
def test_max_samples_attribute():
X = iris.data
clf = IsolationForest().fit(X)
assert clf.max_samples_ == X.shape[0]
clf = IsolationForest(max_samples=500)
warn_msg = "max_samples will be set to n_samples for estimation"
with pytest.warns(UserWarning, match=warn_msg):
clf.fit(X)
assert clf.max_samples_ == X.shape[0]
clf = IsolationForest(max_samples=0.4).fit(X)
assert clf.max_samples_ == 0.4 * X.shape[0]
def test_iforest_parallel_regression(global_random_seed):
"""Check parallel regression."""
rng = check_random_state(global_random_seed)
X_train, X_test = train_test_split(diabetes.data, random_state=rng)
ensemble = IsolationForest(n_jobs=3, random_state=global_random_seed).fit(X_train)
ensemble.set_params(n_jobs=1)
y1 = ensemble.predict(X_test)
ensemble.set_params(n_jobs=2)
y2 = ensemble.predict(X_test)
assert_array_almost_equal(y1, y2)
ensemble = IsolationForest(n_jobs=1, random_state=global_random_seed).fit(X_train)
y3 = ensemble.predict(X_test)
assert_array_almost_equal(y1, y3)
def test_iforest_performance(global_random_seed):
"""Test Isolation Forest performs well"""
# Generate train/test data
rng = check_random_state(global_random_seed)
X = 0.3 * rng.randn(600, 2)
X = rng.permutation(np.vstack((X + 2, X - 2)))
X_train = X[:1000]
# Generate some abnormal novel observations
X_outliers = rng.uniform(low=-1, high=1, size=(200, 2))
X_test = np.vstack((X[1000:], X_outliers))
y_test = np.array([0] * 200 + [1] * 200)
# fit the model
clf = IsolationForest(max_samples=100, random_state=rng).fit(X_train)
# predict scores (the lower, the more normal)
y_pred = -clf.decision_function(X_test)
# check that outliers are ranked above inliers almost perfectly (ROC AUC > 0.98)
assert roc_auc_score(y_test, y_pred) > 0.98
@pytest.mark.parametrize("contamination", [0.25, "auto"])
def test_iforest_works(contamination, global_random_seed):
# toy sample (the last two samples are outliers)
X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [7, 4], [-5, 9]]
# Test IsolationForest
clf = IsolationForest(random_state=global_random_seed, contamination=contamination)
clf.fit(X)
decision_func = -clf.decision_function(X)
pred = clf.predict(X)
# assert detect outliers:
assert np.min(decision_func[-2:]) > np.max(decision_func[:-2])
assert_array_equal(pred, 6 * [1] + 2 * [-1])
def test_max_samples_consistency():
# Make sure validated max_samples in iforest and BaseBagging are identical
X = iris.data
clf = IsolationForest().fit(X)
assert clf.max_samples_ == clf._max_samples
def test_iforest_subsampled_features():
# Non-regression test for #5732, which failed at predict.
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(
diabetes.data[:50], diabetes.target[:50], random_state=rng
)
clf = IsolationForest(max_features=0.8)
clf.fit(X_train, y_train)
clf.predict(X_test)
def test_iforest_average_path_length():
# Non-regression test for #8549, which used the wrong formula for the
# average path length, strictly for the integer case.
# Updated to check the average path length when the input is <= 2 (issue #11839)
result_one = 2.0 * (np.log(4.0) + np.euler_gamma) - 2.0 * 4.0 / 5.0
result_two = 2.0 * (np.log(998.0) + np.euler_gamma) - 2.0 * 998.0 / 999.0
assert_allclose(_average_path_length([0]), [0.0])
assert_allclose(_average_path_length([1]), [0.0])
assert_allclose(_average_path_length([2]), [1.0])
assert_allclose(_average_path_length([5]), [result_one])
assert_allclose(_average_path_length([999]), [result_two])
assert_allclose(
_average_path_length(np.array([1, 2, 5, 999])),
[0.0, 1.0, result_one, result_two],
)
# _average_path_length is increasing
avg_path_length = _average_path_length(np.arange(5))
assert_array_equal(avg_path_length, np.sort(avg_path_length))
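# Illustrative sketch (assumption, not part of the original tests): the
# closed-form value checked above is the average path length of an
# unsuccessful search in a binary search tree,
#     c(n) = 2 * (ln(n - 1) + euler_gamma) - 2 * (n - 1) / n,   n > 2,
# so result_one corresponds to c(5) and result_two to c(999).
def _average_path_length_formula(n):
    # hypothetical helper mirroring the formula above, not used by the tests
    return 2.0 * (np.log(n - 1.0) + np.euler_gamma) - 2.0 * (n - 1.0) / n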
def test_score_samples():
X_train = [[1, 1], [1, 2], [2, 1]]
clf1 = IsolationForest(contamination=0.1).fit(X_train)
clf2 = IsolationForest().fit(X_train)
assert_array_equal(
clf1.score_samples([[2.0, 2.0]]),
clf1.decision_function([[2.0, 2.0]]) + clf1.offset_,
)
assert_array_equal(
clf2.score_samples([[2.0, 2.0]]),
clf2.decision_function([[2.0, 2.0]]) + clf2.offset_,
)
assert_array_equal(
clf1.score_samples([[2.0, 2.0]]), clf2.score_samples([[2.0, 2.0]])
)
def test_iforest_warm_start():
"""Test iterative addition of iTrees to an iForest"""
rng = check_random_state(0)
X = rng.randn(20, 2)
# fit first 10 trees
clf = IsolationForest(
n_estimators=10, max_samples=20, random_state=rng, warm_start=True
)
clf.fit(X)
# remember the 1st tree
tree_1 = clf.estimators_[0]
# fit another 10 trees
clf.set_params(n_estimators=20)
clf.fit(X)
# expecting 20 fitted trees and no overwritten trees
assert len(clf.estimators_) == 20
assert clf.estimators_[0] is tree_1
# mock get_chunk_n_rows to actually test more than one chunk (here one
# chunk has 3 rows):
@patch(
"sklearn.ensemble._iforest.get_chunk_n_rows",
side_effect=Mock(**{"return_value": 3}),
)
@pytest.mark.parametrize("contamination, n_predict_calls", [(0.25, 3), ("auto", 2)])
def test_iforest_chunks_works1(
mocked_get_chunk, contamination, n_predict_calls, global_random_seed
):
test_iforest_works(contamination, global_random_seed)
assert mocked_get_chunk.call_count == n_predict_calls
# idem with chunk_size = 10 rows
@patch(
"sklearn.ensemble._iforest.get_chunk_n_rows",
side_effect=Mock(**{"return_value": 10}),
)
@pytest.mark.parametrize("contamination, n_predict_calls", [(0.25, 3), ("auto", 2)])
def test_iforest_chunks_works2(
mocked_get_chunk, contamination, n_predict_calls, global_random_seed
):
test_iforest_works(contamination, global_random_seed)
assert mocked_get_chunk.call_count == n_predict_calls
def test_iforest_with_uniform_data():
"""Test whether iforest predicts inliers when using uniform data"""
# 2-d array of all 1s
X = np.ones((100, 10))
iforest = IsolationForest()
iforest.fit(X)
rng = np.random.RandomState(0)
assert all(iforest.predict(X) == 1)
assert all(iforest.predict(rng.randn(100, 10)) == 1)
assert all(iforest.predict(X + 1) == 1)
assert all(iforest.predict(X - 1) == 1)
# 2-d array where columns contain the same value across rows
X = np.repeat(rng.randn(1, 10), 100, 0)
iforest = IsolationForest()
iforest.fit(X)
assert all(iforest.predict(X) == 1)
assert all(iforest.predict(rng.randn(100, 10)) == 1)
assert all(iforest.predict(np.ones((100, 10))) == 1)
# Single row
X = rng.randn(1, 10)
iforest = IsolationForest()
iforest.fit(X)
assert all(iforest.predict(X) == 1)
assert all(iforest.predict(rng.randn(100, 10)) == 1)
assert all(iforest.predict(np.ones((100, 10))) == 1)
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
def test_iforest_with_n_jobs_does_not_segfault(csc_container):
"""Check that Isolation Forest does not segfault with n_jobs=2
Non-regression test for #23252
"""
X, _ = make_classification(n_samples=85_000, n_features=100, random_state=0)
X = csc_container(X)
IsolationForest(n_estimators=10, max_samples=256, n_jobs=2).fit(X)
def test_iforest_preserve_feature_names():
"""Check that feature names are preserved when contamination is not "auto".
Feature names are required for consistency checks during scoring.
Non-regression test for Issue #25844
"""
pd = pytest.importorskip("pandas")
rng = np.random.RandomState(0)
X = pd.DataFrame(data=rng.randn(4), columns=["a"])
model = IsolationForest(random_state=0, contamination=0.05)
with warnings.catch_warnings():
warnings.simplefilter("error", UserWarning)
model.fit(X)
@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS)
def test_iforest_sparse_input_float_contamination(sparse_container):
"""Check that `IsolationForest` accepts sparse matrix input and float value for
contamination.
Non-regression test for:
https://github.com/scikit-learn/scikit-learn/issues/27626
"""
X, _ = make_classification(n_samples=50, n_features=4, random_state=0)
X = sparse_container(X)
X.sort_indices()
contamination = 0.1
iforest = IsolationForest(
n_estimators=5, contamination=contamination, random_state=0
).fit(X)
X_decision = iforest.decision_function(X)
assert (X_decision < 0).sum() / X.shape[0] == pytest.approx(contamination)
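# Illustrative note (assumption, not part of the original tests): with a
# float contamination, the fitted ``offset_`` is chosen so that roughly that
# fraction of the training scores falls below zero, which is what the final
# pytest.approx comparison above verifies.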


@@ -0,0 +1,890 @@
"""Test the stacking classifier and regressor."""
# Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
# License: BSD 3 clause
from unittest.mock import Mock
import numpy as np
import pytest
from numpy.testing import assert_array_equal
from scipy import sparse
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin, clone
from sklearn.datasets import (
load_breast_cancer,
load_diabetes,
load_iris,
make_classification,
make_multilabel_classification,
make_regression,
)
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.ensemble import (
RandomForestClassifier,
RandomForestRegressor,
StackingClassifier,
StackingRegressor,
)
from sklearn.exceptions import ConvergenceWarning, NotFittedError
from sklearn.linear_model import (
LinearRegression,
LogisticRegression,
Ridge,
RidgeClassifier,
)
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import scale
from sklearn.svm import SVC, LinearSVC, LinearSVR
from sklearn.utils._mocking import CheckingClassifier
from sklearn.utils._testing import (
assert_allclose,
assert_allclose_dense_sparse,
ignore_warnings,
)
from sklearn.utils.fixes import COO_CONTAINERS, CSC_CONTAINERS, CSR_CONTAINERS
diabetes = load_diabetes()
X_diabetes, y_diabetes = diabetes.data, diabetes.target
iris = load_iris()
X_iris, y_iris = iris.data, iris.target
X_multilabel, y_multilabel = make_multilabel_classification(
n_classes=3, random_state=42
)
X_binary, y_binary = make_classification(n_classes=2, random_state=42)
@pytest.mark.parametrize(
"cv", [3, StratifiedKFold(n_splits=3, shuffle=True, random_state=42)]
)
@pytest.mark.parametrize(
"final_estimator", [None, RandomForestClassifier(random_state=42)]
)
@pytest.mark.parametrize("passthrough", [False, True])
def test_stacking_classifier_iris(cv, final_estimator, passthrough):
# prescale the data to avoid convergence warning without using a pipeline
# for later assert
X_train, X_test, y_train, y_test = train_test_split(
scale(X_iris), y_iris, stratify=y_iris, random_state=42
)
estimators = [("lr", LogisticRegression()), ("svc", LinearSVC())]
clf = StackingClassifier(
estimators=estimators,
final_estimator=final_estimator,
cv=cv,
passthrough=passthrough,
)
clf.fit(X_train, y_train)
clf.predict(X_test)
clf.predict_proba(X_test)
assert clf.score(X_test, y_test) > 0.8
X_trans = clf.transform(X_test)
expected_column_count = 10 if passthrough else 6
assert X_trans.shape[1] == expected_column_count
if passthrough:
assert_allclose(X_test, X_trans[:, -4:])
clf.set_params(lr="drop")
clf.fit(X_train, y_train)
clf.predict(X_test)
clf.predict_proba(X_test)
if final_estimator is None:
# LogisticRegression has decision_function method
clf.decision_function(X_test)
X_trans = clf.transform(X_test)
expected_column_count_drop = 7 if passthrough else 3
assert X_trans.shape[1] == expected_column_count_drop
if passthrough:
assert_allclose(X_test, X_trans[:, -4:])
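# Illustrative note (assumption, not part of the original tests): the column
# counts above follow from each base estimator contributing 3 meta-features
# for the 3 iris classes (predict_proba for LogisticRegression,
# decision_function for LinearSVC), so two estimators give 6 columns;
# passthrough appends the 4 original features (10 in total), and dropping
# "lr" leaves a single estimator (3 columns, or 7 with passthrough).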
def test_stacking_classifier_drop_column_binary_classification():
# check that a column is dropped in binary classification
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, _ = train_test_split(
scale(X), y, stratify=y, random_state=42
)
# both classifiers implement 'predict_proba' and will both drop one column
estimators = [
("lr", LogisticRegression()),
("rf", RandomForestClassifier(random_state=42)),
]
clf = StackingClassifier(estimators=estimators, cv=3)
clf.fit(X_train, y_train)
X_trans = clf.transform(X_test)
assert X_trans.shape[1] == 2
# LinearSVC does not implement 'predict_proba' and will not drop one column
estimators = [("lr", LogisticRegression()), ("svc", LinearSVC())]
clf.set_params(estimators=estimators)
clf.fit(X_train, y_train)
X_trans = clf.transform(X_test)
assert X_trans.shape[1] == 2
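# Illustrative note (assumption, not part of the original tests): in the
# binary case each predict_proba output keeps only one of its two columns
# (they sum to one), so LogisticRegression + RandomForestClassifier yield
# two meta-features; in the second configuration LinearSVC contributes a
# single decision_function column instead, again giving two columns.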
def test_stacking_classifier_drop_estimator():
# prescale the data to avoid convergence warning without using a pipeline
# for later assert
X_train, X_test, y_train, _ = train_test_split(
scale(X_iris), y_iris, stratify=y_iris, random_state=42
)
estimators = [("lr", "drop"), ("svc", LinearSVC(random_state=0))]
rf = RandomForestClassifier(n_estimators=10, random_state=42)
clf = StackingClassifier(
estimators=[("svc", LinearSVC(random_state=0))],
final_estimator=rf,
cv=5,
)
clf_drop = StackingClassifier(estimators=estimators, final_estimator=rf, cv=5)
clf.fit(X_train, y_train)
clf_drop.fit(X_train, y_train)
assert_allclose(clf.predict(X_test), clf_drop.predict(X_test))
assert_allclose(clf.predict_proba(X_test), clf_drop.predict_proba(X_test))
assert_allclose(clf.transform(X_test), clf_drop.transform(X_test))
def test_stacking_regressor_drop_estimator():
# prescale the data to avoid convergence warning without using a pipeline
# for later assert
X_train, X_test, y_train, _ = train_test_split(
scale(X_diabetes), y_diabetes, random_state=42
)
estimators = [("lr", "drop"), ("svr", LinearSVR(random_state=0))]
rf = RandomForestRegressor(n_estimators=10, random_state=42)
reg = StackingRegressor(
estimators=[("svr", LinearSVR(random_state=0))],
final_estimator=rf,
cv=5,
)
reg_drop = StackingRegressor(estimators=estimators, final_estimator=rf, cv=5)
reg.fit(X_train, y_train)
reg_drop.fit(X_train, y_train)
assert_allclose(reg.predict(X_test), reg_drop.predict(X_test))
assert_allclose(reg.transform(X_test), reg_drop.transform(X_test))
@pytest.mark.parametrize("cv", [3, KFold(n_splits=3, shuffle=True, random_state=42)])
@pytest.mark.parametrize(
"final_estimator, predict_params",
[
(None, {}),
(RandomForestRegressor(random_state=42), {}),
(DummyRegressor(), {"return_std": True}),
],
)
@pytest.mark.parametrize("passthrough", [False, True])
def test_stacking_regressor_diabetes(cv, final_estimator, predict_params, passthrough):
# prescale the data to avoid convergence warning without using a pipeline
# for later assert
X_train, X_test, y_train, _ = train_test_split(
scale(X_diabetes), y_diabetes, random_state=42
)
estimators = [("lr", LinearRegression()), ("svr", LinearSVR())]
reg = StackingRegressor(
estimators=estimators,
final_estimator=final_estimator,
cv=cv,
passthrough=passthrough,
)
reg.fit(X_train, y_train)
result = reg.predict(X_test, **predict_params)
expected_result_length = 2 if predict_params else 1
if predict_params:
assert len(result) == expected_result_length
X_trans = reg.transform(X_test)
expected_column_count = 12 if passthrough else 2
assert X_trans.shape[1] == expected_column_count
if passthrough:
assert_allclose(X_test, X_trans[:, -10:])
reg.set_params(lr="drop")
reg.fit(X_train, y_train)
reg.predict(X_test)
X_trans = reg.transform(X_test)
expected_column_count_drop = 11 if passthrough else 1
assert X_trans.shape[1] == expected_column_count_drop
if passthrough:
assert_allclose(X_test, X_trans[:, -10:])
@pytest.mark.parametrize(
"sparse_container", COO_CONTAINERS + CSC_CONTAINERS + CSR_CONTAINERS
)
def test_stacking_regressor_sparse_passthrough(sparse_container):
# Check passthrough behavior on a sparse X matrix
X_train, X_test, y_train, _ = train_test_split(
sparse_container(scale(X_diabetes)), y_diabetes, random_state=42
)
estimators = [("lr", LinearRegression()), ("svr", LinearSVR())]
rf = RandomForestRegressor(n_estimators=10, random_state=42)
clf = StackingRegressor(
estimators=estimators, final_estimator=rf, cv=5, passthrough=True
)
clf.fit(X_train, y_train)
X_trans = clf.transform(X_test)
assert_allclose_dense_sparse(X_test, X_trans[:, -10:])
assert sparse.issparse(X_trans)
assert X_test.format == X_trans.format
@pytest.mark.parametrize(
"sparse_container", COO_CONTAINERS + CSC_CONTAINERS + CSR_CONTAINERS
)
def test_stacking_classifier_sparse_passthrough(sparse_container):
# Check passthrough behavior on a sparse X matrix
X_train, X_test, y_train, _ = train_test_split(
sparse_container(scale(X_iris)), y_iris, random_state=42
)
estimators = [("lr", LogisticRegression()), ("svc", LinearSVC())]
rf = RandomForestClassifier(n_estimators=10, random_state=42)
clf = StackingClassifier(
estimators=estimators, final_estimator=rf, cv=5, passthrough=True
)
clf.fit(X_train, y_train)
X_trans = clf.transform(X_test)
assert_allclose_dense_sparse(X_test, X_trans[:, -4:])
assert sparse.issparse(X_trans)
assert X_test.format == X_trans.format
def test_stacking_classifier_drop_binary_prob():
    # check that the classifier drops one of the probability columns for a
    # binary classification problem
    # Select only the first 2 classes
X_, y_ = scale(X_iris[:100]), y_iris[:100]
estimators = [("lr", LogisticRegression()), ("rf", RandomForestClassifier())]
clf = StackingClassifier(estimators=estimators)
clf.fit(X_, y_)
X_meta = clf.transform(X_)
assert X_meta.shape[1] == 2
class NoWeightRegressor(RegressorMixin, BaseEstimator):
def fit(self, X, y):
self.reg = DummyRegressor()
return self.reg.fit(X, y)
def predict(self, X):
return np.ones(X.shape[0])
class NoWeightClassifier(ClassifierMixin, BaseEstimator):
def fit(self, X, y):
self.clf = DummyClassifier(strategy="stratified")
return self.clf.fit(X, y)
@pytest.mark.parametrize(
"y, params, type_err, msg_err",
[
(y_iris, {"estimators": []}, ValueError, "Invalid 'estimators' attribute,"),
(
y_iris,
{
"estimators": [
("lr", LogisticRegression()),
("svm", SVC(max_iter=50_000)),
],
"stack_method": "predict_proba",
},
ValueError,
"does not implement the method predict_proba",
),
(
y_iris,
{
"estimators": [
("lr", LogisticRegression()),
("cor", NoWeightClassifier()),
]
},
TypeError,
"does not support sample weight",
),
(
y_iris,
{
"estimators": [
("lr", LogisticRegression()),
("cor", LinearSVC(max_iter=50_000)),
],
"final_estimator": NoWeightClassifier(),
},
TypeError,
"does not support sample weight",
),
],
)
def test_stacking_classifier_error(y, params, type_err, msg_err):
with pytest.raises(type_err, match=msg_err):
clf = StackingClassifier(**params, cv=3)
clf.fit(scale(X_iris), y, sample_weight=np.ones(X_iris.shape[0]))
@pytest.mark.parametrize(
"y, params, type_err, msg_err",
[
(y_diabetes, {"estimators": []}, ValueError, "Invalid 'estimators' attribute,"),
(
y_diabetes,
{"estimators": [("lr", LinearRegression()), ("cor", NoWeightRegressor())]},
TypeError,
"does not support sample weight",
),
(
y_diabetes,
{
"estimators": [
("lr", LinearRegression()),
("cor", LinearSVR()),
],
"final_estimator": NoWeightRegressor(),
},
TypeError,
"does not support sample weight",
),
],
)
def test_stacking_regressor_error(y, params, type_err, msg_err):
with pytest.raises(type_err, match=msg_err):
reg = StackingRegressor(**params, cv=3)
reg.fit(scale(X_diabetes), y, sample_weight=np.ones(X_diabetes.shape[0]))
@pytest.mark.parametrize(
"estimator, X, y",
[
(
StackingClassifier(
estimators=[
("lr", LogisticRegression(random_state=0)),
("svm", LinearSVC(random_state=0)),
]
),
X_iris[:100],
y_iris[:100],
), # keep only classes 0 and 1
(
StackingRegressor(
estimators=[
("lr", LinearRegression()),
("svm", LinearSVR(random_state=0)),
]
),
X_diabetes,
y_diabetes,
),
],
ids=["StackingClassifier", "StackingRegressor"],
)
def test_stacking_randomness(estimator, X, y):
# checking that fixing the random state of the CV will lead to the same
# results
estimator_full = clone(estimator)
estimator_full.set_params(
cv=KFold(shuffle=True, random_state=np.random.RandomState(0))
)
estimator_drop = clone(estimator)
estimator_drop.set_params(lr="drop")
estimator_drop.set_params(
cv=KFold(shuffle=True, random_state=np.random.RandomState(0))
)
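    # with identical CV splits, dropping "lr" should only remove its meta-feature
    # column(s); the remaining columns must be unchanged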
assert_allclose(
estimator_full.fit(X, y).transform(X)[:, 1:],
estimator_drop.fit(X, y).transform(X),
)
def test_stacking_classifier_stratify_default():
# check that we stratify the classes for the default CV
clf = StackingClassifier(
estimators=[
("lr", LogisticRegression(max_iter=10_000)),
("svm", LinearSVC(max_iter=10_000)),
]
)
# since iris is not shuffled, a simple k-fold would not contain the
# 3 classes during training
clf.fit(X_iris, y_iris)
@pytest.mark.parametrize(
"stacker, X, y",
[
(
StackingClassifier(
estimators=[
("lr", LogisticRegression()),
("svm", LinearSVC(random_state=42)),
],
final_estimator=LogisticRegression(),
cv=KFold(shuffle=True, random_state=42),
),
*load_breast_cancer(return_X_y=True),
),
(
StackingRegressor(
estimators=[
("lr", LinearRegression()),
("svm", LinearSVR(random_state=42)),
],
final_estimator=LinearRegression(),
cv=KFold(shuffle=True, random_state=42),
),
X_diabetes,
y_diabetes,
),
],
ids=["StackingClassifier", "StackingRegressor"],
)
def test_stacking_with_sample_weight(stacker, X, y):
    # check that sample weights have an influence on the fitting
    # note: ConvergenceWarnings are caught since we are not concerned about
    # convergence here
n_half_samples = len(y) // 2
total_sample_weight = np.array(
[0.1] * n_half_samples + [0.9] * (len(y) - n_half_samples)
)
X_train, X_test, y_train, _, sample_weight_train, _ = train_test_split(
X, y, total_sample_weight, random_state=42
)
with ignore_warnings(category=ConvergenceWarning):
stacker.fit(X_train, y_train)
y_pred_no_weight = stacker.predict(X_test)
with ignore_warnings(category=ConvergenceWarning):
stacker.fit(X_train, y_train, sample_weight=np.ones(y_train.shape))
y_pred_unit_weight = stacker.predict(X_test)
assert_allclose(y_pred_no_weight, y_pred_unit_weight)
with ignore_warnings(category=ConvergenceWarning):
stacker.fit(X_train, y_train, sample_weight=sample_weight_train)
y_pred_biased = stacker.predict(X_test)
assert np.abs(y_pred_no_weight - y_pred_biased).sum() > 0
def test_stacking_classifier_sample_weight_fit_param():
# check sample_weight is passed to all invocations of fit
stacker = StackingClassifier(
estimators=[("lr", CheckingClassifier(expected_sample_weight=True))],
final_estimator=CheckingClassifier(expected_sample_weight=True),
)
stacker.fit(X_iris, y_iris, sample_weight=np.ones(X_iris.shape[0]))
@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning")
@pytest.mark.parametrize(
"stacker, X, y",
[
(
StackingClassifier(
estimators=[
("lr", LogisticRegression()),
("svm", LinearSVC(random_state=42)),
],
final_estimator=LogisticRegression(),
),
*load_breast_cancer(return_X_y=True),
),
(
StackingRegressor(
estimators=[
("lr", LinearRegression()),
("svm", LinearSVR(random_state=42)),
],
final_estimator=LinearRegression(),
),
X_diabetes,
y_diabetes,
),
],
ids=["StackingClassifier", "StackingRegressor"],
)
def test_stacking_cv_influence(stacker, X, y):
# check that the stacking affects the fit of the final estimator but not
# the fit of the base estimators
    # note: ConvergenceWarnings are caught since we are not concerned about
    # convergence here
stacker_cv_3 = clone(stacker)
stacker_cv_5 = clone(stacker)
stacker_cv_3.set_params(cv=3)
stacker_cv_5.set_params(cv=5)
stacker_cv_3.fit(X, y)
stacker_cv_5.fit(X, y)
# the base estimators should be identical
for est_cv_3, est_cv_5 in zip(stacker_cv_3.estimators_, stacker_cv_5.estimators_):
assert_allclose(est_cv_3.coef_, est_cv_5.coef_)
# the final estimator should be different
with pytest.raises(AssertionError, match="Not equal"):
assert_allclose(
stacker_cv_3.final_estimator_.coef_, stacker_cv_5.final_estimator_.coef_
)
@pytest.mark.parametrize(
"Stacker, Estimator, stack_method, final_estimator, X, y",
[
(
StackingClassifier,
DummyClassifier,
"predict_proba",
LogisticRegression(random_state=42),
X_iris,
y_iris,
),
(
StackingRegressor,
DummyRegressor,
"predict",
LinearRegression(),
X_diabetes,
y_diabetes,
),
],
)
def test_stacking_prefit(Stacker, Estimator, stack_method, final_estimator, X, y):
"""Check the behaviour of stacking when `cv='prefit'`"""
X_train1, X_train2, y_train1, y_train2 = train_test_split(
X, y, random_state=42, test_size=0.5
)
estimators = [
("d0", Estimator().fit(X_train1, y_train1)),
("d1", Estimator().fit(X_train1, y_train1)),
]
# mock out fit and stack_method to be asserted later
for _, estimator in estimators:
estimator.fit = Mock(name="fit")
stack_func = getattr(estimator, stack_method)
predict_method_mocked = Mock(side_effect=stack_func)
        # A mocked method does not expose a `__name__` attribute, while regular
        # Python methods do, and `_get_response_method` relies on it.
predict_method_mocked.__name__ = stack_method
setattr(estimator, stack_method, predict_method_mocked)
stacker = Stacker(
estimators=estimators, cv="prefit", final_estimator=final_estimator
)
stacker.fit(X_train2, y_train2)
assert stacker.estimators_ == [estimator for _, estimator in estimators]
# fit was not called again
assert all(estimator.fit.call_count == 0 for estimator in stacker.estimators_)
# stack method is called with the proper inputs
for estimator in stacker.estimators_:
stack_func_mock = getattr(estimator, stack_method)
stack_func_mock.assert_called_with(X_train2)
@pytest.mark.parametrize(
"stacker, X, y",
[
(
StackingClassifier(
estimators=[("lr", LogisticRegression()), ("svm", SVC())],
cv="prefit",
),
X_iris,
y_iris,
),
(
StackingRegressor(
estimators=[
("lr", LinearRegression()),
("svm", LinearSVR()),
],
cv="prefit",
),
X_diabetes,
y_diabetes,
),
],
)
def test_stacking_prefit_error(stacker, X, y):
# check that NotFittedError is raised
# if base estimators are not fitted when cv="prefit"
with pytest.raises(NotFittedError):
stacker.fit(X, y)
@pytest.mark.parametrize(
"make_dataset, Stacking, Estimator",
[
(make_classification, StackingClassifier, LogisticRegression),
(make_regression, StackingRegressor, LinearRegression),
],
)
def test_stacking_without_n_features_in(make_dataset, Stacking, Estimator):
# Stacking supports estimators without `n_features_in_`. Regression test
# for #17353
class MyEstimator(Estimator):
"""Estimator without n_features_in_"""
def fit(self, X, y):
super().fit(X, y)
del self.n_features_in_
X, y = make_dataset(random_state=0, n_samples=100)
stacker = Stacking(estimators=[("lr", MyEstimator())])
msg = f"{Stacking.__name__} object has no attribute n_features_in_"
with pytest.raises(AttributeError, match=msg):
stacker.n_features_in_
# Does not raise
stacker.fit(X, y)
msg = "'MyEstimator' object has no attribute 'n_features_in_'"
with pytest.raises(AttributeError, match=msg):
stacker.n_features_in_
@pytest.mark.parametrize(
"estimator",
[
# output a 2D array of the probability of the positive class for each output
MLPClassifier(random_state=42),
# output a list of 2D array containing the probability of each class
# for each output
RandomForestClassifier(random_state=42),
],
ids=["MLPClassifier", "RandomForestClassifier"],
)
def test_stacking_classifier_multilabel_predict_proba(estimator):
"""Check the behaviour for the multilabel classification case and the
`predict_proba` stacking method.
    Estimators are not consistent in the shape of their output arrays, so we need
    to ensure that all cases are handled.
"""
X_train, X_test, y_train, y_test = train_test_split(
X_multilabel, y_multilabel, stratify=y_multilabel, random_state=42
)
n_outputs = 3
estimators = [("est", estimator)]
stacker = StackingClassifier(
estimators=estimators,
final_estimator=KNeighborsClassifier(),
stack_method="predict_proba",
).fit(X_train, y_train)
X_trans = stacker.transform(X_test)
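    # with `predict_proba` on binary outputs, only the positive-class probability
    # is kept for each output, i.e. one meta-feature per output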
assert X_trans.shape == (X_test.shape[0], n_outputs)
# we should not have any collinear classes and thus nothing should sum to 1
assert not any(np.isclose(X_trans.sum(axis=1), 1.0))
y_pred = stacker.predict(X_test)
assert y_pred.shape == y_test.shape
def test_stacking_classifier_multilabel_decision_function():
"""Check the behaviour for the multilabel classification case and the
`decision_function` stacking method. Only `RidgeClassifier` supports this
case.
"""
X_train, X_test, y_train, y_test = train_test_split(
X_multilabel, y_multilabel, stratify=y_multilabel, random_state=42
)
n_outputs = 3
estimators = [("est", RidgeClassifier())]
stacker = StackingClassifier(
estimators=estimators,
final_estimator=KNeighborsClassifier(),
stack_method="decision_function",
).fit(X_train, y_train)
X_trans = stacker.transform(X_test)
assert X_trans.shape == (X_test.shape[0], n_outputs)
y_pred = stacker.predict(X_test)
assert y_pred.shape == y_test.shape
@pytest.mark.parametrize("stack_method", ["auto", "predict"])
@pytest.mark.parametrize("passthrough", [False, True])
def test_stacking_classifier_multilabel_auto_predict(stack_method, passthrough):
"""Check the behaviour for the multilabel classification case for stack methods
supported for all estimators or automatically picked up.
"""
X_train, X_test, y_train, y_test = train_test_split(
X_multilabel, y_multilabel, stratify=y_multilabel, random_state=42
)
y_train_before_fit = y_train.copy()
n_outputs = 3
estimators = [
("mlp", MLPClassifier(random_state=42)),
("rf", RandomForestClassifier(random_state=42)),
("ridge", RidgeClassifier()),
]
final_estimator = KNeighborsClassifier()
clf = StackingClassifier(
estimators=estimators,
final_estimator=final_estimator,
passthrough=passthrough,
stack_method=stack_method,
).fit(X_train, y_train)
    # make sure we don't change `y_train` in place
assert_array_equal(y_train_before_fit, y_train)
y_pred = clf.predict(X_test)
assert y_pred.shape == y_test.shape
if stack_method == "auto":
expected_stack_methods = ["predict_proba", "predict_proba", "decision_function"]
else:
expected_stack_methods = ["predict"] * len(estimators)
assert clf.stack_method_ == expected_stack_methods
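    # each base estimator contributes one column per output (positive-class
    # probability, decision value, or predicted label); passthrough additionally
    # appends the original features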
n_features_X_trans = n_outputs * len(estimators)
if passthrough:
n_features_X_trans += X_train.shape[1]
X_trans = clf.transform(X_test)
assert X_trans.shape == (X_test.shape[0], n_features_X_trans)
assert_array_equal(clf.classes_, [np.array([0, 1])] * n_outputs)
@pytest.mark.parametrize(
"stacker, feature_names, X, y, expected_names",
[
(
StackingClassifier(
estimators=[
("lr", LogisticRegression(random_state=0)),
("svm", LinearSVC(random_state=0)),
]
),
iris.feature_names,
X_iris,
y_iris,
[
"stackingclassifier_lr0",
"stackingclassifier_lr1",
"stackingclassifier_lr2",
"stackingclassifier_svm0",
"stackingclassifier_svm1",
"stackingclassifier_svm2",
],
),
(
StackingClassifier(
estimators=[
("lr", LogisticRegression(random_state=0)),
("other", "drop"),
("svm", LinearSVC(random_state=0)),
]
),
iris.feature_names,
X_iris[:100],
y_iris[:100], # keep only classes 0 and 1
[
"stackingclassifier_lr",
"stackingclassifier_svm",
],
),
(
StackingRegressor(
estimators=[
("lr", LinearRegression()),
("svm", LinearSVR(random_state=0)),
]
),
diabetes.feature_names,
X_diabetes,
y_diabetes,
[
"stackingregressor_lr",
"stackingregressor_svm",
],
),
],
ids=[
"StackingClassifier_multiclass",
"StackingClassifier_binary",
"StackingRegressor",
],
)
@pytest.mark.parametrize("passthrough", [True, False])
def test_get_feature_names_out(
stacker, feature_names, X, y, expected_names, passthrough
):
"""Check get_feature_names_out works for stacking."""
stacker.set_params(passthrough=passthrough)
stacker.fit(scale(X), y)
if passthrough:
expected_names = np.concatenate((expected_names, feature_names))
names_out = stacker.get_feature_names_out(feature_names)
assert_array_equal(names_out, expected_names)
def test_stacking_classifier_base_regressor():
"""Check that a regressor can be used as the first layer in `StackingClassifier`."""
X_train, X_test, y_train, y_test = train_test_split(
scale(X_iris), y_iris, stratify=y_iris, random_state=42
)
clf = StackingClassifier(estimators=[("ridge", Ridge())])
clf.fit(X_train, y_train)
clf.predict(X_test)
clf.predict_proba(X_test)
assert clf.score(X_test, y_test) > 0.8
def test_stacking_final_estimator_attribute_error():
"""Check that we raise the proper AttributeError when the final estimator
does not implement the `decision_function` method, which is decorated with
`available_if`.
Non-regression test for:
https://github.com/scikit-learn/scikit-learn/issues/28108
"""
X, y = make_classification(random_state=42)
estimators = [
("lr", LogisticRegression()),
("rf", RandomForestClassifier(n_estimators=2, random_state=42)),
]
# RandomForestClassifier does not implement 'decision_function' and should raise
# an AttributeError
final_estimator = RandomForestClassifier(n_estimators=2, random_state=42)
clf = StackingClassifier(
estimators=estimators, final_estimator=final_estimator, cv=3
)
outer_msg = "This 'StackingClassifier' has no attribute 'decision_function'"
inner_msg = "'RandomForestClassifier' object has no attribute 'decision_function'"
with pytest.raises(AttributeError, match=outer_msg) as exec_info:
clf.fit(X, y).decision_function(X)
assert isinstance(exec_info.value.__cause__, AttributeError)
assert inner_msg in str(exec_info.value.__cause__)


@ -0,0 +1,788 @@
"""Testing for the VotingClassifier and VotingRegressor"""
import re
import numpy as np
import pytest
from sklearn import datasets
from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.datasets import make_multilabel_classification
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import (
RandomForestClassifier,
RandomForestRegressor,
VotingClassifier,
VotingRegressor,
)
from sklearn.exceptions import NotFittedError
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tests.metadata_routing_common import (
ConsumingClassifier,
ConsumingRegressor,
_Registry,
check_recorded_metadata,
)
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.utils._testing import (
assert_almost_equal,
assert_array_almost_equal,
assert_array_equal,
ignore_warnings,
)
# Load datasets
iris = datasets.load_iris()
X, y = iris.data[:, 1:3], iris.target
# Scale the data to avoid the ConvergenceWarning thrown by LogisticRegression
X_scaled = StandardScaler().fit_transform(X)
X_r, y_r = datasets.load_diabetes(return_X_y=True)
@pytest.mark.parametrize(
"params, err_msg",
[
(
{"estimators": []},
"Invalid 'estimators' attribute, 'estimators' should be a non-empty list",
),
(
{"estimators": [("lr", LogisticRegression())], "weights": [1, 2]},
"Number of `estimators` and weights must be equal",
),
],
)
def test_voting_classifier_estimator_init(params, err_msg):
ensemble = VotingClassifier(**params)
with pytest.raises(ValueError, match=err_msg):
ensemble.fit(X, y)
def test_predictproba_hardvoting():
eclf = VotingClassifier(
estimators=[("lr1", LogisticRegression()), ("lr2", LogisticRegression())],
voting="hard",
)
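    # accessing `predict_proba` with voting="hard" raises an AttributeError on the
    # ensemble, with the underlying availability-check error chained as __cause__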
inner_msg = "predict_proba is not available when voting='hard'"
outer_msg = "'VotingClassifier' has no attribute 'predict_proba'"
with pytest.raises(AttributeError, match=outer_msg) as exec_info:
eclf.predict_proba
assert isinstance(exec_info.value.__cause__, AttributeError)
assert inner_msg in str(exec_info.value.__cause__)
assert not hasattr(eclf, "predict_proba")
eclf.fit(X_scaled, y)
assert not hasattr(eclf, "predict_proba")
def test_notfitted():
eclf = VotingClassifier(
estimators=[("lr1", LogisticRegression()), ("lr2", LogisticRegression())],
voting="soft",
)
ereg = VotingRegressor([("dr", DummyRegressor())])
msg = (
"This %s instance is not fitted yet. Call 'fit'"
" with appropriate arguments before using this estimator."
)
with pytest.raises(NotFittedError, match=msg % "VotingClassifier"):
eclf.predict(X)
with pytest.raises(NotFittedError, match=msg % "VotingClassifier"):
eclf.predict_proba(X)
with pytest.raises(NotFittedError, match=msg % "VotingClassifier"):
eclf.transform(X)
with pytest.raises(NotFittedError, match=msg % "VotingRegressor"):
ereg.predict(X_r)
with pytest.raises(NotFittedError, match=msg % "VotingRegressor"):
ereg.transform(X_r)
def test_majority_label_iris(global_random_seed):
"""Check classification by majority label on dataset iris."""
clf1 = LogisticRegression(solver="liblinear", random_state=global_random_seed)
clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed)
clf3 = GaussianNB()
eclf = VotingClassifier(
estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="hard"
)
scores = cross_val_score(eclf, X, y, scoring="accuracy")
assert scores.mean() >= 0.9
def test_tie_situation():
"""Check voting classifier selects smaller class label in tie situation."""
clf1 = LogisticRegression(random_state=123, solver="liblinear")
clf2 = RandomForestClassifier(random_state=123)
eclf = VotingClassifier(estimators=[("lr", clf1), ("rf", clf2)], voting="hard")
assert clf1.fit(X, y).predict(X)[73] == 2
assert clf2.fit(X, y).predict(X)[73] == 1
assert eclf.fit(X, y).predict(X)[73] == 1
def test_weights_iris(global_random_seed):
"""Check classification by average probabilities on dataset iris."""
clf1 = LogisticRegression(random_state=global_random_seed)
clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed)
clf3 = GaussianNB()
eclf = VotingClassifier(
estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)],
voting="soft",
weights=[1, 2, 10],
)
scores = cross_val_score(eclf, X_scaled, y, scoring="accuracy")
assert scores.mean() >= 0.9
def test_weights_regressor():
"""Check weighted average regression prediction on diabetes dataset."""
reg1 = DummyRegressor(strategy="mean")
reg2 = DummyRegressor(strategy="median")
reg3 = DummyRegressor(strategy="quantile", quantile=0.2)
ereg = VotingRegressor(
[("mean", reg1), ("median", reg2), ("quantile", reg3)], weights=[1, 2, 10]
)
X_r_train, X_r_test, y_r_train, y_r_test = train_test_split(
X_r, y_r, test_size=0.25
)
reg1_pred = reg1.fit(X_r_train, y_r_train).predict(X_r_test)
reg2_pred = reg2.fit(X_r_train, y_r_train).predict(X_r_test)
reg3_pred = reg3.fit(X_r_train, y_r_train).predict(X_r_test)
ereg_pred = ereg.fit(X_r_train, y_r_train).predict(X_r_test)
avg = np.average(
np.asarray([reg1_pred, reg2_pred, reg3_pred]), axis=0, weights=[1, 2, 10]
)
assert_almost_equal(ereg_pred, avg, decimal=2)
ereg_weights_none = VotingRegressor(
[("mean", reg1), ("median", reg2), ("quantile", reg3)], weights=None
)
ereg_weights_equal = VotingRegressor(
[("mean", reg1), ("median", reg2), ("quantile", reg3)], weights=[1, 1, 1]
)
ereg_weights_none.fit(X_r_train, y_r_train)
ereg_weights_equal.fit(X_r_train, y_r_train)
ereg_none_pred = ereg_weights_none.predict(X_r_test)
ereg_equal_pred = ereg_weights_equal.predict(X_r_test)
assert_almost_equal(ereg_none_pred, ereg_equal_pred, decimal=2)
def test_predict_on_toy_problem(global_random_seed):
"""Manually check predicted class labels for toy dataset."""
clf1 = LogisticRegression(random_state=global_random_seed)
clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed)
clf3 = GaussianNB()
X = np.array(
[[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2], [2.1, 1.4], [3.1, 2.3]]
)
y = np.array([1, 1, 1, 2, 2, 2])
assert_array_equal(clf1.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])
assert_array_equal(clf2.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])
assert_array_equal(clf3.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])
eclf = VotingClassifier(
estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)],
voting="hard",
weights=[1, 1, 1],
)
assert_array_equal(eclf.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])
eclf = VotingClassifier(
estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)],
voting="soft",
weights=[1, 1, 1],
)
assert_array_equal(eclf.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])
def test_predict_proba_on_toy_problem():
"""Calculate predicted probabilities on toy dataset."""
clf1 = LogisticRegression(random_state=123)
clf2 = RandomForestClassifier(random_state=123)
clf3 = GaussianNB()
X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
y = np.array([1, 1, 2, 2])
clf1_res = np.array(
[
[0.59790391, 0.40209609],
[0.57622162, 0.42377838],
[0.50728456, 0.49271544],
[0.40241774, 0.59758226],
]
)
clf2_res = np.array([[0.8, 0.2], [0.8, 0.2], [0.2, 0.8], [0.3, 0.7]])
clf3_res = np.array(
[[0.9985082, 0.0014918], [0.99845843, 0.00154157], [0.0, 1.0], [0.0, 1.0]]
)
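    # expected soft-voting probabilities: weighted average of the per-classifier
    # probabilities with weights [2, 1, 1] (sum of weights = 4)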
t00 = (2 * clf1_res[0][0] + clf2_res[0][0] + clf3_res[0][0]) / 4
t11 = (2 * clf1_res[1][1] + clf2_res[1][1] + clf3_res[1][1]) / 4
t21 = (2 * clf1_res[2][1] + clf2_res[2][1] + clf3_res[2][1]) / 4
t31 = (2 * clf1_res[3][1] + clf2_res[3][1] + clf3_res[3][1]) / 4
eclf = VotingClassifier(
estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)],
voting="soft",
weights=[2, 1, 1],
)
eclf_res = eclf.fit(X, y).predict_proba(X)
assert_almost_equal(t00, eclf_res[0][0], decimal=1)
assert_almost_equal(t11, eclf_res[1][1], decimal=1)
assert_almost_equal(t21, eclf_res[2][1], decimal=1)
assert_almost_equal(t31, eclf_res[3][1], decimal=1)
inner_msg = "predict_proba is not available when voting='hard'"
outer_msg = "'VotingClassifier' has no attribute 'predict_proba'"
with pytest.raises(AttributeError, match=outer_msg) as exec_info:
eclf = VotingClassifier(
estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="hard"
)
eclf.fit(X, y).predict_proba(X)
assert isinstance(exec_info.value.__cause__, AttributeError)
assert inner_msg in str(exec_info.value.__cause__)
def test_multilabel():
"""Check if error is raised for multilabel classification."""
X, y = make_multilabel_classification(
n_classes=2, n_labels=1, allow_unlabeled=False, random_state=123
)
clf = OneVsRestClassifier(SVC(kernel="linear"))
eclf = VotingClassifier(estimators=[("ovr", clf)], voting="hard")
try:
eclf.fit(X, y)
except NotImplementedError:
return
def test_gridsearch():
"""Check GridSearch support."""
clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1, n_estimators=3)
clf3 = GaussianNB()
eclf = VotingClassifier(
estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="soft"
)
params = {
"lr__C": [1.0, 100.0],
"voting": ["soft", "hard"],
"weights": [[0.5, 0.5, 0.5], [1.0, 0.5, 0.5]],
}
grid = GridSearchCV(estimator=eclf, param_grid=params, cv=2)
grid.fit(X_scaled, y)
def test_parallel_fit(global_random_seed):
"""Check parallel backend of VotingClassifier on toy dataset."""
clf1 = LogisticRegression(random_state=global_random_seed)
clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed)
clf3 = GaussianNB()
X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
y = np.array([1, 1, 2, 2])
eclf1 = VotingClassifier(
estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="soft", n_jobs=1
).fit(X, y)
eclf2 = VotingClassifier(
estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="soft", n_jobs=2
).fit(X, y)
assert_array_equal(eclf1.predict(X), eclf2.predict(X))
assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
@ignore_warnings(category=FutureWarning)
def test_sample_weight(global_random_seed):
"""Tests sample_weight parameter of VotingClassifier"""
clf1 = LogisticRegression(random_state=global_random_seed)
clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed)
clf3 = SVC(probability=True, random_state=global_random_seed)
eclf1 = VotingClassifier(
estimators=[("lr", clf1), ("rf", clf2), ("svc", clf3)], voting="soft"
).fit(X_scaled, y, sample_weight=np.ones((len(y),)))
eclf2 = VotingClassifier(
estimators=[("lr", clf1), ("rf", clf2), ("svc", clf3)], voting="soft"
).fit(X_scaled, y)
assert_array_equal(eclf1.predict(X_scaled), eclf2.predict(X_scaled))
assert_array_almost_equal(
eclf1.predict_proba(X_scaled), eclf2.predict_proba(X_scaled)
)
sample_weight = np.random.RandomState(global_random_seed).uniform(size=(len(y),))
eclf3 = VotingClassifier(estimators=[("lr", clf1)], voting="soft")
eclf3.fit(X_scaled, y, sample_weight)
clf1.fit(X_scaled, y, sample_weight)
assert_array_equal(eclf3.predict(X_scaled), clf1.predict(X_scaled))
assert_array_almost_equal(
eclf3.predict_proba(X_scaled), clf1.predict_proba(X_scaled)
)
    # check that an informative error is raised if sample_weight is not
    # supported.
clf4 = KNeighborsClassifier()
eclf3 = VotingClassifier(
estimators=[("lr", clf1), ("svc", clf3), ("knn", clf4)], voting="soft"
)
msg = "Underlying estimator KNeighborsClassifier does not support sample weights."
with pytest.raises(TypeError, match=msg):
eclf3.fit(X_scaled, y, sample_weight)
    # check that _fit_single_estimator raises the right error: the original
    # error should be re-raised when it is not related to sample_weight
class ClassifierErrorFit(ClassifierMixin, BaseEstimator):
def fit(self, X_scaled, y, sample_weight):
raise TypeError("Error unrelated to sample_weight.")
clf = ClassifierErrorFit()
with pytest.raises(TypeError, match="Error unrelated to sample_weight"):
clf.fit(X_scaled, y, sample_weight=sample_weight)
def test_sample_weight_kwargs():
"""Check that VotingClassifier passes sample_weight as kwargs"""
class MockClassifier(ClassifierMixin, BaseEstimator):
"""Mock Classifier to check that sample_weight is received as kwargs"""
def fit(self, X, y, *args, **sample_weight):
assert "sample_weight" in sample_weight
clf = MockClassifier()
eclf = VotingClassifier(estimators=[("mock", clf)], voting="soft")
# Should not raise an error.
eclf.fit(X, y, sample_weight=np.ones((len(y),)))
def test_voting_classifier_set_params(global_random_seed):
# check equivalence in the output when setting underlying estimators
clf1 = LogisticRegression(random_state=global_random_seed)
clf2 = RandomForestClassifier(
n_estimators=10, random_state=global_random_seed, max_depth=None
)
clf3 = GaussianNB()
eclf1 = VotingClassifier(
[("lr", clf1), ("rf", clf2)], voting="soft", weights=[1, 2]
).fit(X_scaled, y)
eclf2 = VotingClassifier(
[("lr", clf1), ("nb", clf3)], voting="soft", weights=[1, 2]
)
eclf2.set_params(nb=clf2).fit(X_scaled, y)
assert_array_equal(eclf1.predict(X_scaled), eclf2.predict(X_scaled))
assert_array_almost_equal(
eclf1.predict_proba(X_scaled), eclf2.predict_proba(X_scaled)
)
assert eclf2.estimators[0][1].get_params() == clf1.get_params()
assert eclf2.estimators[1][1].get_params() == clf2.get_params()
def test_set_estimator_drop():
# VotingClassifier set_params should be able to set estimators as drop
# Test predict
clf1 = LogisticRegression(random_state=123)
clf2 = RandomForestClassifier(n_estimators=10, random_state=123)
clf3 = GaussianNB()
eclf1 = VotingClassifier(
estimators=[("lr", clf1), ("rf", clf2), ("nb", clf3)],
voting="hard",
weights=[1, 0, 0.5],
).fit(X, y)
eclf2 = VotingClassifier(
estimators=[("lr", clf1), ("rf", clf2), ("nb", clf3)],
voting="hard",
weights=[1, 1, 0.5],
)
eclf2.set_params(rf="drop").fit(X, y)
assert_array_equal(eclf1.predict(X), eclf2.predict(X))
assert dict(eclf2.estimators)["rf"] == "drop"
assert len(eclf2.estimators_) == 2
assert all(
isinstance(est, (LogisticRegression, GaussianNB)) for est in eclf2.estimators_
)
assert eclf2.get_params()["rf"] == "drop"
eclf1.set_params(voting="soft").fit(X, y)
eclf2.set_params(voting="soft").fit(X, y)
assert_array_equal(eclf1.predict(X), eclf2.predict(X))
assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
msg = "All estimators are dropped. At least one is required"
with pytest.raises(ValueError, match=msg):
eclf2.set_params(lr="drop", rf="drop", nb="drop").fit(X, y)
# Test soft voting transform
X1 = np.array([[1], [2]])
y1 = np.array([1, 2])
eclf1 = VotingClassifier(
estimators=[("rf", clf2), ("nb", clf3)],
voting="soft",
weights=[0, 0.5],
flatten_transform=False,
).fit(X1, y1)
eclf2 = VotingClassifier(
estimators=[("rf", clf2), ("nb", clf3)],
voting="soft",
weights=[1, 0.5],
flatten_transform=False,
)
eclf2.set_params(rf="drop").fit(X1, y1)
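    # eclf1 keeps both estimators (rf merely gets weight 0), so its unflattened
    # transform has two probability blocks; eclf2 drops rf and keeps only the
    # GaussianNB block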
assert_array_almost_equal(
eclf1.transform(X1),
np.array([[[0.7, 0.3], [0.3, 0.7]], [[1.0, 0.0], [0.0, 1.0]]]),
)
assert_array_almost_equal(eclf2.transform(X1), np.array([[[1.0, 0.0], [0.0, 1.0]]]))
eclf1.set_params(voting="hard")
eclf2.set_params(voting="hard")
assert_array_equal(eclf1.transform(X1), np.array([[0, 0], [1, 1]]))
assert_array_equal(eclf2.transform(X1), np.array([[0], [1]]))
def test_estimator_weights_format(global_random_seed):
# Test estimator weights inputs as list and array
clf1 = LogisticRegression(random_state=global_random_seed)
clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed)
eclf1 = VotingClassifier(
estimators=[("lr", clf1), ("rf", clf2)], weights=[1, 2], voting="soft"
)
eclf2 = VotingClassifier(
estimators=[("lr", clf1), ("rf", clf2)], weights=np.array((1, 2)), voting="soft"
)
eclf1.fit(X_scaled, y)
eclf2.fit(X_scaled, y)
assert_array_almost_equal(
eclf1.predict_proba(X_scaled), eclf2.predict_proba(X_scaled)
)
def test_transform(global_random_seed):
"""Check transform method of VotingClassifier on toy dataset."""
clf1 = LogisticRegression(random_state=global_random_seed)
clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed)
clf3 = GaussianNB()
X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
y = np.array([1, 1, 2, 2])
eclf1 = VotingClassifier(
estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="soft"
).fit(X, y)
eclf2 = VotingClassifier(
estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)],
voting="soft",
flatten_transform=True,
).fit(X, y)
eclf3 = VotingClassifier(
estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)],
voting="soft",
flatten_transform=False,
).fit(X, y)
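    # flatten_transform=True gives (n_samples, n_classifiers * n_classes) = (4, 6);
    # flatten_transform=False keeps (n_classifiers, n_samples, n_classes) = (3, 4, 2)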
assert_array_equal(eclf1.transform(X).shape, (4, 6))
assert_array_equal(eclf2.transform(X).shape, (4, 6))
assert_array_equal(eclf3.transform(X).shape, (3, 4, 2))
assert_array_almost_equal(eclf1.transform(X), eclf2.transform(X))
assert_array_almost_equal(
eclf3.transform(X).swapaxes(0, 1).reshape((4, 6)), eclf2.transform(X)
)
@pytest.mark.parametrize(
"X, y, voter",
[
(
X,
y,
VotingClassifier(
[
("lr", LogisticRegression()),
("rf", RandomForestClassifier(n_estimators=5)),
]
),
),
(
X_r,
y_r,
VotingRegressor(
[
("lr", LinearRegression()),
("rf", RandomForestRegressor(n_estimators=5)),
]
),
),
],
)
def test_none_estimator_with_weights(X, y, voter):
    # check that an estimator can be set to 'drop' while sample weights are passed
# regression test for
# https://github.com/scikit-learn/scikit-learn/issues/13777
voter = clone(voter)
    # Scale the data to avoid the ConvergenceWarning thrown by LogisticRegression
X_scaled = StandardScaler().fit_transform(X)
voter.fit(X_scaled, y, sample_weight=np.ones(y.shape))
voter.set_params(lr="drop")
voter.fit(X_scaled, y, sample_weight=np.ones(y.shape))
y_pred = voter.predict(X_scaled)
assert y_pred.shape == y.shape
@pytest.mark.parametrize(
"est",
[
VotingRegressor(
estimators=[
("lr", LinearRegression()),
("tree", DecisionTreeRegressor(random_state=0)),
]
),
VotingClassifier(
estimators=[
("lr", LogisticRegression(random_state=0)),
("tree", DecisionTreeClassifier(random_state=0)),
]
),
],
ids=["VotingRegressor", "VotingClassifier"],
)
def test_n_features_in(est):
X = [[1, 2], [3, 4], [5, 6]]
y = [0, 1, 2]
assert not hasattr(est, "n_features_in_")
est.fit(X, y)
assert est.n_features_in_ == 2
@pytest.mark.parametrize(
"estimator",
[
VotingRegressor(
estimators=[
("lr", LinearRegression()),
("rf", RandomForestRegressor(random_state=123)),
],
verbose=True,
),
VotingClassifier(
estimators=[
("lr", LogisticRegression(random_state=123)),
("rf", RandomForestClassifier(random_state=123)),
],
verbose=True,
),
],
)
def test_voting_verbose(estimator, capsys):
X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
y = np.array([1, 1, 2, 2])
pattern = (
r"\[Voting\].*\(1 of 2\) Processing lr, total=.*\n"
r"\[Voting\].*\(2 of 2\) Processing rf, total=.*\n$"
)
estimator.fit(X, y)
assert re.match(pattern, capsys.readouterr()[0])
def test_get_features_names_out_regressor():
"""Check get_feature_names_out output for regressor."""
X = [[1, 2], [3, 4], [5, 6]]
y = [0, 1, 2]
voting = VotingRegressor(
estimators=[
("lr", LinearRegression()),
("tree", DecisionTreeRegressor(random_state=0)),
("ignore", "drop"),
]
)
voting.fit(X, y)
names_out = voting.get_feature_names_out()
expected_names = ["votingregressor_lr", "votingregressor_tree"]
assert_array_equal(names_out, expected_names)
@pytest.mark.parametrize(
"kwargs, expected_names",
[
(
{"voting": "soft", "flatten_transform": True},
[
"votingclassifier_lr0",
"votingclassifier_lr1",
"votingclassifier_lr2",
"votingclassifier_tree0",
"votingclassifier_tree1",
"votingclassifier_tree2",
],
),
({"voting": "hard"}, ["votingclassifier_lr", "votingclassifier_tree"]),
],
)
def test_get_features_names_out_classifier(kwargs, expected_names):
"""Check get_feature_names_out for classifier for different settings."""
X = [[1, 2], [3, 4], [5, 6], [1, 1.2]]
y = [0, 1, 2, 0]
voting = VotingClassifier(
estimators=[
("lr", LogisticRegression(random_state=0)),
("tree", DecisionTreeClassifier(random_state=0)),
],
**kwargs,
)
voting.fit(X, y)
X_trans = voting.transform(X)
names_out = voting.get_feature_names_out()
assert X_trans.shape[1] == len(expected_names)
assert_array_equal(names_out, expected_names)
def test_get_features_names_out_classifier_error():
"""Check that error is raised when voting="soft" and flatten_transform=False."""
X = [[1, 2], [3, 4], [5, 6]]
y = [0, 1, 2]
voting = VotingClassifier(
estimators=[
("lr", LogisticRegression(random_state=0)),
("tree", DecisionTreeClassifier(random_state=0)),
],
voting="soft",
flatten_transform=False,
)
voting.fit(X, y)
msg = (
"get_feature_names_out is not supported when `voting='soft'` and "
"`flatten_transform=False`"
)
with pytest.raises(ValueError, match=msg):
voting.get_feature_names_out()
# Metadata Routing Tests
# ======================
@pytest.mark.parametrize(
"Estimator, Child",
[(VotingClassifier, ConsumingClassifier), (VotingRegressor, ConsumingRegressor)],
)
def test_routing_passed_metadata_not_supported(Estimator, Child):
"""Test that the right error message is raised when metadata is passed while
not supported when `enable_metadata_routing=False`."""
X = np.array([[0, 1], [2, 2], [4, 6]])
y = [1, 2, 3]
with pytest.raises(
ValueError, match="is only supported if enable_metadata_routing=True"
):
Estimator(["clf", Child()]).fit(X, y, sample_weight=[1, 1, 1], metadata="a")
@pytest.mark.usefixtures("enable_slep006")
@pytest.mark.parametrize(
"Estimator, Child",
[(VotingClassifier, ConsumingClassifier), (VotingRegressor, ConsumingRegressor)],
)
def test_get_metadata_routing_without_fit(Estimator, Child):
    # Test that get_metadata_routing() doesn't raise when called before fit.
est = Estimator([("sub_est", Child())])
est.get_metadata_routing()
@pytest.mark.usefixtures("enable_slep006")
@pytest.mark.parametrize(
"Estimator, Child",
[(VotingClassifier, ConsumingClassifier), (VotingRegressor, ConsumingRegressor)],
)
@pytest.mark.parametrize("prop", ["sample_weight", "metadata"])
def test_metadata_routing_for_voting_estimators(Estimator, Child, prop):
"""Test that metadata is routed correctly for Voting*."""
X = np.array([[0, 1], [2, 2], [4, 6]])
y = [1, 2, 3]
sample_weight, metadata = [1, 1, 1], "a"
est = Estimator(
[
(
"sub_est1",
Child(registry=_Registry()).set_fit_request(**{prop: True}),
),
(
"sub_est2",
Child(registry=_Registry()).set_fit_request(**{prop: True}),
),
]
)
est.fit(X, y, **{prop: sample_weight if prop == "sample_weight" else metadata})
for estimator in est.estimators:
if prop == "sample_weight":
kwargs = {prop: sample_weight}
else:
kwargs = {prop: metadata}
# access sub-estimator in (name, est) with estimator[1]
registry = estimator[1].registry
assert len(registry)
for sub_est in registry:
check_recorded_metadata(obj=sub_est, method="fit", **kwargs)
@pytest.mark.usefixtures("enable_slep006")
@pytest.mark.parametrize(
"Estimator, Child",
[(VotingClassifier, ConsumingClassifier), (VotingRegressor, ConsumingRegressor)],
)
def test_metadata_routing_error_for_voting_estimators(Estimator, Child):
"""Test that the right error is raised when metadata is not requested."""
X = np.array([[0, 1], [2, 2], [4, 6]])
y = [1, 2, 3]
sample_weight, metadata = [1, 1, 1], "a"
est = Estimator([("sub_est", Child())])
error_message = (
"[sample_weight, metadata] are passed but are not explicitly set as requested"
f" or not requested for {Child.__name__}.fit"
)
with pytest.raises(ValueError, match=re.escape(error_message)):
est.fit(X, y, sample_weight=sample_weight, metadata=metadata)
# End of Metadata Routing Tests
# =============================


@ -0,0 +1,705 @@
"""Testing for the boost module (sklearn.ensemble.boost)."""
import re
import numpy as np
import pytest
from sklearn import datasets
from sklearn.base import BaseEstimator, clone
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
from sklearn.ensemble._weight_boosting import _samme_proba
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.utils import shuffle
from sklearn.utils._mocking import NoSampleWeightWrapper
from sklearn.utils._testing import (
assert_allclose,
assert_array_almost_equal,
assert_array_equal,
assert_array_less,
)
from sklearn.utils.fixes import (
COO_CONTAINERS,
CSC_CONTAINERS,
CSR_CONTAINERS,
DOK_CONTAINERS,
LIL_CONTAINERS,
)
# Common random state
rng = np.random.RandomState(0)
# Toy sample
X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
y_class = ["foo", "foo", "foo", 1, 1, 1] # test string class labels
y_regr = [-1, -1, -1, 1, 1, 1]
T = [[-1, -1], [2, 2], [3, 2]]
y_t_class = ["foo", 1, 1]
y_t_regr = [-1, 1, 1]
# Load the iris dataset and randomly permute it
iris = datasets.load_iris()
perm = rng.permutation(iris.target.size)
iris.data, iris.target = shuffle(iris.data, iris.target, random_state=rng)
# Load the diabetes dataset and randomly permute it
diabetes = datasets.load_diabetes()
diabetes.data, diabetes.target = shuffle(
diabetes.data, diabetes.target, random_state=rng
)
def test_samme_proba():
# Test the `_samme_proba` helper function.
# Define some example (bad) `predict_proba` output.
probs = np.array(
[[1, 1e-6, 0], [0.19, 0.6, 0.2], [-999, 0.51, 0.5], [1e-6, 1, 1e-9]]
)
probs /= np.abs(probs.sum(axis=1))[:, np.newaxis]
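    # the rows are normalized by the absolute value of their sum, so they are
    # deliberately not valid probability distributions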
# _samme_proba calls estimator.predict_proba.
# Make a mock object so I can control what gets returned.
class MockEstimator:
def predict_proba(self, X):
assert_array_equal(X.shape, probs.shape)
return probs
mock = MockEstimator()
samme_proba = _samme_proba(mock, 3, np.ones_like(probs))
assert_array_equal(samme_proba.shape, probs.shape)
assert np.isfinite(samme_proba).all()
# Make sure that the correct elements come out as smallest --
# `_samme_proba` should preserve the ordering in each example.
assert_array_equal(np.argmin(samme_proba, axis=1), [2, 0, 0, 2])
assert_array_equal(np.argmax(samme_proba, axis=1), [0, 1, 1, 1])
def test_oneclass_adaboost_proba():
# Test predict_proba robustness for one class label input.
# In response to issue #7501
# https://github.com/scikit-learn/scikit-learn/issues/7501
y_t = np.ones(len(X))
clf = AdaBoostClassifier(algorithm="SAMME").fit(X, y_t)
assert_array_almost_equal(clf.predict_proba(X), np.ones((len(X), 1)))
# TODO(1.6): remove "@pytest.mark.filterwarnings" as SAMME.R will be removed
# and substituted with the SAMME algorithm as a default; also re-write test to
# only consider "SAMME"
@pytest.mark.filterwarnings("ignore:The SAMME.R algorithm")
@pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"])
def test_classification_toy(algorithm):
# Check classification on a toy dataset.
clf = AdaBoostClassifier(algorithm=algorithm, random_state=0)
clf.fit(X, y_class)
assert_array_equal(clf.predict(T), y_t_class)
assert_array_equal(np.unique(np.asarray(y_t_class)), clf.classes_)
assert clf.predict_proba(T).shape == (len(T), 2)
assert clf.decision_function(T).shape == (len(T),)
def test_regression_toy():
    # Check regression on a toy dataset.
clf = AdaBoostRegressor(random_state=0)
clf.fit(X, y_regr)
assert_array_equal(clf.predict(T), y_t_regr)
# TODO(1.6): remove "@pytest.mark.filterwarnings" as SAMME.R will be removed
# and substituted with the SAMME algorithm as a default; also re-write test to
# only consider "SAMME"
@pytest.mark.filterwarnings("ignore:The SAMME.R algorithm")
def test_iris():
# Check consistency on dataset iris.
classes = np.unique(iris.target)
clf_samme = prob_samme = None
for alg in ["SAMME", "SAMME.R"]:
clf = AdaBoostClassifier(algorithm=alg)
clf.fit(iris.data, iris.target)
assert_array_equal(classes, clf.classes_)
proba = clf.predict_proba(iris.data)
if alg == "SAMME":
clf_samme = clf
prob_samme = proba
assert proba.shape[1] == len(classes)
assert clf.decision_function(iris.data).shape[1] == len(classes)
score = clf.score(iris.data, iris.target)
assert score > 0.9, "Failed with algorithm %s and score = %f" % (alg, score)
# Check we used multiple estimators
assert len(clf.estimators_) > 1
# Check for distinct random states (see issue #7408)
assert len(set(est.random_state for est in clf.estimators_)) == len(
clf.estimators_
)
# Somewhat hacky regression test: prior to
# ae7adc880d624615a34bafdb1d75ef67051b8200,
# predict_proba returned SAMME.R values for SAMME.
clf_samme.algorithm = "SAMME.R"
assert_array_less(0, np.abs(clf_samme.predict_proba(iris.data) - prob_samme))
@pytest.mark.parametrize("loss", ["linear", "square", "exponential"])
def test_diabetes(loss):
# Check consistency on dataset diabetes.
reg = AdaBoostRegressor(loss=loss, random_state=0)
reg.fit(diabetes.data, diabetes.target)
score = reg.score(diabetes.data, diabetes.target)
assert score > 0.55
# Check we used multiple estimators
assert len(reg.estimators_) > 1
# Check for distinct random states (see issue #7408)
assert len(set(est.random_state for est in reg.estimators_)) == len(reg.estimators_)
# TODO(1.6): remove "@pytest.mark.filterwarnings" as SAMME.R will be removed
# and substituted with the SAMME algorithm as a default; also re-write test to
# only consider "SAMME"
@pytest.mark.filterwarnings("ignore:The SAMME.R algorithm")
@pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"])
def test_staged_predict(algorithm):
# Check staged predictions.
rng = np.random.RandomState(0)
iris_weights = rng.randint(10, size=iris.target.shape)
diabetes_weights = rng.randint(10, size=diabetes.target.shape)
clf = AdaBoostClassifier(algorithm=algorithm, n_estimators=10)
clf.fit(iris.data, iris.target, sample_weight=iris_weights)
predictions = clf.predict(iris.data)
staged_predictions = [p for p in clf.staged_predict(iris.data)]
proba = clf.predict_proba(iris.data)
staged_probas = [p for p in clf.staged_predict_proba(iris.data)]
score = clf.score(iris.data, iris.target, sample_weight=iris_weights)
staged_scores = [
s for s in clf.staged_score(iris.data, iris.target, sample_weight=iris_weights)
]
assert len(staged_predictions) == 10
assert_array_almost_equal(predictions, staged_predictions[-1])
assert len(staged_probas) == 10
assert_array_almost_equal(proba, staged_probas[-1])
assert len(staged_scores) == 10
assert_array_almost_equal(score, staged_scores[-1])
# AdaBoost regression
clf = AdaBoostRegressor(n_estimators=10, random_state=0)
clf.fit(diabetes.data, diabetes.target, sample_weight=diabetes_weights)
predictions = clf.predict(diabetes.data)
staged_predictions = [p for p in clf.staged_predict(diabetes.data)]
score = clf.score(diabetes.data, diabetes.target, sample_weight=diabetes_weights)
staged_scores = [
s
for s in clf.staged_score(
diabetes.data, diabetes.target, sample_weight=diabetes_weights
)
]
assert len(staged_predictions) == 10
assert_array_almost_equal(predictions, staged_predictions[-1])
assert len(staged_scores) == 10
assert_array_almost_equal(score, staged_scores[-1])
def test_gridsearch():
# Check that base trees can be grid-searched.
# AdaBoost classification
boost = AdaBoostClassifier(estimator=DecisionTreeClassifier())
parameters = {
"n_estimators": (1, 2),
"estimator__max_depth": (1, 2),
"algorithm": ("SAMME", "SAMME.R"),
}
clf = GridSearchCV(boost, parameters)
clf.fit(iris.data, iris.target)
# AdaBoost regression
boost = AdaBoostRegressor(estimator=DecisionTreeRegressor(), random_state=0)
parameters = {"n_estimators": (1, 2), "estimator__max_depth": (1, 2)}
clf = GridSearchCV(boost, parameters)
clf.fit(diabetes.data, diabetes.target)
# TODO(1.6): remove "@pytest.mark.filterwarnings" as SAMME.R will be removed
# and substituted with the SAMME algorithm as a default; also re-write test to
# only consider "SAMME"
@pytest.mark.filterwarnings("ignore:The SAMME.R algorithm")
def test_pickle():
    # Check picklability.
import pickle
# Adaboost classifier
for alg in ["SAMME", "SAMME.R"]:
obj = AdaBoostClassifier(algorithm=alg)
obj.fit(iris.data, iris.target)
score = obj.score(iris.data, iris.target)
s = pickle.dumps(obj)
obj2 = pickle.loads(s)
assert type(obj2) == obj.__class__
score2 = obj2.score(iris.data, iris.target)
assert score == score2
# Adaboost regressor
obj = AdaBoostRegressor(random_state=0)
obj.fit(diabetes.data, diabetes.target)
score = obj.score(diabetes.data, diabetes.target)
s = pickle.dumps(obj)
obj2 = pickle.loads(s)
assert type(obj2) == obj.__class__
score2 = obj2.score(diabetes.data, diabetes.target)
assert score == score2
# TODO(1.6): remove "@pytest.mark.filterwarnings" as SAMME.R will be removed
# and substituted with the SAMME algorithm as a default; also re-write test to
# only consider "SAMME"
@pytest.mark.filterwarnings("ignore:The SAMME.R algorithm")
def test_importances():
# Check variable importances.
X, y = datasets.make_classification(
n_samples=2000,
n_features=10,
n_informative=3,
n_redundant=0,
n_repeated=0,
shuffle=False,
random_state=1,
)
for alg in ["SAMME", "SAMME.R"]:
clf = AdaBoostClassifier(algorithm=alg)
clf.fit(X, y)
importances = clf.feature_importances_
assert importances.shape[0] == 10
assert (importances[:3, np.newaxis] >= importances[3:]).all()
def test_adaboost_classifier_sample_weight_error():
    # Test that a proper exception is raised for an incorrectly shaped sample_weight.
clf = AdaBoostClassifier()
msg = re.escape("sample_weight.shape == (1,), expected (6,)")
with pytest.raises(ValueError, match=msg):
clf.fit(X, y_class, sample_weight=np.asarray([-1]))
def test_estimator():
# Test different estimators.
from sklearn.ensemble import RandomForestClassifier
# XXX doesn't work with y_class because RF doesn't support classes_
# Shouldn't AdaBoost run a LabelBinarizer?
clf = AdaBoostClassifier(RandomForestClassifier(), algorithm="SAMME")
clf.fit(X, y_regr)
clf = AdaBoostClassifier(SVC(), algorithm="SAMME")
clf.fit(X, y_class)
from sklearn.ensemble import RandomForestRegressor
clf = AdaBoostRegressor(RandomForestRegressor(), random_state=0)
clf.fit(X, y_regr)
clf = AdaBoostRegressor(SVR(), random_state=0)
clf.fit(X, y_regr)
# Check that an empty discrete ensemble fails in fit, not predict.
X_fail = [[1, 1], [1, 1], [1, 1], [1, 1]]
y_fail = ["foo", "bar", 1, 2]
clf = AdaBoostClassifier(SVC(), algorithm="SAMME")
with pytest.raises(ValueError, match="worse than random"):
clf.fit(X_fail, y_fail)
def test_sample_weights_infinite():
msg = "Sample weights have reached infinite values"
clf = AdaBoostClassifier(n_estimators=30, learning_rate=23.0, algorithm="SAMME")
with pytest.warns(UserWarning, match=msg):
clf.fit(iris.data, iris.target)
@pytest.mark.parametrize(
"sparse_container, expected_internal_type",
zip(
[
*CSC_CONTAINERS,
*CSR_CONTAINERS,
*LIL_CONTAINERS,
*COO_CONTAINERS,
*DOK_CONTAINERS,
],
CSC_CONTAINERS + 4 * CSR_CONTAINERS,
),
)
def test_sparse_classification(sparse_container, expected_internal_type):
# Check classification with sparse input.
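    # The parametrization pairs each sparse container with the format AdaBoost is
    # expected to convert it to internally: CSC input stays CSC, while CSR, LIL,
    # COO and DOK inputs are all handled as CSR.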
class CustomSVC(SVC):
"""SVC variant that records the nature of the training set."""
def fit(self, X, y, sample_weight=None):
"""Modification on fit caries data type for later verification."""
super().fit(X, y, sample_weight=sample_weight)
self.data_type_ = type(X)
return self
X, y = datasets.make_multilabel_classification(
n_classes=1, n_samples=15, n_features=5, random_state=42
)
# Flatten y to a 1d array
y = np.ravel(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
X_train_sparse = sparse_container(X_train)
X_test_sparse = sparse_container(X_test)
# Trained on sparse format
sparse_classifier = AdaBoostClassifier(
estimator=CustomSVC(probability=True),
random_state=1,
algorithm="SAMME",
).fit(X_train_sparse, y_train)
# Trained on dense format
dense_classifier = AdaBoostClassifier(
estimator=CustomSVC(probability=True),
random_state=1,
algorithm="SAMME",
).fit(X_train, y_train)
# predict
sparse_clf_results = sparse_classifier.predict(X_test_sparse)
dense_clf_results = dense_classifier.predict(X_test)
assert_array_equal(sparse_clf_results, dense_clf_results)
# decision_function
sparse_clf_results = sparse_classifier.decision_function(X_test_sparse)
dense_clf_results = dense_classifier.decision_function(X_test)
assert_array_almost_equal(sparse_clf_results, dense_clf_results)
# predict_log_proba
sparse_clf_results = sparse_classifier.predict_log_proba(X_test_sparse)
dense_clf_results = dense_classifier.predict_log_proba(X_test)
assert_array_almost_equal(sparse_clf_results, dense_clf_results)
# predict_proba
sparse_clf_results = sparse_classifier.predict_proba(X_test_sparse)
dense_clf_results = dense_classifier.predict_proba(X_test)
assert_array_almost_equal(sparse_clf_results, dense_clf_results)
# score
sparse_clf_results = sparse_classifier.score(X_test_sparse, y_test)
dense_clf_results = dense_classifier.score(X_test, y_test)
assert_array_almost_equal(sparse_clf_results, dense_clf_results)
# staged_decision_function
sparse_clf_results = sparse_classifier.staged_decision_function(X_test_sparse)
dense_clf_results = dense_classifier.staged_decision_function(X_test)
for sparse_clf_res, dense_clf_res in zip(sparse_clf_results, dense_clf_results):
assert_array_almost_equal(sparse_clf_res, dense_clf_res)
# staged_predict
sparse_clf_results = sparse_classifier.staged_predict(X_test_sparse)
dense_clf_results = dense_classifier.staged_predict(X_test)
for sparse_clf_res, dense_clf_res in zip(sparse_clf_results, dense_clf_results):
assert_array_equal(sparse_clf_res, dense_clf_res)
# staged_predict_proba
sparse_clf_results = sparse_classifier.staged_predict_proba(X_test_sparse)
dense_clf_results = dense_classifier.staged_predict_proba(X_test)
for sparse_clf_res, dense_clf_res in zip(sparse_clf_results, dense_clf_results):
assert_array_almost_equal(sparse_clf_res, dense_clf_res)
# staged_score
sparse_clf_results = sparse_classifier.staged_score(X_test_sparse, y_test)
dense_clf_results = dense_classifier.staged_score(X_test, y_test)
for sparse_clf_res, dense_clf_res in zip(sparse_clf_results, dense_clf_results):
assert_array_equal(sparse_clf_res, dense_clf_res)
# Verify sparsity of data is maintained during training
types = [i.data_type_ for i in sparse_classifier.estimators_]
assert all([t == expected_internal_type for t in types])
@pytest.mark.parametrize(
"sparse_container, expected_internal_type",
zip(
[
*CSC_CONTAINERS,
*CSR_CONTAINERS,
*LIL_CONTAINERS,
*COO_CONTAINERS,
*DOK_CONTAINERS,
],
CSC_CONTAINERS + 4 * CSR_CONTAINERS,
),
)
def test_sparse_regression(sparse_container, expected_internal_type):
# Check regression with sparse input.
class CustomSVR(SVR):
"""SVR variant that records the nature of the training set."""
def fit(self, X, y, sample_weight=None):
"""Modification on fit caries data type for later verification."""
super().fit(X, y, sample_weight=sample_weight)
self.data_type_ = type(X)
return self
X, y = datasets.make_regression(
n_samples=15, n_features=50, n_targets=1, random_state=42
)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
X_train_sparse = sparse_container(X_train)
X_test_sparse = sparse_container(X_test)
# Trained on sparse format
sparse_regressor = AdaBoostRegressor(estimator=CustomSVR(), random_state=1).fit(
X_train_sparse, y_train
)
# Trained on dense format
dense_regressor = AdaBoostRegressor(estimator=CustomSVR(), random_state=1).fit(
X_train, y_train
)
# predict
sparse_regr_results = sparse_regressor.predict(X_test_sparse)
dense_regr_results = dense_regressor.predict(X_test)
assert_array_almost_equal(sparse_regr_results, dense_regr_results)
# staged_predict
sparse_regr_results = sparse_regressor.staged_predict(X_test_sparse)
dense_regr_results = dense_regressor.staged_predict(X_test)
for sparse_regr_res, dense_regr_res in zip(sparse_regr_results, dense_regr_results):
assert_array_almost_equal(sparse_regr_res, dense_regr_res)
types = [i.data_type_ for i in sparse_regressor.estimators_]
assert all([t == expected_internal_type for t in types])
def test_sample_weight_adaboost_regressor():
"""
    AdaBoostRegressor should work without sample_weight support in the base
    estimator. The random weighted sampling is done internally in the _boost
    method of AdaBoostRegressor.
"""
class DummyEstimator(BaseEstimator):
def fit(self, X, y):
pass
def predict(self, X):
return np.zeros(X.shape[0])
boost = AdaBoostRegressor(DummyEstimator(), n_estimators=3)
boost.fit(X, y_regr)
assert len(boost.estimator_weights_) == len(boost.estimator_errors_)
def test_multidimensional_X():
"""
    Check that the AdaBoost estimators can work with an n-dimensional
    data matrix.
"""
rng = np.random.RandomState(0)
X = rng.randn(51, 3, 3)
yc = rng.choice([0, 1], 51)
yr = rng.randn(51)
boost = AdaBoostClassifier(
DummyClassifier(strategy="most_frequent"), algorithm="SAMME"
)
boost.fit(X, yc)
boost.predict(X)
boost.predict_proba(X)
boost = AdaBoostRegressor(DummyRegressor())
boost.fit(X, yr)
    boost.predict(X)


# TODO(1.6): remove "@pytest.mark.filterwarnings" as SAMME.R will be removed
# and substituted with the SAMME algorithm as a default; also re-write test to
# only consider "SAMME"
@pytest.mark.filterwarnings("ignore:The SAMME.R algorithm")
@pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"])
def test_adaboostclassifier_without_sample_weight(algorithm):
X, y = iris.data, iris.target
estimator = NoSampleWeightWrapper(DummyClassifier())
clf = AdaBoostClassifier(estimator=estimator, algorithm=algorithm)
err_msg = "{} doesn't support sample_weight".format(estimator.__class__.__name__)
with pytest.raises(ValueError, match=err_msg):
        clf.fit(X, y)


def test_adaboostregressor_sample_weight():
    # check that passing sample weights influences the error computed for a
    # weak learner
rng = np.random.RandomState(42)
X = np.linspace(0, 100, num=1000)
y = (0.8 * X + 0.2) + (rng.rand(X.shape[0]) * 0.0001)
X = X.reshape(-1, 1)
# add an arbitrary outlier
X[-1] *= 10
y[-1] = 10000
    # random_state=0 ensures that the underlying bootstrap will use the outlier
regr_no_outlier = AdaBoostRegressor(
estimator=LinearRegression(), n_estimators=1, random_state=0
)
regr_with_weight = clone(regr_no_outlier)
regr_with_outlier = clone(regr_no_outlier)
# fit 3 models:
# - a model containing the outlier
# - a model without the outlier
# - a model containing the outlier but with a null sample-weight
regr_with_outlier.fit(X, y)
regr_no_outlier.fit(X[:-1], y[:-1])
sample_weight = np.ones_like(y)
sample_weight[-1] = 0
regr_with_weight.fit(X, y, sample_weight=sample_weight)
score_with_outlier = regr_with_outlier.score(X[:-1], y[:-1])
score_no_outlier = regr_no_outlier.score(X[:-1], y[:-1])
score_with_weight = regr_with_weight.score(X[:-1], y[:-1])
assert score_with_outlier < score_no_outlier
assert score_with_outlier < score_with_weight
assert score_no_outlier == pytest.approx(score_with_weight)
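

# A small illustration (not part of the test above) of why a null sample
# weight behaves like dropping the sample: in a weighted least-squares fit the
# zero-weighted point contributes nothing to the loss. ``LinearRegression`` is
# the same weak learner used above; the helper name is illustrative only.
def _zero_weight_equals_dropping_sample(random_state=0):
    rng = np.random.RandomState(random_state)
    X = rng.rand(20, 1)
    y = 3 * X.ravel() + rng.rand(20)
    weights = np.ones_like(y)
    weights[-1] = 0  # nullify the last sample
    coef_weighted = LinearRegression().fit(X, y, sample_weight=weights).coef_
    coef_dropped = LinearRegression().fit(X[:-1], y[:-1]).coef_
    return np.allclose(coef_weighted, coef_dropped)

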
# TODO(1.6): remove "@pytest.mark.filterwarnings" as SAMME.R will be removed
# and substituted with the SAMME algorithm as a default; also re-write test to
# only consider "SAMME"
@pytest.mark.filterwarnings("ignore:The SAMME.R algorithm")
@pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"])
def test_adaboost_consistent_predict(algorithm):
# check that predict_proba and predict give consistent results
# regression test for:
# https://github.com/scikit-learn/scikit-learn/issues/14084
X_train, X_test, y_train, y_test = train_test_split(
*datasets.load_digits(return_X_y=True), random_state=42
)
model = AdaBoostClassifier(algorithm=algorithm, random_state=42)
model.fit(X_train, y_train)
assert_array_equal(
np.argmax(model.predict_proba(X_test), axis=1), model.predict(X_test)
    )


@pytest.mark.parametrize(
"model, X, y",
[
(AdaBoostClassifier(), iris.data, iris.target),
(AdaBoostRegressor(), diabetes.data, diabetes.target),
],
)
def test_adaboost_negative_weight_error(model, X, y):
sample_weight = np.ones_like(y)
sample_weight[-1] = -10
err_msg = "Negative values in data passed to `sample_weight`"
with pytest.raises(ValueError, match=err_msg):
        model.fit(X, y, sample_weight=sample_weight)


def test_adaboost_numerically_stable_feature_importance_with_small_weights():
    """Check that we do not create NaN feature importances with numerically
    unstable inputs.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/20320
    """
rng = np.random.RandomState(42)
X = rng.normal(size=(1000, 10))
y = rng.choice([0, 1], size=1000)
sample_weight = np.ones_like(y) * 1e-263
tree = DecisionTreeClassifier(max_depth=10, random_state=12)
ada_model = AdaBoostClassifier(
estimator=tree, n_estimators=20, algorithm="SAMME", random_state=12
)
ada_model.fit(X, y, sample_weight=sample_weight)
assert np.isnan(ada_model.feature_importances_).sum() == 0
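

# A generic numeric illustration (an assumption about the failure mode, not a
# trace of the original bug): normalising quantities on the order of 1e-263
# can underflow to zero, and a subsequent 0 / 0 division yields NaN, which is
# the kind of instability the non-regression test above guards against.
def _tiny_weight_underflow():
    tiny = np.full(3, 1e-263)
    squared = tiny**2  # underflows to exactly 0.0
    with np.errstate(invalid="ignore"):
        normalized = squared / squared.sum()  # 0 / 0 -> NaN
    return np.isnan(normalized).all()

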
# TODO(1.6): remove "@pytest.mark.filterwarnings" as SAMME.R will be removed
# and substituted with the SAMME algorithm as a default; also re-write test to
# only consider "SAMME"
@pytest.mark.filterwarnings("ignore:The SAMME.R algorithm")
@pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"])
def test_adaboost_decision_function(algorithm, global_random_seed):
"""Check that the decision function respects the symmetric constraint for weak
learners.
Non-regression test for:
https://github.com/scikit-learn/scikit-learn/issues/26520
"""
n_classes = 3
X, y = datasets.make_classification(
n_classes=n_classes, n_clusters_per_class=1, random_state=global_random_seed
)
clf = AdaBoostClassifier(
n_estimators=1, random_state=global_random_seed, algorithm=algorithm
).fit(X, y)
y_score = clf.decision_function(X)
assert_allclose(y_score.sum(axis=1), 0, atol=1e-8)
if algorithm == "SAMME":
# With a single learner, we expect to have a decision function in
# {1, - 1 / (n_classes - 1)}.
assert set(np.unique(y_score)) == {1, -1 / (n_classes - 1)}
    # We can assert the same for staged_decision_function since we have a
    # single learner.
for y_score in clf.staged_decision_function(X):
assert_allclose(y_score.sum(axis=1), 0, atol=1e-8)
if algorithm == "SAMME":
# With a single learner, we expect to have a decision function in
# {1, - 1 / (n_classes - 1)}.
assert set(np.unique(y_score)) == {1, -1 / (n_classes - 1)}
clf.set_params(n_estimators=5).fit(X, y)
y_score = clf.decision_function(X)
assert_allclose(y_score.sum(axis=1), 0, atol=1e-8)
for y_score in clf.staged_decision_function(X):
assert_allclose(y_score.sum(axis=1), 0, atol=1e-8)
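

# Illustrative helper (hypothetical, not scikit-learn code) encoding the
# symmetric-constraint intuition checked above: with SAMME, a single weak
# learner predicting class ``k`` contributes ``1`` in position ``k`` and
# ``-1 / (n_classes - 1)`` everywhere else, so each row of the decision
# function sums to zero by construction.
def _samme_symmetric_scores(predicted_class, n_classes):
    scores = np.full(n_classes, -1.0 / (n_classes - 1))
    scores[predicted_class] = 1.0
    return scores  # scores.sum() == 0

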
# TODO(1.6): remove
def test_deprecated_samme_r_algorithm():
adaboost_clf = AdaBoostClassifier(n_estimators=1)
with pytest.warns(
FutureWarning,
match=re.escape("The SAMME.R algorithm (the default) is deprecated"),
):
adaboost_clf.fit(X, y_class)