reconnect moved files to git repo
(10 binary files not shown)
@@ -0,0 +1,976 @@
"""
Testing for the bagging ensemble module (sklearn.ensemble.bagging).
"""

# Author: Gilles Louppe
# License: BSD 3 clause

from itertools import cycle, product

import joblib
import numpy as np
import pytest

import sklearn
from sklearn.base import BaseEstimator
from sklearn.datasets import load_diabetes, load_iris, make_hastie_10_2
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.ensemble import (
    AdaBoostClassifier,
    AdaBoostRegressor,
    BaggingClassifier,
    BaggingRegressor,
    HistGradientBoostingClassifier,
    HistGradientBoostingRegressor,
    RandomForestClassifier,
    RandomForestRegressor,
)
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.model_selection import GridSearchCV, ParameterGrid, train_test_split
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, scale
from sklearn.random_projection import SparseRandomProjection
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.utils import check_random_state
from sklearn.utils._testing import assert_array_almost_equal, assert_array_equal
from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS

rng = check_random_state(0)

# also load the iris dataset
# and randomly permute it
iris = load_iris()
perm = rng.permutation(iris.target.size)
iris.data = iris.data[perm]
iris.target = iris.target[perm]

# also load the diabetes dataset
# and randomly permute it
diabetes = load_diabetes()
perm = rng.permutation(diabetes.target.size)
diabetes.data = diabetes.data[perm]
diabetes.target = diabetes.target[perm]


def test_classification():
    # Check classification for various parameter settings.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(
        iris.data, iris.target, random_state=rng
    )
    grid = ParameterGrid(
        {
            "max_samples": [0.5, 1.0],
            "max_features": [1, 4],
            "bootstrap": [True, False],
            "bootstrap_features": [True, False],
        }
    )
    estimators = [
        None,
        DummyClassifier(),
        Perceptron(max_iter=20),
        DecisionTreeClassifier(max_depth=2),
        KNeighborsClassifier(),
        SVC(),
    ]
    # Try different parameter settings with different base classifiers without
    # doing the full cartesian product to keep the test durations low.
    for params, estimator in zip(grid, cycle(estimators)):
        BaggingClassifier(
            estimator=estimator,
            random_state=rng,
            n_estimators=2,
            **params,
        ).fit(X_train, y_train).predict(X_test)


@pytest.mark.parametrize(
|
||||
"sparse_container, params, method",
|
||||
product(
|
||||
CSR_CONTAINERS + CSC_CONTAINERS,
|
||||
[
|
||||
{
|
||||
"max_samples": 0.5,
|
||||
"max_features": 2,
|
||||
"bootstrap": True,
|
||||
"bootstrap_features": True,
|
||||
},
|
||||
{
|
||||
"max_samples": 1.0,
|
||||
"max_features": 4,
|
||||
"bootstrap": True,
|
||||
"bootstrap_features": True,
|
||||
},
|
||||
{"max_features": 2, "bootstrap": False, "bootstrap_features": True},
|
||||
{"max_samples": 0.5, "bootstrap": True, "bootstrap_features": False},
|
||||
],
|
||||
["predict", "predict_proba", "predict_log_proba", "decision_function"],
|
||||
),
|
||||
)
|
||||
def test_sparse_classification(sparse_container, params, method):
|
||||
# Check classification for various parameter settings on sparse input.
|
||||
|
||||
class CustomSVC(SVC):
|
||||
"""SVC variant that records the nature of the training set"""
|
||||
|
||||
def fit(self, X, y):
|
||||
super().fit(X, y)
|
||||
self.data_type_ = type(X)
|
||||
return self
|
||||
|
||||
rng = check_random_state(0)
|
||||
X_train, X_test, y_train, y_test = train_test_split(
|
||||
scale(iris.data), iris.target, random_state=rng
|
||||
)
|
||||
|
||||
X_train_sparse = sparse_container(X_train)
|
||||
X_test_sparse = sparse_container(X_test)
|
||||
# Trained on sparse format
|
||||
sparse_classifier = BaggingClassifier(
|
||||
estimator=CustomSVC(kernel="linear", decision_function_shape="ovr"),
|
||||
random_state=1,
|
||||
**params,
|
||||
).fit(X_train_sparse, y_train)
|
||||
sparse_results = getattr(sparse_classifier, method)(X_test_sparse)
|
||||
|
||||
# Trained on dense format
|
||||
dense_classifier = BaggingClassifier(
|
||||
estimator=CustomSVC(kernel="linear", decision_function_shape="ovr"),
|
||||
random_state=1,
|
||||
**params,
|
||||
).fit(X_train, y_train)
|
||||
dense_results = getattr(dense_classifier, method)(X_test)
|
||||
assert_array_almost_equal(sparse_results, dense_results)
|
||||
|
||||
sparse_type = type(X_train_sparse)
|
||||
types = [i.data_type_ for i in sparse_classifier.estimators_]
|
||||
|
||||
assert all([t == sparse_type for t in types])
|
||||
|
||||
|
||||
def test_regression():
|
||||
# Check regression for various parameter settings.
|
||||
rng = check_random_state(0)
|
||||
X_train, X_test, y_train, y_test = train_test_split(
|
||||
diabetes.data[:50], diabetes.target[:50], random_state=rng
|
||||
)
|
||||
grid = ParameterGrid(
|
||||
{
|
||||
"max_samples": [0.5, 1.0],
|
||||
"max_features": [0.5, 1.0],
|
||||
"bootstrap": [True, False],
|
||||
"bootstrap_features": [True, False],
|
||||
}
|
||||
)
|
||||
|
||||
for estimator in [
|
||||
None,
|
||||
DummyRegressor(),
|
||||
DecisionTreeRegressor(),
|
||||
KNeighborsRegressor(),
|
||||
SVR(),
|
||||
]:
|
||||
for params in grid:
|
||||
BaggingRegressor(estimator=estimator, random_state=rng, **params).fit(
|
||||
X_train, y_train
|
||||
).predict(X_test)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS)
|
||||
def test_sparse_regression(sparse_container):
|
||||
# Check regression for various parameter settings on sparse input.
|
||||
rng = check_random_state(0)
|
||||
X_train, X_test, y_train, y_test = train_test_split(
|
||||
diabetes.data[:50], diabetes.target[:50], random_state=rng
|
||||
)
|
||||
|
||||
class CustomSVR(SVR):
|
||||
"""SVC variant that records the nature of the training set"""
|
||||
|
||||
def fit(self, X, y):
|
||||
super().fit(X, y)
|
||||
self.data_type_ = type(X)
|
||||
return self
|
||||
|
||||
parameter_sets = [
|
||||
{
|
||||
"max_samples": 0.5,
|
||||
"max_features": 2,
|
||||
"bootstrap": True,
|
||||
"bootstrap_features": True,
|
||||
},
|
||||
{
|
||||
"max_samples": 1.0,
|
||||
"max_features": 4,
|
||||
"bootstrap": True,
|
||||
"bootstrap_features": True,
|
||||
},
|
||||
{"max_features": 2, "bootstrap": False, "bootstrap_features": True},
|
||||
{"max_samples": 0.5, "bootstrap": True, "bootstrap_features": False},
|
||||
]
|
||||
|
||||
X_train_sparse = sparse_container(X_train)
|
||||
X_test_sparse = sparse_container(X_test)
|
||||
for params in parameter_sets:
|
||||
# Trained on sparse format
|
||||
sparse_classifier = BaggingRegressor(
|
||||
estimator=CustomSVR(), random_state=1, **params
|
||||
).fit(X_train_sparse, y_train)
|
||||
sparse_results = sparse_classifier.predict(X_test_sparse)
|
||||
|
||||
# Trained on dense format
|
||||
dense_results = (
|
||||
BaggingRegressor(estimator=CustomSVR(), random_state=1, **params)
|
||||
.fit(X_train, y_train)
|
||||
.predict(X_test)
|
||||
)
|
||||
|
||||
sparse_type = type(X_train_sparse)
|
||||
types = [i.data_type_ for i in sparse_classifier.estimators_]
|
||||
|
||||
assert_array_almost_equal(sparse_results, dense_results)
|
||||
assert all([t == sparse_type for t in types])
|
||||
assert_array_almost_equal(sparse_results, dense_results)
|
||||
|
||||
|
||||
class DummySizeEstimator(BaseEstimator):
|
||||
def fit(self, X, y):
|
||||
self.training_size_ = X.shape[0]
|
||||
self.training_hash_ = joblib.hash(X)
|
||||
|
||||
def predict(self, X):
|
||||
return np.ones(X.shape[0])
|
||||
|
||||
|
||||
def test_bootstrap_samples():
|
||||
# Test that bootstrapping samples generate non-perfect base estimators.
|
||||
rng = check_random_state(0)
|
||||
X_train, X_test, y_train, y_test = train_test_split(
|
||||
diabetes.data, diabetes.target, random_state=rng
|
||||
)
|
||||
|
||||
estimator = DecisionTreeRegressor().fit(X_train, y_train)
|
||||
|
||||
# without bootstrap, all trees are perfect on the training set
|
||||
ensemble = BaggingRegressor(
|
||||
estimator=DecisionTreeRegressor(),
|
||||
max_samples=1.0,
|
||||
bootstrap=False,
|
||||
random_state=rng,
|
||||
).fit(X_train, y_train)
|
||||
|
||||
assert estimator.score(X_train, y_train) == ensemble.score(X_train, y_train)
|
||||
|
||||
# with bootstrap, trees are no longer perfect on the training set
|
||||
ensemble = BaggingRegressor(
|
||||
estimator=DecisionTreeRegressor(),
|
||||
max_samples=1.0,
|
||||
bootstrap=True,
|
||||
random_state=rng,
|
||||
).fit(X_train, y_train)
|
||||
|
||||
assert estimator.score(X_train, y_train) > ensemble.score(X_train, y_train)
|
||||
|
||||
# check that each sampling correspond to a complete bootstrap resample.
|
||||
# the size of each bootstrap should be the same as the input data but
|
||||
# the data should be different (checked using the hash of the data).
|
||||
ensemble = BaggingRegressor(estimator=DummySizeEstimator(), bootstrap=True).fit(
|
||||
X_train, y_train
|
||||
)
|
||||
training_hash = []
|
||||
for estimator in ensemble.estimators_:
|
||||
assert estimator.training_size_ == X_train.shape[0]
|
||||
training_hash.append(estimator.training_hash_)
|
||||
assert len(set(training_hash)) == len(training_hash)
|
||||
|
||||
|
||||
def test_bootstrap_features():
|
||||
# Test that bootstrapping features may generate duplicate features.
|
||||
rng = check_random_state(0)
|
||||
X_train, X_test, y_train, y_test = train_test_split(
|
||||
diabetes.data, diabetes.target, random_state=rng
|
||||
)
|
||||
|
||||
ensemble = BaggingRegressor(
|
||||
estimator=DecisionTreeRegressor(),
|
||||
max_features=1.0,
|
||||
bootstrap_features=False,
|
||||
random_state=rng,
|
||||
).fit(X_train, y_train)
|
||||
|
||||
for features in ensemble.estimators_features_:
|
||||
assert diabetes.data.shape[1] == np.unique(features).shape[0]
|
||||
|
||||
ensemble = BaggingRegressor(
|
||||
estimator=DecisionTreeRegressor(),
|
||||
max_features=1.0,
|
||||
bootstrap_features=True,
|
||||
random_state=rng,
|
||||
).fit(X_train, y_train)
|
||||
|
||||
for features in ensemble.estimators_features_:
|
||||
assert diabetes.data.shape[1] > np.unique(features).shape[0]
|
||||
|
||||
|
||||
def test_probability():
|
||||
# Predict probabilities.
|
||||
rng = check_random_state(0)
|
||||
X_train, X_test, y_train, y_test = train_test_split(
|
||||
iris.data, iris.target, random_state=rng
|
||||
)
|
||||
|
||||
with np.errstate(divide="ignore", invalid="ignore"):
|
||||
# Normal case
|
||||
ensemble = BaggingClassifier(
|
||||
estimator=DecisionTreeClassifier(), random_state=rng
|
||||
).fit(X_train, y_train)
|
||||
|
||||
assert_array_almost_equal(
|
||||
np.sum(ensemble.predict_proba(X_test), axis=1), np.ones(len(X_test))
|
||||
)
|
||||
|
||||
assert_array_almost_equal(
|
||||
ensemble.predict_proba(X_test), np.exp(ensemble.predict_log_proba(X_test))
|
||||
)
|
||||
|
||||
# Degenerate case, where some classes are missing
|
||||
ensemble = BaggingClassifier(
|
||||
estimator=LogisticRegression(), random_state=rng, max_samples=5
|
||||
).fit(X_train, y_train)
|
||||
|
||||
assert_array_almost_equal(
|
||||
np.sum(ensemble.predict_proba(X_test), axis=1), np.ones(len(X_test))
|
||||
)
|
||||
|
||||
assert_array_almost_equal(
|
||||
ensemble.predict_proba(X_test), np.exp(ensemble.predict_log_proba(X_test))
|
||||
)
|
||||
|
||||
|
||||
def test_oob_score_classification():
|
||||
# Check that oob prediction is a good estimation of the generalization
|
||||
# error.
|
||||
rng = check_random_state(0)
|
||||
X_train, X_test, y_train, y_test = train_test_split(
|
||||
iris.data, iris.target, random_state=rng
|
||||
)
|
||||
|
||||
for estimator in [DecisionTreeClassifier(), SVC()]:
|
||||
clf = BaggingClassifier(
|
||||
estimator=estimator,
|
||||
n_estimators=100,
|
||||
bootstrap=True,
|
||||
oob_score=True,
|
||||
random_state=rng,
|
||||
).fit(X_train, y_train)
|
||||
|
||||
test_score = clf.score(X_test, y_test)
|
||||
|
||||
assert abs(test_score - clf.oob_score_) < 0.1
|
||||
|
||||
# Test with few estimators
|
||||
warn_msg = (
|
||||
"Some inputs do not have OOB scores. This probably means too few "
|
||||
"estimators were used to compute any reliable oob estimates."
|
||||
)
|
||||
with pytest.warns(UserWarning, match=warn_msg):
|
||||
clf = BaggingClassifier(
|
||||
estimator=estimator,
|
||||
n_estimators=1,
|
||||
bootstrap=True,
|
||||
oob_score=True,
|
||||
random_state=rng,
|
||||
)
|
||||
clf.fit(X_train, y_train)
|
||||
|
||||
|
||||
def test_oob_score_regression():
|
||||
# Check that oob prediction is a good estimation of the generalization
|
||||
# error.
|
||||
rng = check_random_state(0)
|
||||
X_train, X_test, y_train, y_test = train_test_split(
|
||||
diabetes.data, diabetes.target, random_state=rng
|
||||
)
|
||||
|
||||
clf = BaggingRegressor(
|
||||
estimator=DecisionTreeRegressor(),
|
||||
n_estimators=50,
|
||||
bootstrap=True,
|
||||
oob_score=True,
|
||||
random_state=rng,
|
||||
).fit(X_train, y_train)
|
||||
|
||||
test_score = clf.score(X_test, y_test)
|
||||
|
||||
assert abs(test_score - clf.oob_score_) < 0.1
|
||||
|
||||
# Test with few estimators
|
||||
warn_msg = (
|
||||
"Some inputs do not have OOB scores. This probably means too few "
|
||||
"estimators were used to compute any reliable oob estimates."
|
||||
)
|
||||
with pytest.warns(UserWarning, match=warn_msg):
|
||||
regr = BaggingRegressor(
|
||||
estimator=DecisionTreeRegressor(),
|
||||
n_estimators=1,
|
||||
bootstrap=True,
|
||||
oob_score=True,
|
||||
random_state=rng,
|
||||
)
|
||||
regr.fit(X_train, y_train)
|
||||
|
||||
|
||||
def test_single_estimator():
|
||||
# Check singleton ensembles.
|
||||
rng = check_random_state(0)
|
||||
X_train, X_test, y_train, y_test = train_test_split(
|
||||
diabetes.data, diabetes.target, random_state=rng
|
||||
)
|
||||
|
||||
clf1 = BaggingRegressor(
|
||||
estimator=KNeighborsRegressor(),
|
||||
n_estimators=1,
|
||||
bootstrap=False,
|
||||
bootstrap_features=False,
|
||||
random_state=rng,
|
||||
).fit(X_train, y_train)
|
||||
|
||||
clf2 = KNeighborsRegressor().fit(X_train, y_train)
|
||||
|
||||
assert_array_almost_equal(clf1.predict(X_test), clf2.predict(X_test))
|
||||
|
||||
|
||||
def test_error():
|
||||
# Test support of decision_function
|
||||
X, y = iris.data, iris.target
|
||||
base = DecisionTreeClassifier()
|
||||
assert not hasattr(BaggingClassifier(base).fit(X, y), "decision_function")
|
||||
|
||||
|
||||
def test_parallel_classification():
|
||||
# Check parallel classification.
|
||||
X_train, X_test, y_train, y_test = train_test_split(
|
||||
iris.data, iris.target, random_state=0
|
||||
)
|
||||
|
||||
ensemble = BaggingClassifier(
|
||||
DecisionTreeClassifier(), n_jobs=3, random_state=0
|
||||
).fit(X_train, y_train)
|
||||
|
||||
# predict_proba
|
||||
y1 = ensemble.predict_proba(X_test)
|
||||
ensemble.set_params(n_jobs=1)
|
||||
y2 = ensemble.predict_proba(X_test)
|
||||
assert_array_almost_equal(y1, y2)
|
||||
|
||||
ensemble = BaggingClassifier(
|
||||
DecisionTreeClassifier(), n_jobs=1, random_state=0
|
||||
).fit(X_train, y_train)
|
||||
|
||||
y3 = ensemble.predict_proba(X_test)
|
||||
assert_array_almost_equal(y1, y3)
|
||||
|
||||
# decision_function
|
||||
ensemble = BaggingClassifier(
|
||||
SVC(decision_function_shape="ovr"), n_jobs=3, random_state=0
|
||||
).fit(X_train, y_train)
|
||||
|
||||
decisions1 = ensemble.decision_function(X_test)
|
||||
ensemble.set_params(n_jobs=1)
|
||||
decisions2 = ensemble.decision_function(X_test)
|
||||
assert_array_almost_equal(decisions1, decisions2)
|
||||
|
||||
ensemble = BaggingClassifier(
|
||||
SVC(decision_function_shape="ovr"), n_jobs=1, random_state=0
|
||||
).fit(X_train, y_train)
|
||||
|
||||
decisions3 = ensemble.decision_function(X_test)
|
||||
assert_array_almost_equal(decisions1, decisions3)
|
||||
|
||||
|
||||
def test_parallel_regression():
|
||||
# Check parallel regression.
|
||||
rng = check_random_state(0)
|
||||
|
||||
X_train, X_test, y_train, y_test = train_test_split(
|
||||
diabetes.data, diabetes.target, random_state=rng
|
||||
)
|
||||
|
||||
ensemble = BaggingRegressor(DecisionTreeRegressor(), n_jobs=3, random_state=0).fit(
|
||||
X_train, y_train
|
||||
)
|
||||
|
||||
ensemble.set_params(n_jobs=1)
|
||||
y1 = ensemble.predict(X_test)
|
||||
ensemble.set_params(n_jobs=2)
|
||||
y2 = ensemble.predict(X_test)
|
||||
assert_array_almost_equal(y1, y2)
|
||||
|
||||
ensemble = BaggingRegressor(DecisionTreeRegressor(), n_jobs=1, random_state=0).fit(
|
||||
X_train, y_train
|
||||
)
|
||||
|
||||
y3 = ensemble.predict(X_test)
|
||||
assert_array_almost_equal(y1, y3)
|
||||
|
||||
|
||||
def test_gridsearch():
|
||||
# Check that bagging ensembles can be grid-searched.
|
||||
# Transform iris into a binary classification task
|
||||
X, y = iris.data, iris.target
|
||||
y[y == 2] = 1
|
||||
|
||||
# Grid search with scoring based on decision_function
|
||||
parameters = {"n_estimators": (1, 2), "estimator__C": (1, 2)}
|
||||
|
||||
GridSearchCV(BaggingClassifier(SVC()), parameters, scoring="roc_auc").fit(X, y)
|
||||
|
||||
|
||||
def test_estimator():
|
||||
# Check estimator and its default values.
|
||||
rng = check_random_state(0)
|
||||
|
||||
# Classification
|
||||
X_train, X_test, y_train, y_test = train_test_split(
|
||||
iris.data, iris.target, random_state=rng
|
||||
)
|
||||
|
||||
ensemble = BaggingClassifier(None, n_jobs=3, random_state=0).fit(X_train, y_train)
|
||||
|
||||
assert isinstance(ensemble.estimator_, DecisionTreeClassifier)
|
||||
|
||||
ensemble = BaggingClassifier(
|
||||
DecisionTreeClassifier(), n_jobs=3, random_state=0
|
||||
).fit(X_train, y_train)
|
||||
|
||||
assert isinstance(ensemble.estimator_, DecisionTreeClassifier)
|
||||
|
||||
ensemble = BaggingClassifier(Perceptron(), n_jobs=3, random_state=0).fit(
|
||||
X_train, y_train
|
||||
)
|
||||
|
||||
assert isinstance(ensemble.estimator_, Perceptron)
|
||||
|
||||
# Regression
|
||||
X_train, X_test, y_train, y_test = train_test_split(
|
||||
diabetes.data, diabetes.target, random_state=rng
|
||||
)
|
||||
|
||||
ensemble = BaggingRegressor(None, n_jobs=3, random_state=0).fit(X_train, y_train)
|
||||
|
||||
assert isinstance(ensemble.estimator_, DecisionTreeRegressor)
|
||||
|
||||
ensemble = BaggingRegressor(DecisionTreeRegressor(), n_jobs=3, random_state=0).fit(
|
||||
X_train, y_train
|
||||
)
|
||||
|
||||
assert isinstance(ensemble.estimator_, DecisionTreeRegressor)
|
||||
|
||||
ensemble = BaggingRegressor(SVR(), n_jobs=3, random_state=0).fit(X_train, y_train)
|
||||
assert isinstance(ensemble.estimator_, SVR)
|
||||
|
||||
|
||||
def test_bagging_with_pipeline():
|
||||
estimator = BaggingClassifier(
|
||||
make_pipeline(SelectKBest(k=1), DecisionTreeClassifier()), max_features=2
|
||||
)
|
||||
estimator.fit(iris.data, iris.target)
|
||||
assert isinstance(estimator[0].steps[-1][1].random_state, int)
|
||||
|
||||
|
||||
class DummyZeroEstimator(BaseEstimator):
|
||||
def fit(self, X, y):
|
||||
self.classes_ = np.unique(y)
|
||||
return self
|
||||
|
||||
def predict(self, X):
|
||||
return self.classes_[np.zeros(X.shape[0], dtype=int)]
|
||||
|
||||
|
||||
def test_bagging_sample_weight_unsupported_but_passed():
|
||||
estimator = BaggingClassifier(DummyZeroEstimator())
|
||||
rng = check_random_state(0)
|
||||
|
||||
estimator.fit(iris.data, iris.target).predict(iris.data)
|
||||
with pytest.raises(ValueError):
|
||||
estimator.fit(
|
||||
iris.data,
|
||||
iris.target,
|
||||
sample_weight=rng.randint(10, size=(iris.data.shape[0])),
|
||||
)
|
||||
|
||||
|
||||
def test_warm_start(random_state=42):
|
||||
# Test if fitting incrementally with warm start gives a forest of the
|
||||
# right size and the same results as a normal fit.
|
||||
X, y = make_hastie_10_2(n_samples=20, random_state=1)
|
||||
|
||||
clf_ws = None
|
||||
for n_estimators in [5, 10]:
|
||||
if clf_ws is None:
|
||||
clf_ws = BaggingClassifier(
|
||||
n_estimators=n_estimators, random_state=random_state, warm_start=True
|
||||
)
|
||||
else:
|
||||
clf_ws.set_params(n_estimators=n_estimators)
|
||||
clf_ws.fit(X, y)
|
||||
assert len(clf_ws) == n_estimators
|
||||
|
||||
clf_no_ws = BaggingClassifier(
|
||||
n_estimators=10, random_state=random_state, warm_start=False
|
||||
)
|
||||
clf_no_ws.fit(X, y)
|
||||
|
||||
assert set([tree.random_state for tree in clf_ws]) == set(
|
||||
[tree.random_state for tree in clf_no_ws]
|
||||
)
|
||||
|
||||
|
||||
def test_warm_start_smaller_n_estimators():
|
||||
# Test if warm start'ed second fit with smaller n_estimators raises error.
|
||||
X, y = make_hastie_10_2(n_samples=20, random_state=1)
|
||||
clf = BaggingClassifier(n_estimators=5, warm_start=True)
|
||||
clf.fit(X, y)
|
||||
clf.set_params(n_estimators=4)
|
||||
with pytest.raises(ValueError):
|
||||
clf.fit(X, y)
|
||||
|
||||
|
||||
def test_warm_start_equal_n_estimators():
|
||||
# Test that nothing happens when fitting without increasing n_estimators
|
||||
X, y = make_hastie_10_2(n_samples=20, random_state=1)
|
||||
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)
|
||||
|
||||
clf = BaggingClassifier(n_estimators=5, warm_start=True, random_state=83)
|
||||
clf.fit(X_train, y_train)
|
||||
|
||||
y_pred = clf.predict(X_test)
|
||||
# modify X to nonsense values, this should not change anything
|
||||
X_train += 1.0
|
||||
|
||||
warn_msg = "Warm-start fitting without increasing n_estimators does not"
|
||||
with pytest.warns(UserWarning, match=warn_msg):
|
||||
clf.fit(X_train, y_train)
|
||||
assert_array_equal(y_pred, clf.predict(X_test))
|
||||
|
||||
|
||||
def test_warm_start_equivalence():
|
||||
# warm started classifier with 5+5 estimators should be equivalent to
|
||||
# one classifier with 10 estimators
|
||||
X, y = make_hastie_10_2(n_samples=20, random_state=1)
|
||||
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)
|
||||
|
||||
clf_ws = BaggingClassifier(n_estimators=5, warm_start=True, random_state=3141)
|
||||
clf_ws.fit(X_train, y_train)
|
||||
clf_ws.set_params(n_estimators=10)
|
||||
clf_ws.fit(X_train, y_train)
|
||||
y1 = clf_ws.predict(X_test)
|
||||
|
||||
clf = BaggingClassifier(n_estimators=10, warm_start=False, random_state=3141)
|
||||
clf.fit(X_train, y_train)
|
||||
y2 = clf.predict(X_test)
|
||||
|
||||
assert_array_almost_equal(y1, y2)
|
||||
|
||||
|
||||
def test_warm_start_with_oob_score_fails():
|
||||
# Check using oob_score and warm_start simultaneously fails
|
||||
X, y = make_hastie_10_2(n_samples=20, random_state=1)
|
||||
clf = BaggingClassifier(n_estimators=5, warm_start=True, oob_score=True)
|
||||
with pytest.raises(ValueError):
|
||||
clf.fit(X, y)
|
||||
|
||||
|
||||
def test_oob_score_removed_on_warm_start():
|
||||
X, y = make_hastie_10_2(n_samples=100, random_state=1)
|
||||
|
||||
clf = BaggingClassifier(n_estimators=5, oob_score=True)
|
||||
clf.fit(X, y)
|
||||
|
||||
clf.set_params(warm_start=True, oob_score=False, n_estimators=10)
|
||||
clf.fit(X, y)
|
||||
|
||||
with pytest.raises(AttributeError):
|
||||
getattr(clf, "oob_score_")
|
||||
|
||||
|
||||
def test_oob_score_consistency():
|
||||
# Make sure OOB scores are identical when random_state, estimator, and
|
||||
# training data are fixed and fitting is done twice
|
||||
X, y = make_hastie_10_2(n_samples=200, random_state=1)
|
||||
bagging = BaggingClassifier(
|
||||
KNeighborsClassifier(),
|
||||
max_samples=0.5,
|
||||
max_features=0.5,
|
||||
oob_score=True,
|
||||
random_state=1,
|
||||
)
|
||||
assert bagging.fit(X, y).oob_score_ == bagging.fit(X, y).oob_score_
|
||||
|
||||
|
||||
def test_estimators_samples():
|
||||
# Check that format of estimators_samples_ is correct and that results
|
||||
# generated at fit time can be identically reproduced at a later time
|
||||
# using data saved in object attributes.
|
||||
X, y = make_hastie_10_2(n_samples=200, random_state=1)
|
||||
bagging = BaggingClassifier(
|
||||
LogisticRegression(),
|
||||
max_samples=0.5,
|
||||
max_features=0.5,
|
||||
random_state=1,
|
||||
bootstrap=False,
|
||||
)
|
||||
bagging.fit(X, y)
|
||||
|
||||
# Get relevant attributes
|
||||
estimators_samples = bagging.estimators_samples_
|
||||
estimators_features = bagging.estimators_features_
|
||||
estimators = bagging.estimators_
|
||||
|
||||
# Test for correct formatting
|
||||
assert len(estimators_samples) == len(estimators)
|
||||
assert len(estimators_samples[0]) == len(X) // 2
|
||||
assert estimators_samples[0].dtype.kind == "i"
|
||||
|
||||
# Re-fit single estimator to test for consistent sampling
|
||||
estimator_index = 0
|
||||
estimator_samples = estimators_samples[estimator_index]
|
||||
estimator_features = estimators_features[estimator_index]
|
||||
estimator = estimators[estimator_index]
|
||||
|
||||
X_train = (X[estimator_samples])[:, estimator_features]
|
||||
y_train = y[estimator_samples]
|
||||
|
||||
orig_coefs = estimator.coef_
|
||||
estimator.fit(X_train, y_train)
|
||||
new_coefs = estimator.coef_
|
||||
|
||||
assert_array_almost_equal(orig_coefs, new_coefs)
|
||||
|
||||
|
||||
def test_estimators_samples_deterministic():
|
||||
# This test is a regression test to check that with a random step
|
||||
# (e.g. SparseRandomProjection) and a given random state, the results
|
||||
# generated at fit time can be identically reproduced at a later time using
|
||||
# data saved in object attributes. Check issue #9524 for full discussion.
|
||||
|
||||
iris = load_iris()
|
||||
X, y = iris.data, iris.target
|
||||
|
||||
base_pipeline = make_pipeline(
|
||||
SparseRandomProjection(n_components=2), LogisticRegression()
|
||||
)
|
||||
clf = BaggingClassifier(estimator=base_pipeline, max_samples=0.5, random_state=0)
|
||||
clf.fit(X, y)
|
||||
pipeline_estimator_coef = clf.estimators_[0].steps[-1][1].coef_.copy()
|
||||
|
||||
estimator = clf.estimators_[0]
|
||||
estimator_sample = clf.estimators_samples_[0]
|
||||
estimator_feature = clf.estimators_features_[0]
|
||||
|
||||
X_train = (X[estimator_sample])[:, estimator_feature]
|
||||
y_train = y[estimator_sample]
|
||||
|
||||
estimator.fit(X_train, y_train)
|
||||
assert_array_equal(estimator.steps[-1][1].coef_, pipeline_estimator_coef)
|
||||
|
||||
|
||||
def test_max_samples_consistency():
|
||||
# Make sure validated max_samples and original max_samples are identical
|
||||
# when valid integer max_samples supplied by user
|
||||
max_samples = 100
|
||||
X, y = make_hastie_10_2(n_samples=2 * max_samples, random_state=1)
|
||||
bagging = BaggingClassifier(
|
||||
KNeighborsClassifier(),
|
||||
max_samples=max_samples,
|
||||
max_features=0.5,
|
||||
random_state=1,
|
||||
)
|
||||
bagging.fit(X, y)
|
||||
assert bagging._max_samples == max_samples
|
||||
|
||||
|
||||
def test_set_oob_score_label_encoding():
|
||||
# Make sure the oob_score doesn't change when the labels change
|
||||
# See: https://github.com/scikit-learn/scikit-learn/issues/8933
|
||||
random_state = 5
|
||||
X = [[-1], [0], [1]] * 5
|
||||
Y1 = ["A", "B", "C"] * 5
|
||||
Y2 = [-1, 0, 1] * 5
|
||||
Y3 = [0, 1, 2] * 5
|
||||
x1 = (
|
||||
BaggingClassifier(oob_score=True, random_state=random_state)
|
||||
.fit(X, Y1)
|
||||
.oob_score_
|
||||
)
|
||||
x2 = (
|
||||
BaggingClassifier(oob_score=True, random_state=random_state)
|
||||
.fit(X, Y2)
|
||||
.oob_score_
|
||||
)
|
||||
x3 = (
|
||||
BaggingClassifier(oob_score=True, random_state=random_state)
|
||||
.fit(X, Y3)
|
||||
.oob_score_
|
||||
)
|
||||
assert [x1, x2] == [x3, x3]
|
||||
|
||||
|
||||
def replace(X):
|
||||
X = X.astype("float", copy=True)
|
||||
X[~np.isfinite(X)] = 0
|
||||
return X
|
||||
|
||||
|
||||
def test_bagging_regressor_with_missing_inputs():
|
||||
# Check that BaggingRegressor can accept X with missing/infinite data
|
||||
X = np.array(
|
||||
[
|
||||
[1, 3, 5],
|
||||
[2, None, 6],
|
||||
[2, np.nan, 6],
|
||||
[2, np.inf, 6],
|
||||
[2, -np.inf, 6],
|
||||
]
|
||||
)
|
||||
y_values = [
|
||||
np.array([2, 3, 3, 3, 3]),
|
||||
np.array(
|
||||
[
|
||||
[2, 1, 9],
|
||||
[3, 6, 8],
|
||||
[3, 6, 8],
|
||||
[3, 6, 8],
|
||||
[3, 6, 8],
|
||||
]
|
||||
),
|
||||
]
|
||||
for y in y_values:
|
||||
regressor = DecisionTreeRegressor()
|
||||
pipeline = make_pipeline(FunctionTransformer(replace), regressor)
|
||||
pipeline.fit(X, y).predict(X)
|
||||
bagging_regressor = BaggingRegressor(pipeline)
|
||||
y_hat = bagging_regressor.fit(X, y).predict(X)
|
||||
assert y.shape == y_hat.shape
|
||||
|
||||
# Verify that exceptions can be raised by wrapper regressor
|
||||
regressor = DecisionTreeRegressor()
|
||||
pipeline = make_pipeline(regressor)
|
||||
with pytest.raises(ValueError):
|
||||
pipeline.fit(X, y)
|
||||
bagging_regressor = BaggingRegressor(pipeline)
|
||||
with pytest.raises(ValueError):
|
||||
bagging_regressor.fit(X, y)
|
||||
|
||||
|
||||
def test_bagging_classifier_with_missing_inputs():
|
||||
# Check that BaggingClassifier can accept X with missing/infinite data
|
||||
X = np.array(
|
||||
[
|
||||
[1, 3, 5],
|
||||
[2, None, 6],
|
||||
[2, np.nan, 6],
|
||||
[2, np.inf, 6],
|
||||
[2, -np.inf, 6],
|
||||
]
|
||||
)
|
||||
y = np.array([3, 6, 6, 6, 6])
|
||||
classifier = DecisionTreeClassifier()
|
||||
pipeline = make_pipeline(FunctionTransformer(replace), classifier)
|
||||
pipeline.fit(X, y).predict(X)
|
||||
bagging_classifier = BaggingClassifier(pipeline)
|
||||
bagging_classifier.fit(X, y)
|
||||
y_hat = bagging_classifier.predict(X)
|
||||
assert y.shape == y_hat.shape
|
||||
bagging_classifier.predict_log_proba(X)
|
||||
bagging_classifier.predict_proba(X)
|
||||
|
||||
# Verify that exceptions can be raised by wrapper classifier
|
||||
classifier = DecisionTreeClassifier()
|
||||
pipeline = make_pipeline(classifier)
|
||||
with pytest.raises(ValueError):
|
||||
pipeline.fit(X, y)
|
||||
bagging_classifier = BaggingClassifier(pipeline)
|
||||
with pytest.raises(ValueError):
|
||||
bagging_classifier.fit(X, y)
|
||||
|
||||
|
||||
def test_bagging_small_max_features():
|
||||
# Check that Bagging estimator can accept low fractional max_features
|
||||
|
||||
X = np.array([[1, 2], [3, 4]])
|
||||
y = np.array([1, 0])
|
||||
|
||||
bagging = BaggingClassifier(LogisticRegression(), max_features=0.3, random_state=1)
|
||||
bagging.fit(X, y)
|
||||
|
||||
|
||||
def test_bagging_get_estimators_indices():
|
||||
# Check that Bagging estimator can generate sample indices properly
|
||||
# Non-regression test for:
|
||||
# https://github.com/scikit-learn/scikit-learn/issues/16436
|
||||
|
||||
rng = np.random.RandomState(0)
|
||||
X = rng.randn(13, 4)
|
||||
y = np.arange(13)
|
||||
|
||||
class MyEstimator(DecisionTreeRegressor):
|
||||
"""An estimator which stores y indices information at fit."""
|
||||
|
||||
def fit(self, X, y):
|
||||
self._sample_indices = y
|
||||
|
||||
clf = BaggingRegressor(estimator=MyEstimator(), n_estimators=1, random_state=0)
|
||||
clf.fit(X, y)
|
||||
|
||||
assert_array_equal(clf.estimators_[0]._sample_indices, clf.estimators_samples_[0])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"bagging, expected_allow_nan",
|
||||
[
|
||||
(BaggingClassifier(HistGradientBoostingClassifier(max_iter=1)), True),
|
||||
(BaggingRegressor(HistGradientBoostingRegressor(max_iter=1)), True),
|
||||
(BaggingClassifier(LogisticRegression()), False),
|
||||
(BaggingRegressor(SVR()), False),
|
||||
],
|
||||
)
|
||||
def test_bagging_allow_nan_tag(bagging, expected_allow_nan):
|
||||
"""Check that bagging inherits allow_nan tag."""
|
||||
assert bagging._get_tags()["allow_nan"] == expected_allow_nan
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
[
|
||||
BaggingClassifier(
|
||||
estimator=RandomForestClassifier(n_estimators=1), n_estimators=1
|
||||
),
|
||||
BaggingRegressor(
|
||||
estimator=RandomForestRegressor(n_estimators=1), n_estimators=1
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_bagging_with_metadata_routing(model):
|
||||
"""Make sure that metadata routing works with non-default estimator."""
|
||||
with sklearn.config_context(enable_metadata_routing=True):
|
||||
model.fit(iris.data, iris.target)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
[
|
||||
BaggingClassifier(
|
||||
estimator=AdaBoostClassifier(n_estimators=1, algorithm="SAMME"),
|
||||
n_estimators=1,
|
||||
),
|
||||
BaggingRegressor(estimator=AdaBoostRegressor(n_estimators=1), n_estimators=1),
|
||||
],
|
||||
)
|
||||
def test_bagging_without_support_metadata_routing(model):
|
||||
"""Make sure that we still can use an estimator that does not implement the
|
||||
metadata routing."""
|
||||
model.fit(iris.data, iris.target)
|
||||
@@ -0,0 +1,109 @@
"""
Testing for the base module (sklearn.ensemble.base).
"""

# Authors: Gilles Louppe
# License: BSD 3 clause

from collections import OrderedDict

import numpy as np

from sklearn.datasets import load_iris
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble._base import _set_random_states
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Perceptron
from sklearn.pipeline import Pipeline


def test_base():
    # Check BaseEnsemble methods.
    ensemble = BaggingClassifier(
        estimator=Perceptron(random_state=None), n_estimators=3
    )

    iris = load_iris()
    ensemble.fit(iris.data, iris.target)
    ensemble.estimators_ = []  # empty the list and create estimators manually

    ensemble._make_estimator()
    random_state = np.random.RandomState(3)
    ensemble._make_estimator(random_state=random_state)
    ensemble._make_estimator(random_state=random_state)
    ensemble._make_estimator(append=False)

    assert 3 == len(ensemble)
    assert 3 == len(ensemble.estimators_)

    assert isinstance(ensemble[0], Perceptron)
    assert ensemble[0].random_state is None
    assert isinstance(ensemble[1].random_state, int)
    assert isinstance(ensemble[2].random_state, int)
    assert ensemble[1].random_state != ensemble[2].random_state

    np_int_ensemble = BaggingClassifier(
        estimator=Perceptron(), n_estimators=np.int32(3)
    )
    np_int_ensemble.fit(iris.data, iris.target)


def test_set_random_states():
|
||||
# Linear Discriminant Analysis doesn't have random state: smoke test
|
||||
_set_random_states(LinearDiscriminantAnalysis(), random_state=17)
|
||||
|
||||
clf1 = Perceptron(random_state=None)
|
||||
assert clf1.random_state is None
|
||||
# check random_state is None still sets
|
||||
_set_random_states(clf1, None)
|
||||
assert isinstance(clf1.random_state, int)
|
||||
|
||||
# check random_state fixes results in consistent initialisation
|
||||
_set_random_states(clf1, 3)
|
||||
assert isinstance(clf1.random_state, int)
|
||||
clf2 = Perceptron(random_state=None)
|
||||
_set_random_states(clf2, 3)
|
||||
assert clf1.random_state == clf2.random_state
|
||||
|
||||
# nested random_state
|
||||
|
||||
def make_steps():
|
||||
return [
|
||||
("sel", SelectFromModel(Perceptron(random_state=None))),
|
||||
("clf", Perceptron(random_state=None)),
|
||||
]
|
||||
|
||||
est1 = Pipeline(make_steps())
|
||||
_set_random_states(est1, 3)
|
||||
assert isinstance(est1.steps[0][1].estimator.random_state, int)
|
||||
assert isinstance(est1.steps[1][1].random_state, int)
|
||||
assert (
|
||||
est1.get_params()["sel__estimator__random_state"]
|
||||
!= est1.get_params()["clf__random_state"]
|
||||
)
|
||||
|
||||
# ensure multiple random_state parameters are invariant to get_params()
|
||||
# iteration order
|
||||
|
||||
class AlphaParamPipeline(Pipeline):
|
||||
def get_params(self, *args, **kwargs):
|
||||
params = Pipeline.get_params(self, *args, **kwargs).items()
|
||||
return OrderedDict(sorted(params))
|
||||
|
||||
class RevParamPipeline(Pipeline):
|
||||
def get_params(self, *args, **kwargs):
|
||||
params = Pipeline.get_params(self, *args, **kwargs).items()
|
||||
return OrderedDict(sorted(params, reverse=True))
|
||||
|
||||
for cls in [AlphaParamPipeline, RevParamPipeline]:
|
||||
est2 = cls(make_steps())
|
||||
_set_random_states(est2, 3)
|
||||
assert (
|
||||
est1.get_params()["sel__estimator__random_state"]
|
||||
== est2.get_params()["sel__estimator__random_state"]
|
||||
)
|
||||
assert (
|
||||
est1.get_params()["clf__random_state"]
|
||||
== est2.get_params()["clf__random_state"]
|
||||
)
|
||||
@@ -0,0 +1,262 @@
import numpy as np
import pytest

from sklearn.base import ClassifierMixin, clone, is_classifier
from sklearn.datasets import (
    load_diabetes,
    load_iris,
    make_classification,
    make_regression,
)
from sklearn.ensemble import (
    RandomForestClassifier,
    RandomForestRegressor,
    StackingClassifier,
    StackingRegressor,
    VotingClassifier,
    VotingRegressor,
)
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC, SVR, LinearSVC, LinearSVR

X, y = load_iris(return_X_y=True)

X_r, y_r = load_diabetes(return_X_y=True)


@pytest.mark.parametrize(
|
||||
"X, y, estimator",
|
||||
[
|
||||
(
|
||||
*make_classification(n_samples=10),
|
||||
StackingClassifier(
|
||||
estimators=[
|
||||
("lr", LogisticRegression()),
|
||||
("svm", LinearSVC()),
|
||||
("rf", RandomForestClassifier(n_estimators=5, max_depth=3)),
|
||||
],
|
||||
cv=2,
|
||||
),
|
||||
),
|
||||
(
|
||||
*make_classification(n_samples=10),
|
||||
VotingClassifier(
|
||||
estimators=[
|
||||
("lr", LogisticRegression()),
|
||||
("svm", LinearSVC()),
|
||||
("rf", RandomForestClassifier(n_estimators=5, max_depth=3)),
|
||||
]
|
||||
),
|
||||
),
|
||||
(
|
||||
*make_regression(n_samples=10),
|
||||
StackingRegressor(
|
||||
estimators=[
|
||||
("lr", LinearRegression()),
|
||||
("svm", LinearSVR()),
|
||||
("rf", RandomForestRegressor(n_estimators=5, max_depth=3)),
|
||||
],
|
||||
cv=2,
|
||||
),
|
||||
),
|
||||
(
|
||||
*make_regression(n_samples=10),
|
||||
VotingRegressor(
|
||||
estimators=[
|
||||
("lr", LinearRegression()),
|
||||
("svm", LinearSVR()),
|
||||
("rf", RandomForestRegressor(n_estimators=5, max_depth=3)),
|
||||
]
|
||||
),
|
||||
),
|
||||
],
|
||||
ids=[
|
||||
"stacking-classifier",
|
||||
"voting-classifier",
|
||||
"stacking-regressor",
|
||||
"voting-regressor",
|
||||
],
|
||||
)
|
||||
def test_ensemble_heterogeneous_estimators_behavior(X, y, estimator):
|
||||
# check that the behavior of `estimators`, `estimators_`,
|
||||
# `named_estimators`, `named_estimators_` is consistent across all
|
||||
# ensemble classes and when using `set_params()`.
|
||||
|
||||
# before fit
|
||||
assert "svm" in estimator.named_estimators
|
||||
assert estimator.named_estimators.svm is estimator.estimators[1][1]
|
||||
assert estimator.named_estimators.svm is estimator.named_estimators["svm"]
|
||||
|
||||
# check fitted attributes
|
||||
estimator.fit(X, y)
|
||||
assert len(estimator.named_estimators) == 3
|
||||
assert len(estimator.named_estimators_) == 3
|
||||
assert sorted(list(estimator.named_estimators_.keys())) == sorted(
|
||||
["lr", "svm", "rf"]
|
||||
)
|
||||
|
||||
# check that set_params() does not add a new attribute
|
||||
estimator_new_params = clone(estimator)
|
||||
svm_estimator = SVC() if is_classifier(estimator) else SVR()
|
||||
estimator_new_params.set_params(svm=svm_estimator).fit(X, y)
|
||||
assert not hasattr(estimator_new_params, "svm")
|
||||
assert (
|
||||
estimator_new_params.named_estimators.lr.get_params()
|
||||
== estimator.named_estimators.lr.get_params()
|
||||
)
|
||||
assert (
|
||||
estimator_new_params.named_estimators.rf.get_params()
|
||||
== estimator.named_estimators.rf.get_params()
|
||||
)
|
||||
|
||||
# check the behavior when setting an dropping an estimator
|
||||
estimator_dropped = clone(estimator)
|
||||
estimator_dropped.set_params(svm="drop")
|
||||
estimator_dropped.fit(X, y)
|
||||
assert len(estimator_dropped.named_estimators) == 3
|
||||
assert estimator_dropped.named_estimators.svm == "drop"
|
||||
assert len(estimator_dropped.named_estimators_) == 3
|
||||
assert sorted(list(estimator_dropped.named_estimators_.keys())) == sorted(
|
||||
["lr", "svm", "rf"]
|
||||
)
|
||||
for sub_est in estimator_dropped.named_estimators_:
|
||||
# check that the correspondence is correct
|
||||
assert not isinstance(sub_est, type(estimator.named_estimators.svm))
|
||||
|
||||
# check that we can set the parameters of the underlying classifier
|
||||
estimator.set_params(svm__C=10.0)
|
||||
estimator.set_params(rf__max_depth=5)
|
||||
assert (
|
||||
estimator.get_params()["svm__C"]
|
||||
== estimator.get_params()["svm"].get_params()["C"]
|
||||
)
|
||||
assert (
|
||||
estimator.get_params()["rf__max_depth"]
|
||||
== estimator.get_params()["rf"].get_params()["max_depth"]
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"Ensemble",
|
||||
[VotingClassifier, StackingRegressor, VotingRegressor],
|
||||
)
|
||||
def test_ensemble_heterogeneous_estimators_type(Ensemble):
|
||||
# check that ensemble will fail during validation if the underlying
|
||||
# estimators are not of the same type (i.e. classifier or regressor)
|
||||
# StackingClassifier can have an underlying regresor so it's not checked
|
||||
if issubclass(Ensemble, ClassifierMixin):
|
||||
X, y = make_classification(n_samples=10)
|
||||
estimators = [("lr", LinearRegression())]
|
||||
ensemble_type = "classifier"
|
||||
else:
|
||||
X, y = make_regression(n_samples=10)
|
||||
estimators = [("lr", LogisticRegression())]
|
||||
ensemble_type = "regressor"
|
||||
ensemble = Ensemble(estimators=estimators)
|
||||
|
||||
err_msg = "should be a {}".format(ensemble_type)
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
ensemble.fit(X, y)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"X, y, Ensemble",
|
||||
[
|
||||
(*make_classification(n_samples=10), StackingClassifier),
|
||||
(*make_classification(n_samples=10), VotingClassifier),
|
||||
(*make_regression(n_samples=10), StackingRegressor),
|
||||
(*make_regression(n_samples=10), VotingRegressor),
|
||||
],
|
||||
)
|
||||
def test_ensemble_heterogeneous_estimators_name_validation(X, y, Ensemble):
|
||||
# raise an error when the name contains dunder
|
||||
if issubclass(Ensemble, ClassifierMixin):
|
||||
estimators = [("lr__", LogisticRegression())]
|
||||
else:
|
||||
estimators = [("lr__", LinearRegression())]
|
||||
ensemble = Ensemble(estimators=estimators)
|
||||
|
||||
err_msg = r"Estimator names must not contain __: got \['lr__'\]"
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
ensemble.fit(X, y)
|
||||
|
||||
# raise an error when the name is not unique
|
||||
if issubclass(Ensemble, ClassifierMixin):
|
||||
estimators = [("lr", LogisticRegression()), ("lr", LogisticRegression())]
|
||||
else:
|
||||
estimators = [("lr", LinearRegression()), ("lr", LinearRegression())]
|
||||
ensemble = Ensemble(estimators=estimators)
|
||||
|
||||
err_msg = r"Names provided are not unique: \['lr', 'lr'\]"
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
ensemble.fit(X, y)
|
||||
|
||||
# raise an error when the name conflicts with the parameters
|
||||
if issubclass(Ensemble, ClassifierMixin):
|
||||
estimators = [("estimators", LogisticRegression())]
|
||||
else:
|
||||
estimators = [("estimators", LinearRegression())]
|
||||
ensemble = Ensemble(estimators=estimators)
|
||||
|
||||
err_msg = "Estimator names conflict with constructor arguments"
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
ensemble.fit(X, y)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"X, y, estimator",
|
||||
[
|
||||
(
|
||||
*make_classification(n_samples=10),
|
||||
StackingClassifier(estimators=[("lr", LogisticRegression())]),
|
||||
),
|
||||
(
|
||||
*make_classification(n_samples=10),
|
||||
VotingClassifier(estimators=[("lr", LogisticRegression())]),
|
||||
),
|
||||
(
|
||||
*make_regression(n_samples=10),
|
||||
StackingRegressor(estimators=[("lr", LinearRegression())]),
|
||||
),
|
||||
(
|
||||
*make_regression(n_samples=10),
|
||||
VotingRegressor(estimators=[("lr", LinearRegression())]),
|
||||
),
|
||||
],
|
||||
ids=[
|
||||
"stacking-classifier",
|
||||
"voting-classifier",
|
||||
"stacking-regressor",
|
||||
"voting-regressor",
|
||||
],
|
||||
)
|
||||
def test_ensemble_heterogeneous_estimators_all_dropped(X, y, estimator):
|
||||
# check that we raise a consistent error when all estimators are
|
||||
# dropped
|
||||
estimator.set_params(lr="drop")
|
||||
with pytest.raises(ValueError, match="All estimators are dropped."):
|
||||
estimator.fit(X, y)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"Ensemble, Estimator, X, y",
|
||||
[
|
||||
(StackingClassifier, LogisticRegression, X, y),
|
||||
(StackingRegressor, LinearRegression, X_r, y_r),
|
||||
(VotingClassifier, LogisticRegression, X, y),
|
||||
(VotingRegressor, LinearRegression, X_r, y_r),
|
||||
],
|
||||
)
|
||||
# FIXME: we should move this test in `estimator_checks` once we are able
|
||||
# to construct meta-estimator instances
|
||||
def test_heterogeneous_ensemble_support_missing_values(Ensemble, Estimator, X, y):
|
||||
# check that Voting and Stacking predictor delegate the missing values
|
||||
# validation to the underlying estimator.
|
||||
X = X.copy()
|
||||
mask = np.random.choice([1, 0], X.shape, p=[0.1, 0.9]).astype(bool)
|
||||
X[mask] = np.nan
|
||||
pipe = make_pipeline(SimpleImputer(), Estimator())
|
||||
ensemble = Ensemble(estimators=[("pipe1", pipe), ("pipe2", pipe)])
|
||||
ensemble.fit(X, y).score(X, y)
|
||||
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -0,0 +1,363 @@
"""
Testing for Isolation Forest algorithm (sklearn.ensemble.iforest).
"""

# Authors: Nicolas Goix <nicolas.goix@telecom-paristech.fr>
#          Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
# License: BSD 3 clause

import warnings
from unittest.mock import Mock, patch

import numpy as np
import pytest

from sklearn.datasets import load_diabetes, load_iris, make_classification
from sklearn.ensemble import IsolationForest
from sklearn.ensemble._iforest import _average_path_length
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import ParameterGrid, train_test_split
from sklearn.utils import check_random_state
from sklearn.utils._testing import (
    assert_allclose,
    assert_array_almost_equal,
    assert_array_equal,
    ignore_warnings,
)
from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS

# load iris & diabetes dataset
iris = load_iris()
diabetes = load_diabetes()


def test_iforest(global_random_seed):
    """Check Isolation Forest for various parameter settings."""
    X_train = np.array([[0, 1], [1, 2]])
    X_test = np.array([[2, 1], [1, 1]])

    grid = ParameterGrid(
        {"n_estimators": [3], "max_samples": [0.5, 1.0, 3], "bootstrap": [True, False]}
    )

    with ignore_warnings():
        for params in grid:
            IsolationForest(random_state=global_random_seed, **params).fit(
                X_train
            ).predict(X_test)


@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS)
|
||||
def test_iforest_sparse(global_random_seed, sparse_container):
|
||||
"""Check IForest for various parameter settings on sparse input."""
|
||||
rng = check_random_state(global_random_seed)
|
||||
X_train, X_test = train_test_split(diabetes.data[:50], random_state=rng)
|
||||
grid = ParameterGrid({"max_samples": [0.5, 1.0], "bootstrap": [True, False]})
|
||||
|
||||
X_train_sparse = sparse_container(X_train)
|
||||
X_test_sparse = sparse_container(X_test)
|
||||
|
||||
for params in grid:
|
||||
# Trained on sparse format
|
||||
sparse_classifier = IsolationForest(
|
||||
n_estimators=10, random_state=global_random_seed, **params
|
||||
).fit(X_train_sparse)
|
||||
sparse_results = sparse_classifier.predict(X_test_sparse)
|
||||
|
||||
# Trained on dense format
|
||||
dense_classifier = IsolationForest(
|
||||
n_estimators=10, random_state=global_random_seed, **params
|
||||
).fit(X_train)
|
||||
dense_results = dense_classifier.predict(X_test)
|
||||
|
||||
assert_array_equal(sparse_results, dense_results)
|
||||
|
||||
|
||||
def test_iforest_error():
|
||||
"""Test that it gives proper exception on deficient input."""
|
||||
X = iris.data
|
||||
|
||||
# The dataset has less than 256 samples, explicitly setting
|
||||
# max_samples > n_samples should result in a warning. If not set
|
||||
# explicitly there should be no warning
|
||||
warn_msg = "max_samples will be set to n_samples for estimation"
|
||||
with pytest.warns(UserWarning, match=warn_msg):
|
||||
IsolationForest(max_samples=1000).fit(X)
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("error", UserWarning)
|
||||
IsolationForest(max_samples="auto").fit(X)
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("error", UserWarning)
|
||||
IsolationForest(max_samples=np.int64(2)).fit(X)
|
||||
|
||||
# test X_test n_features match X_train one:
|
||||
with pytest.raises(ValueError):
|
||||
IsolationForest().fit(X).predict(X[:, 1:])
|
||||
|
||||
|
||||
def test_recalculate_max_depth():
|
||||
"""Check max_depth recalculation when max_samples is reset to n_samples"""
|
||||
X = iris.data
|
||||
clf = IsolationForest().fit(X)
|
||||
for est in clf.estimators_:
|
||||
assert est.max_depth == int(np.ceil(np.log2(X.shape[0])))
|
||||
|
||||
|
||||
def test_max_samples_attribute():
|
||||
X = iris.data
|
||||
clf = IsolationForest().fit(X)
|
||||
assert clf.max_samples_ == X.shape[0]
|
||||
|
||||
clf = IsolationForest(max_samples=500)
|
||||
warn_msg = "max_samples will be set to n_samples for estimation"
|
||||
with pytest.warns(UserWarning, match=warn_msg):
|
||||
clf.fit(X)
|
||||
assert clf.max_samples_ == X.shape[0]
|
||||
|
||||
clf = IsolationForest(max_samples=0.4).fit(X)
|
||||
assert clf.max_samples_ == 0.4 * X.shape[0]
|
||||
|
||||
|
||||
def test_iforest_parallel_regression(global_random_seed):
|
||||
"""Check parallel regression."""
|
||||
rng = check_random_state(global_random_seed)
|
||||
|
||||
X_train, X_test = train_test_split(diabetes.data, random_state=rng)
|
||||
|
||||
ensemble = IsolationForest(n_jobs=3, random_state=global_random_seed).fit(X_train)
|
||||
|
||||
ensemble.set_params(n_jobs=1)
|
||||
y1 = ensemble.predict(X_test)
|
||||
ensemble.set_params(n_jobs=2)
|
||||
y2 = ensemble.predict(X_test)
|
||||
assert_array_almost_equal(y1, y2)
|
||||
|
||||
ensemble = IsolationForest(n_jobs=1, random_state=global_random_seed).fit(X_train)
|
||||
|
||||
y3 = ensemble.predict(X_test)
|
||||
assert_array_almost_equal(y1, y3)
|
||||
|
||||
|
||||
def test_iforest_performance(global_random_seed):
|
||||
"""Test Isolation Forest performs well"""
|
||||
|
||||
# Generate train/test data
|
||||
rng = check_random_state(global_random_seed)
|
||||
X = 0.3 * rng.randn(600, 2)
|
||||
X = rng.permutation(np.vstack((X + 2, X - 2)))
|
||||
X_train = X[:1000]
|
||||
|
||||
# Generate some abnormal novel observations
|
||||
X_outliers = rng.uniform(low=-1, high=1, size=(200, 2))
|
||||
X_test = np.vstack((X[1000:], X_outliers))
|
||||
y_test = np.array([0] * 200 + [1] * 200)
|
||||
|
||||
# fit the model
|
||||
clf = IsolationForest(max_samples=100, random_state=rng).fit(X_train)
|
||||
|
||||
# predict scores (the lower, the more normal)
|
||||
y_pred = -clf.decision_function(X_test)
|
||||
|
||||
# check that there is at most 6 errors (false positive or false negative)
|
||||
assert roc_auc_score(y_test, y_pred) > 0.98
|
||||
|
||||
|
||||
@pytest.mark.parametrize("contamination", [0.25, "auto"])
|
||||
def test_iforest_works(contamination, global_random_seed):
|
||||
# toy sample (the last two samples are outliers)
|
||||
X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [7, 4], [-5, 9]]
|
||||
|
||||
# Test IsolationForest
|
||||
clf = IsolationForest(random_state=global_random_seed, contamination=contamination)
|
||||
clf.fit(X)
|
||||
decision_func = -clf.decision_function(X)
|
||||
pred = clf.predict(X)
|
||||
# assert detect outliers:
|
||||
assert np.min(decision_func[-2:]) > np.max(decision_func[:-2])
|
||||
assert_array_equal(pred, 6 * [1] + 2 * [-1])
|
||||
|
||||
|
||||
def test_max_samples_consistency():
|
||||
# Make sure validated max_samples in iforest and BaseBagging are identical
|
||||
X = iris.data
|
||||
clf = IsolationForest().fit(X)
|
||||
assert clf.max_samples_ == clf._max_samples
|
||||
|
||||
|
||||
def test_iforest_subsampled_features():
|
||||
# It tests non-regression for #5732 which failed at predict.
|
||||
rng = check_random_state(0)
|
||||
X_train, X_test, y_train, y_test = train_test_split(
|
||||
diabetes.data[:50], diabetes.target[:50], random_state=rng
|
||||
)
|
||||
clf = IsolationForest(max_features=0.8)
|
||||
clf.fit(X_train, y_train)
|
||||
clf.predict(X_test)
|
||||
|
||||
|
||||
def test_iforest_average_path_length():
|
||||
# It tests non-regression for #8549 which used the wrong formula
|
||||
# for average path length, strictly for the integer case
|
||||
# Updated to check average path length when input is <= 2 (issue #11839)
|
||||
result_one = 2.0 * (np.log(4.0) + np.euler_gamma) - 2.0 * 4.0 / 5.0
|
||||
result_two = 2.0 * (np.log(998.0) + np.euler_gamma) - 2.0 * 998.0 / 999.0
|
||||
assert_allclose(_average_path_length([0]), [0.0])
|
||||
assert_allclose(_average_path_length([1]), [0.0])
|
||||
assert_allclose(_average_path_length([2]), [1.0])
|
||||
assert_allclose(_average_path_length([5]), [result_one])
|
||||
assert_allclose(_average_path_length([999]), [result_two])
|
||||
assert_allclose(
|
||||
_average_path_length(np.array([1, 2, 5, 999])),
|
||||
[0.0, 1.0, result_one, result_two],
|
||||
)
|
||||
# _average_path_length is increasing
|
||||
avg_path_length = _average_path_length(np.arange(5))
|
||||
assert_array_equal(avg_path_length, np.sort(avg_path_length))
|
||||
|
||||
|
||||
def test_score_samples():
|
||||
X_train = [[1, 1], [1, 2], [2, 1]]
|
||||
clf1 = IsolationForest(contamination=0.1).fit(X_train)
|
||||
clf2 = IsolationForest().fit(X_train)
|
||||
assert_array_equal(
|
||||
clf1.score_samples([[2.0, 2.0]]),
|
||||
clf1.decision_function([[2.0, 2.0]]) + clf1.offset_,
|
||||
)
|
||||
assert_array_equal(
|
||||
clf2.score_samples([[2.0, 2.0]]),
|
||||
clf2.decision_function([[2.0, 2.0]]) + clf2.offset_,
|
||||
)
|
||||
assert_array_equal(
|
||||
clf1.score_samples([[2.0, 2.0]]), clf2.score_samples([[2.0, 2.0]])
|
||||
)
|
||||
|
||||
|
||||
def test_iforest_warm_start():
|
||||
"""Test iterative addition of iTrees to an iForest"""
|
||||
|
||||
rng = check_random_state(0)
|
||||
X = rng.randn(20, 2)
|
||||
|
||||
# fit first 10 trees
|
||||
clf = IsolationForest(
|
||||
n_estimators=10, max_samples=20, random_state=rng, warm_start=True
|
||||
)
|
||||
clf.fit(X)
|
||||
# remember the 1st tree
|
||||
tree_1 = clf.estimators_[0]
|
||||
# fit another 10 trees
|
||||
clf.set_params(n_estimators=20)
|
||||
clf.fit(X)
|
||||
# expecting 20 fitted trees and no overwritten trees
|
||||
assert len(clf.estimators_) == 20
|
||||
assert clf.estimators_[0] is tree_1
|
||||
|
||||
|
||||
# mock get_chunk_n_rows to actually test more than one chunk (here one
|
||||
# chunk has 3 rows):
|
||||
@patch(
|
||||
"sklearn.ensemble._iforest.get_chunk_n_rows",
|
||||
side_effect=Mock(**{"return_value": 3}),
|
||||
)
|
||||
@pytest.mark.parametrize("contamination, n_predict_calls", [(0.25, 3), ("auto", 2)])
|
||||
def test_iforest_chunks_works1(
|
||||
mocked_get_chunk, contamination, n_predict_calls, global_random_seed
|
||||
):
|
||||
test_iforest_works(contamination, global_random_seed)
|
||||
assert mocked_get_chunk.call_count == n_predict_calls
|
||||
|
||||
|
||||
# idem with chunk_size = 10 rows
|
||||
@patch(
|
||||
"sklearn.ensemble._iforest.get_chunk_n_rows",
|
||||
side_effect=Mock(**{"return_value": 10}),
|
||||
)
|
||||
@pytest.mark.parametrize("contamination, n_predict_calls", [(0.25, 3), ("auto", 2)])
|
||||
def test_iforest_chunks_works2(
|
||||
mocked_get_chunk, contamination, n_predict_calls, global_random_seed
|
||||
):
|
||||
test_iforest_works(contamination, global_random_seed)
|
||||
assert mocked_get_chunk.call_count == n_predict_calls
|
||||
|
||||
|
||||
def test_iforest_with_uniform_data():
|
||||
"""Test whether iforest predicts inliers when using uniform data"""
|
||||
|
||||
# 2-d array of all 1s
|
||||
X = np.ones((100, 10))
|
||||
iforest = IsolationForest()
|
||||
iforest.fit(X)
|
||||
|
||||
rng = np.random.RandomState(0)
|
||||
|
||||
assert all(iforest.predict(X) == 1)
|
||||
assert all(iforest.predict(rng.randn(100, 10)) == 1)
|
||||
assert all(iforest.predict(X + 1) == 1)
|
||||
assert all(iforest.predict(X - 1) == 1)
|
||||
|
||||
# 2-d array where columns contain the same value across rows
|
||||
X = np.repeat(rng.randn(1, 10), 100, 0)
|
||||
iforest = IsolationForest()
|
||||
iforest.fit(X)
|
||||
|
||||
assert all(iforest.predict(X) == 1)
|
||||
assert all(iforest.predict(rng.randn(100, 10)) == 1)
|
||||
assert all(iforest.predict(np.ones((100, 10))) == 1)
|
||||
|
||||
# Single row
|
||||
X = rng.randn(1, 10)
|
||||
iforest = IsolationForest()
|
||||
iforest.fit(X)
|
||||
|
||||
assert all(iforest.predict(X) == 1)
|
||||
assert all(iforest.predict(rng.randn(100, 10)) == 1)
|
||||
assert all(iforest.predict(np.ones((100, 10))) == 1)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
|
||||
def test_iforest_with_n_jobs_does_not_segfault(csc_container):
|
||||
"""Check that Isolation Forest does not segfault with n_jobs=2
|
||||
|
||||
Non-regression test for #23252
|
||||
"""
|
||||
X, _ = make_classification(n_samples=85_000, n_features=100, random_state=0)
|
||||
X = csc_container(X)
|
||||
IsolationForest(n_estimators=10, max_samples=256, n_jobs=2).fit(X)
|
||||
|
||||
|
||||
def test_iforest_preserve_feature_names():
|
||||
"""Check that feature names are preserved when contamination is not "auto".
|
||||
|
||||
Feature names are required for consistency checks during scoring.
|
||||
|
||||
Non-regression test for Issue #25844
|
||||
"""
|
||||
pd = pytest.importorskip("pandas")
|
||||
rng = np.random.RandomState(0)
|
||||
|
||||
X = pd.DataFrame(data=rng.randn(4), columns=["a"])
|
||||
model = IsolationForest(random_state=0, contamination=0.05)
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("error", UserWarning)
|
||||
model.fit(X)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS)
|
||||
def test_iforest_sparse_input_float_contamination(sparse_container):
|
||||
"""Check that `IsolationForest` accepts sparse matrix input and float value for
|
||||
contamination.
|
||||
|
||||
Non-regression test for:
|
||||
https://github.com/scikit-learn/scikit-learn/issues/27626
|
||||
"""
|
||||
X, _ = make_classification(n_samples=50, n_features=4, random_state=0)
|
||||
X = sparse_container(X)
|
||||
X.sort_indices()
|
||||
contamination = 0.1
|
||||
iforest = IsolationForest(
|
||||
n_estimators=5, contamination=contamination, random_state=0
|
||||
).fit(X)
|
||||
|
||||
X_decision = iforest.decision_function(X)
|
||||
assert (X_decision < 0).sum() / X.shape[0] == pytest.approx(contamination)
|
||||
@ -0,0 +1,890 @@
|
||||
"""Test the stacking classifier and regressor."""
|
||||
|
||||
# Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
|
||||
# License: BSD 3 clause
|
||||
|
||||
from unittest.mock import Mock
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from numpy.testing import assert_array_equal
|
||||
from scipy import sparse
|
||||
|
||||
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin, clone
|
||||
from sklearn.datasets import (
|
||||
load_breast_cancer,
|
||||
load_diabetes,
|
||||
load_iris,
|
||||
make_classification,
|
||||
make_multilabel_classification,
|
||||
make_regression,
|
||||
)
|
||||
from sklearn.dummy import DummyClassifier, DummyRegressor
|
||||
from sklearn.ensemble import (
|
||||
RandomForestClassifier,
|
||||
RandomForestRegressor,
|
||||
StackingClassifier,
|
||||
StackingRegressor,
|
||||
)
|
||||
from sklearn.exceptions import ConvergenceWarning, NotFittedError
|
||||
from sklearn.linear_model import (
|
||||
LinearRegression,
|
||||
LogisticRegression,
|
||||
Ridge,
|
||||
RidgeClassifier,
|
||||
)
|
||||
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
|
||||
from sklearn.neighbors import KNeighborsClassifier
|
||||
from sklearn.neural_network import MLPClassifier
|
||||
from sklearn.preprocessing import scale
|
||||
from sklearn.svm import SVC, LinearSVC, LinearSVR
|
||||
from sklearn.utils._mocking import CheckingClassifier
|
||||
from sklearn.utils._testing import (
|
||||
assert_allclose,
|
||||
assert_allclose_dense_sparse,
|
||||
ignore_warnings,
|
||||
)
|
||||
from sklearn.utils.fixes import COO_CONTAINERS, CSC_CONTAINERS, CSR_CONTAINERS
|
||||
|
||||
diabetes = load_diabetes()
|
||||
X_diabetes, y_diabetes = diabetes.data, diabetes.target
|
||||
iris = load_iris()
|
||||
X_iris, y_iris = iris.data, iris.target
|
||||
X_multilabel, y_multilabel = make_multilabel_classification(
|
||||
n_classes=3, random_state=42
|
||||
)
|
||||
X_binary, y_binary = make_classification(n_classes=2, random_state=42)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"cv", [3, StratifiedKFold(n_splits=3, shuffle=True, random_state=42)]
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"final_estimator", [None, RandomForestClassifier(random_state=42)]
|
||||
)
|
||||
@pytest.mark.parametrize("passthrough", [False, True])
|
||||
def test_stacking_classifier_iris(cv, final_estimator, passthrough):
|
||||
# prescale the data to avoid convergence warning without using a pipeline
|
||||
# for later assert
|
||||
X_train, X_test, y_train, y_test = train_test_split(
|
||||
scale(X_iris), y_iris, stratify=y_iris, random_state=42
|
||||
)
|
||||
estimators = [("lr", LogisticRegression()), ("svc", LinearSVC())]
|
||||
clf = StackingClassifier(
|
||||
estimators=estimators,
|
||||
final_estimator=final_estimator,
|
||||
cv=cv,
|
||||
passthrough=passthrough,
|
||||
)
|
||||
clf.fit(X_train, y_train)
|
||||
clf.predict(X_test)
|
||||
clf.predict_proba(X_test)
|
||||
assert clf.score(X_test, y_test) > 0.8
|
||||
|
||||
X_trans = clf.transform(X_test)
|
||||
expected_column_count = 10 if passthrough else 6
|
||||
assert X_trans.shape[1] == expected_column_count
|
||||
if passthrough:
|
||||
assert_allclose(X_test, X_trans[:, -4:])
|
||||
|
||||
clf.set_params(lr="drop")
|
||||
clf.fit(X_train, y_train)
|
||||
clf.predict(X_test)
|
||||
clf.predict_proba(X_test)
|
||||
if final_estimator is None:
|
||||
# LogisticRegression has decision_function method
|
||||
clf.decision_function(X_test)
|
||||
|
||||
X_trans = clf.transform(X_test)
|
||||
expected_column_count_drop = 7 if passthrough else 3
|
||||
assert X_trans.shape[1] == expected_column_count_drop
|
||||
if passthrough:
|
||||
assert_allclose(X_test, X_trans[:, -4:])
|
||||
|
||||
|
||||
def test_stacking_classifier_drop_column_binary_classification():
|
||||
# check that a column is dropped in binary classification
|
||||
X, y = load_breast_cancer(return_X_y=True)
|
||||
X_train, X_test, y_train, _ = train_test_split(
|
||||
scale(X), y, stratify=y, random_state=42
|
||||
)
|
||||
|
||||
# both classifiers implement 'predict_proba' and will both drop one column
|
||||
estimators = [
|
||||
("lr", LogisticRegression()),
|
||||
("rf", RandomForestClassifier(random_state=42)),
|
||||
]
|
||||
clf = StackingClassifier(estimators=estimators, cv=3)
|
||||
|
||||
clf.fit(X_train, y_train)
|
||||
X_trans = clf.transform(X_test)
|
||||
assert X_trans.shape[1] == 2
|
||||
|
||||
# LinearSVC does not implement 'predict_proba' and will not drop one column
|
||||
estimators = [("lr", LogisticRegression()), ("svc", LinearSVC())]
|
||||
clf.set_params(estimators=estimators)
|
||||
|
||||
clf.fit(X_train, y_train)
|
||||
X_trans = clf.transform(X_test)
|
||||
assert X_trans.shape[1] == 2
|
||||
|
||||
|
||||
def test_stacking_classifier_drop_estimator():
|
||||
# prescale the data to avoid convergence warning without using a pipeline
|
||||
# for later assert
|
||||
X_train, X_test, y_train, _ = train_test_split(
|
||||
scale(X_iris), y_iris, stratify=y_iris, random_state=42
|
||||
)
|
||||
estimators = [("lr", "drop"), ("svc", LinearSVC(random_state=0))]
|
||||
rf = RandomForestClassifier(n_estimators=10, random_state=42)
|
||||
clf = StackingClassifier(
|
||||
estimators=[("svc", LinearSVC(random_state=0))],
|
||||
final_estimator=rf,
|
||||
cv=5,
|
||||
)
|
||||
clf_drop = StackingClassifier(estimators=estimators, final_estimator=rf, cv=5)
|
||||
|
||||
clf.fit(X_train, y_train)
|
||||
clf_drop.fit(X_train, y_train)
|
||||
assert_allclose(clf.predict(X_test), clf_drop.predict(X_test))
|
||||
assert_allclose(clf.predict_proba(X_test), clf_drop.predict_proba(X_test))
|
||||
assert_allclose(clf.transform(X_test), clf_drop.transform(X_test))
|
||||
|
||||
|
||||
def test_stacking_regressor_drop_estimator():
|
||||
# prescale the data to avoid convergence warning without using a pipeline
|
||||
# for later assert
|
||||
X_train, X_test, y_train, _ = train_test_split(
|
||||
scale(X_diabetes), y_diabetes, random_state=42
|
||||
)
|
||||
estimators = [("lr", "drop"), ("svr", LinearSVR(random_state=0))]
|
||||
rf = RandomForestRegressor(n_estimators=10, random_state=42)
|
||||
reg = StackingRegressor(
|
||||
estimators=[("svr", LinearSVR(random_state=0))],
|
||||
final_estimator=rf,
|
||||
cv=5,
|
||||
)
|
||||
reg_drop = StackingRegressor(estimators=estimators, final_estimator=rf, cv=5)
|
||||
|
||||
reg.fit(X_train, y_train)
|
||||
reg_drop.fit(X_train, y_train)
|
||||
assert_allclose(reg.predict(X_test), reg_drop.predict(X_test))
|
||||
assert_allclose(reg.transform(X_test), reg_drop.transform(X_test))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("cv", [3, KFold(n_splits=3, shuffle=True, random_state=42)])
|
||||
@pytest.mark.parametrize(
|
||||
"final_estimator, predict_params",
|
||||
[
|
||||
(None, {}),
|
||||
(RandomForestRegressor(random_state=42), {}),
|
||||
(DummyRegressor(), {"return_std": True}),
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("passthrough", [False, True])
|
||||
def test_stacking_regressor_diabetes(cv, final_estimator, predict_params, passthrough):
|
||||
# prescale the data to avoid convergence warning without using a pipeline
|
||||
# for later assert
|
||||
X_train, X_test, y_train, _ = train_test_split(
|
||||
scale(X_diabetes), y_diabetes, random_state=42
|
||||
)
|
||||
estimators = [("lr", LinearRegression()), ("svr", LinearSVR())]
|
||||
reg = StackingRegressor(
|
||||
estimators=estimators,
|
||||
final_estimator=final_estimator,
|
||||
cv=cv,
|
||||
passthrough=passthrough,
|
||||
)
|
||||
reg.fit(X_train, y_train)
|
||||
result = reg.predict(X_test, **predict_params)
|
||||
expected_result_length = 2 if predict_params else 1
|
||||
if predict_params:
|
||||
assert len(result) == expected_result_length
|
||||
|
||||
X_trans = reg.transform(X_test)
|
||||
expected_column_count = 12 if passthrough else 2
|
||||
assert X_trans.shape[1] == expected_column_count
|
||||
if passthrough:
|
||||
assert_allclose(X_test, X_trans[:, -10:])
|
||||
|
||||
reg.set_params(lr="drop")
|
||||
reg.fit(X_train, y_train)
|
||||
reg.predict(X_test)
|
||||
|
||||
X_trans = reg.transform(X_test)
|
||||
expected_column_count_drop = 11 if passthrough else 1
|
||||
assert X_trans.shape[1] == expected_column_count_drop
|
||||
if passthrough:
|
||||
assert_allclose(X_test, X_trans[:, -10:])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"sparse_container", COO_CONTAINERS + CSC_CONTAINERS + CSR_CONTAINERS
|
||||
)
|
||||
def test_stacking_regressor_sparse_passthrough(sparse_container):
|
||||
# Check passthrough behavior on a sparse X matrix
|
||||
X_train, X_test, y_train, _ = train_test_split(
|
||||
sparse_container(scale(X_diabetes)), y_diabetes, random_state=42
|
||||
)
|
||||
estimators = [("lr", LinearRegression()), ("svr", LinearSVR())]
|
||||
rf = RandomForestRegressor(n_estimators=10, random_state=42)
|
||||
clf = StackingRegressor(
|
||||
estimators=estimators, final_estimator=rf, cv=5, passthrough=True
|
||||
)
|
||||
clf.fit(X_train, y_train)
|
||||
X_trans = clf.transform(X_test)
|
||||
assert_allclose_dense_sparse(X_test, X_trans[:, -10:])
|
||||
assert sparse.issparse(X_trans)
|
||||
assert X_test.format == X_trans.format
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"sparse_container", COO_CONTAINERS + CSC_CONTAINERS + CSR_CONTAINERS
|
||||
)
|
||||
def test_stacking_classifier_sparse_passthrough(sparse_container):
|
||||
# Check passthrough behavior on a sparse X matrix
|
||||
X_train, X_test, y_train, _ = train_test_split(
|
||||
sparse_container(scale(X_iris)), y_iris, random_state=42
|
||||
)
|
||||
estimators = [("lr", LogisticRegression()), ("svc", LinearSVC())]
|
||||
rf = RandomForestClassifier(n_estimators=10, random_state=42)
|
||||
clf = StackingClassifier(
|
||||
estimators=estimators, final_estimator=rf, cv=5, passthrough=True
|
||||
)
|
||||
clf.fit(X_train, y_train)
|
||||
X_trans = clf.transform(X_test)
|
||||
assert_allclose_dense_sparse(X_test, X_trans[:, -4:])
|
||||
assert sparse.issparse(X_trans)
|
||||
assert X_test.format == X_trans.format
|
||||
|
||||
|
||||
def test_stacking_classifier_drop_binary_prob():
|
||||
# check that classifier will drop one of the probability column for
|
||||
# binary classification problem
|
||||
|
||||
# Select only the 2 first classes
|
||||
X_, y_ = scale(X_iris[:100]), y_iris[:100]
|
||||
|
||||
estimators = [("lr", LogisticRegression()), ("rf", RandomForestClassifier())]
|
||||
clf = StackingClassifier(estimators=estimators)
|
||||
clf.fit(X_, y_)
|
||||
X_meta = clf.transform(X_)
|
||||
assert X_meta.shape[1] == 2
|
||||
|
||||
|
||||
class NoWeightRegressor(RegressorMixin, BaseEstimator):
|
||||
def fit(self, X, y):
|
||||
self.reg = DummyRegressor()
|
||||
return self.reg.fit(X, y)
|
||||
|
||||
def predict(self, X):
|
||||
return np.ones(X.shape[0])
|
||||
|
||||
|
||||
class NoWeightClassifier(ClassifierMixin, BaseEstimator):
|
||||
def fit(self, X, y):
|
||||
self.clf = DummyClassifier(strategy="stratified")
|
||||
return self.clf.fit(X, y)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"y, params, type_err, msg_err",
|
||||
[
|
||||
(y_iris, {"estimators": []}, ValueError, "Invalid 'estimators' attribute,"),
|
||||
(
|
||||
y_iris,
|
||||
{
|
||||
"estimators": [
|
||||
("lr", LogisticRegression()),
|
||||
("svm", SVC(max_iter=50_000)),
|
||||
],
|
||||
"stack_method": "predict_proba",
|
||||
},
|
||||
ValueError,
|
||||
"does not implement the method predict_proba",
|
||||
),
|
||||
(
|
||||
y_iris,
|
||||
{
|
||||
"estimators": [
|
||||
("lr", LogisticRegression()),
|
||||
("cor", NoWeightClassifier()),
|
||||
]
|
||||
},
|
||||
TypeError,
|
||||
"does not support sample weight",
|
||||
),
|
||||
(
|
||||
y_iris,
|
||||
{
|
||||
"estimators": [
|
||||
("lr", LogisticRegression()),
|
||||
("cor", LinearSVC(max_iter=50_000)),
|
||||
],
|
||||
"final_estimator": NoWeightClassifier(),
|
||||
},
|
||||
TypeError,
|
||||
"does not support sample weight",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_stacking_classifier_error(y, params, type_err, msg_err):
|
||||
with pytest.raises(type_err, match=msg_err):
|
||||
clf = StackingClassifier(**params, cv=3)
|
||||
clf.fit(scale(X_iris), y, sample_weight=np.ones(X_iris.shape[0]))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"y, params, type_err, msg_err",
|
||||
[
|
||||
(y_diabetes, {"estimators": []}, ValueError, "Invalid 'estimators' attribute,"),
|
||||
(
|
||||
y_diabetes,
|
||||
{"estimators": [("lr", LinearRegression()), ("cor", NoWeightRegressor())]},
|
||||
TypeError,
|
||||
"does not support sample weight",
|
||||
),
|
||||
(
|
||||
y_diabetes,
|
||||
{
|
||||
"estimators": [
|
||||
("lr", LinearRegression()),
|
||||
("cor", LinearSVR()),
|
||||
],
|
||||
"final_estimator": NoWeightRegressor(),
|
||||
},
|
||||
TypeError,
|
||||
"does not support sample weight",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_stacking_regressor_error(y, params, type_err, msg_err):
|
||||
with pytest.raises(type_err, match=msg_err):
|
||||
reg = StackingRegressor(**params, cv=3)
|
||||
reg.fit(scale(X_diabetes), y, sample_weight=np.ones(X_diabetes.shape[0]))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"estimator, X, y",
|
||||
[
|
||||
(
|
||||
StackingClassifier(
|
||||
estimators=[
|
||||
("lr", LogisticRegression(random_state=0)),
|
||||
("svm", LinearSVC(random_state=0)),
|
||||
]
|
||||
),
|
||||
X_iris[:100],
|
||||
y_iris[:100],
|
||||
), # keep only classes 0 and 1
|
||||
(
|
||||
StackingRegressor(
|
||||
estimators=[
|
||||
("lr", LinearRegression()),
|
||||
("svm", LinearSVR(random_state=0)),
|
||||
]
|
||||
),
|
||||
X_diabetes,
|
||||
y_diabetes,
|
||||
),
|
||||
],
|
||||
ids=["StackingClassifier", "StackingRegressor"],
|
||||
)
|
||||
def test_stacking_randomness(estimator, X, y):
|
||||
# checking that fixing the random state of the CV will lead to the same
|
||||
# results
|
||||
estimator_full = clone(estimator)
|
||||
estimator_full.set_params(
|
||||
cv=KFold(shuffle=True, random_state=np.random.RandomState(0))
|
||||
)
|
||||
|
||||
estimator_drop = clone(estimator)
|
||||
estimator_drop.set_params(lr="drop")
|
||||
estimator_drop.set_params(
|
||||
cv=KFold(shuffle=True, random_state=np.random.RandomState(0))
|
||||
)
|
||||
|
||||
assert_allclose(
|
||||
estimator_full.fit(X, y).transform(X)[:, 1:],
|
||||
estimator_drop.fit(X, y).transform(X),
|
||||
)
|
||||
|
||||
|
||||
def test_stacking_classifier_stratify_default():
|
||||
# check that we stratify the classes for the default CV
|
||||
clf = StackingClassifier(
|
||||
estimators=[
|
||||
("lr", LogisticRegression(max_iter=10_000)),
|
||||
("svm", LinearSVC(max_iter=10_000)),
|
||||
]
|
||||
)
|
||||
# since iris is not shuffled, a simple k-fold would not contain the
|
||||
# 3 classes during training
|
||||
clf.fit(X_iris, y_iris)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"stacker, X, y",
|
||||
[
|
||||
(
|
||||
StackingClassifier(
|
||||
estimators=[
|
||||
("lr", LogisticRegression()),
|
||||
("svm", LinearSVC(random_state=42)),
|
||||
],
|
||||
final_estimator=LogisticRegression(),
|
||||
cv=KFold(shuffle=True, random_state=42),
|
||||
),
|
||||
*load_breast_cancer(return_X_y=True),
|
||||
),
|
||||
(
|
||||
StackingRegressor(
|
||||
estimators=[
|
||||
("lr", LinearRegression()),
|
||||
("svm", LinearSVR(random_state=42)),
|
||||
],
|
||||
final_estimator=LinearRegression(),
|
||||
cv=KFold(shuffle=True, random_state=42),
|
||||
),
|
||||
X_diabetes,
|
||||
y_diabetes,
|
||||
),
|
||||
],
|
||||
ids=["StackingClassifier", "StackingRegressor"],
|
||||
)
|
||||
def test_stacking_with_sample_weight(stacker, X, y):
|
||||
# check that sample weights has an influence on the fitting
|
||||
# note: ConvergenceWarning are catch since we are not worrying about the
|
||||
# convergence here
|
||||
n_half_samples = len(y) // 2
|
||||
total_sample_weight = np.array(
|
||||
[0.1] * n_half_samples + [0.9] * (len(y) - n_half_samples)
|
||||
)
|
||||
X_train, X_test, y_train, _, sample_weight_train, _ = train_test_split(
|
||||
X, y, total_sample_weight, random_state=42
|
||||
)
|
||||
|
||||
with ignore_warnings(category=ConvergenceWarning):
|
||||
stacker.fit(X_train, y_train)
|
||||
y_pred_no_weight = stacker.predict(X_test)
|
||||
|
||||
with ignore_warnings(category=ConvergenceWarning):
|
||||
stacker.fit(X_train, y_train, sample_weight=np.ones(y_train.shape))
|
||||
y_pred_unit_weight = stacker.predict(X_test)
|
||||
|
||||
assert_allclose(y_pred_no_weight, y_pred_unit_weight)
|
||||
|
||||
with ignore_warnings(category=ConvergenceWarning):
|
||||
stacker.fit(X_train, y_train, sample_weight=sample_weight_train)
|
||||
y_pred_biased = stacker.predict(X_test)
|
||||
|
||||
assert np.abs(y_pred_no_weight - y_pred_biased).sum() > 0
|
||||
|
||||
|
||||
def test_stacking_classifier_sample_weight_fit_param():
|
||||
# check sample_weight is passed to all invocations of fit
|
||||
stacker = StackingClassifier(
|
||||
estimators=[("lr", CheckingClassifier(expected_sample_weight=True))],
|
||||
final_estimator=CheckingClassifier(expected_sample_weight=True),
|
||||
)
|
||||
stacker.fit(X_iris, y_iris, sample_weight=np.ones(X_iris.shape[0]))
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning")
|
||||
@pytest.mark.parametrize(
|
||||
"stacker, X, y",
|
||||
[
|
||||
(
|
||||
StackingClassifier(
|
||||
estimators=[
|
||||
("lr", LogisticRegression()),
|
||||
("svm", LinearSVC(random_state=42)),
|
||||
],
|
||||
final_estimator=LogisticRegression(),
|
||||
),
|
||||
*load_breast_cancer(return_X_y=True),
|
||||
),
|
||||
(
|
||||
StackingRegressor(
|
||||
estimators=[
|
||||
("lr", LinearRegression()),
|
||||
("svm", LinearSVR(random_state=42)),
|
||||
],
|
||||
final_estimator=LinearRegression(),
|
||||
),
|
||||
X_diabetes,
|
||||
y_diabetes,
|
||||
),
|
||||
],
|
||||
ids=["StackingClassifier", "StackingRegressor"],
|
||||
)
|
||||
def test_stacking_cv_influence(stacker, X, y):
|
||||
# check that the stacking affects the fit of the final estimator but not
|
||||
# the fit of the base estimators
|
||||
# note: ConvergenceWarning are catch since we are not worrying about the
|
||||
# convergence here
|
||||
stacker_cv_3 = clone(stacker)
|
||||
stacker_cv_5 = clone(stacker)
|
||||
|
||||
stacker_cv_3.set_params(cv=3)
|
||||
stacker_cv_5.set_params(cv=5)
|
||||
|
||||
stacker_cv_3.fit(X, y)
|
||||
stacker_cv_5.fit(X, y)
|
||||
|
||||
# the base estimators should be identical
|
||||
for est_cv_3, est_cv_5 in zip(stacker_cv_3.estimators_, stacker_cv_5.estimators_):
|
||||
assert_allclose(est_cv_3.coef_, est_cv_5.coef_)
|
||||
|
||||
# the final estimator should be different
|
||||
with pytest.raises(AssertionError, match="Not equal"):
|
||||
assert_allclose(
|
||||
stacker_cv_3.final_estimator_.coef_, stacker_cv_5.final_estimator_.coef_
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"Stacker, Estimator, stack_method, final_estimator, X, y",
|
||||
[
|
||||
(
|
||||
StackingClassifier,
|
||||
DummyClassifier,
|
||||
"predict_proba",
|
||||
LogisticRegression(random_state=42),
|
||||
X_iris,
|
||||
y_iris,
|
||||
),
|
||||
(
|
||||
StackingRegressor,
|
||||
DummyRegressor,
|
||||
"predict",
|
||||
LinearRegression(),
|
||||
X_diabetes,
|
||||
y_diabetes,
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_stacking_prefit(Stacker, Estimator, stack_method, final_estimator, X, y):
|
||||
"""Check the behaviour of stacking when `cv='prefit'`"""
|
||||
X_train1, X_train2, y_train1, y_train2 = train_test_split(
|
||||
X, y, random_state=42, test_size=0.5
|
||||
)
|
||||
estimators = [
|
||||
("d0", Estimator().fit(X_train1, y_train1)),
|
||||
("d1", Estimator().fit(X_train1, y_train1)),
|
||||
]
|
||||
|
||||
# mock out fit and stack_method to be asserted later
|
||||
for _, estimator in estimators:
|
||||
estimator.fit = Mock(name="fit")
|
||||
stack_func = getattr(estimator, stack_method)
|
||||
predict_method_mocked = Mock(side_effect=stack_func)
|
||||
# Mocking a method will not provide a `__name__` while Python methods
|
||||
# do and we are using it in `_get_response_method`.
|
||||
predict_method_mocked.__name__ = stack_method
|
||||
setattr(estimator, stack_method, predict_method_mocked)
|
||||
|
||||
stacker = Stacker(
|
||||
estimators=estimators, cv="prefit", final_estimator=final_estimator
|
||||
)
|
||||
stacker.fit(X_train2, y_train2)
|
||||
|
||||
assert stacker.estimators_ == [estimator for _, estimator in estimators]
|
||||
# fit was not called again
|
||||
assert all(estimator.fit.call_count == 0 for estimator in stacker.estimators_)
|
||||
|
||||
# stack method is called with the proper inputs
|
||||
for estimator in stacker.estimators_:
|
||||
stack_func_mock = getattr(estimator, stack_method)
|
||||
stack_func_mock.assert_called_with(X_train2)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"stacker, X, y",
|
||||
[
|
||||
(
|
||||
StackingClassifier(
|
||||
estimators=[("lr", LogisticRegression()), ("svm", SVC())],
|
||||
cv="prefit",
|
||||
),
|
||||
X_iris,
|
||||
y_iris,
|
||||
),
|
||||
(
|
||||
StackingRegressor(
|
||||
estimators=[
|
||||
("lr", LinearRegression()),
|
||||
("svm", LinearSVR()),
|
||||
],
|
||||
cv="prefit",
|
||||
),
|
||||
X_diabetes,
|
||||
y_diabetes,
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_stacking_prefit_error(stacker, X, y):
|
||||
# check that NotFittedError is raised
|
||||
# if base estimators are not fitted when cv="prefit"
|
||||
with pytest.raises(NotFittedError):
|
||||
stacker.fit(X, y)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"make_dataset, Stacking, Estimator",
|
||||
[
|
||||
(make_classification, StackingClassifier, LogisticRegression),
|
||||
(make_regression, StackingRegressor, LinearRegression),
|
||||
],
|
||||
)
|
||||
def test_stacking_without_n_features_in(make_dataset, Stacking, Estimator):
|
||||
# Stacking supports estimators without `n_features_in_`. Regression test
|
||||
# for #17353
|
||||
|
||||
class MyEstimator(Estimator):
|
||||
"""Estimator without n_features_in_"""
|
||||
|
||||
def fit(self, X, y):
|
||||
super().fit(X, y)
|
||||
del self.n_features_in_
|
||||
|
||||
X, y = make_dataset(random_state=0, n_samples=100)
|
||||
stacker = Stacking(estimators=[("lr", MyEstimator())])
|
||||
|
||||
msg = f"{Stacking.__name__} object has no attribute n_features_in_"
|
||||
with pytest.raises(AttributeError, match=msg):
|
||||
stacker.n_features_in_
|
||||
|
||||
# Does not raise
|
||||
stacker.fit(X, y)
|
||||
|
||||
msg = "'MyEstimator' object has no attribute 'n_features_in_'"
|
||||
with pytest.raises(AttributeError, match=msg):
|
||||
stacker.n_features_in_
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"estimator",
|
||||
[
|
||||
# output a 2D array of the probability of the positive class for each output
|
||||
MLPClassifier(random_state=42),
|
||||
# output a list of 2D array containing the probability of each class
|
||||
# for each output
|
||||
RandomForestClassifier(random_state=42),
|
||||
],
|
||||
ids=["MLPClassifier", "RandomForestClassifier"],
|
||||
)
|
||||
def test_stacking_classifier_multilabel_predict_proba(estimator):
|
||||
"""Check the behaviour for the multilabel classification case and the
|
||||
`predict_proba` stacking method.
|
||||
|
||||
Estimators are not consistent with the output arrays and we need to ensure that
|
||||
we handle all cases.
|
||||
"""
|
||||
X_train, X_test, y_train, y_test = train_test_split(
|
||||
X_multilabel, y_multilabel, stratify=y_multilabel, random_state=42
|
||||
)
|
||||
n_outputs = 3
|
||||
|
||||
estimators = [("est", estimator)]
|
||||
stacker = StackingClassifier(
|
||||
estimators=estimators,
|
||||
final_estimator=KNeighborsClassifier(),
|
||||
stack_method="predict_proba",
|
||||
).fit(X_train, y_train)
|
||||
|
||||
X_trans = stacker.transform(X_test)
|
||||
assert X_trans.shape == (X_test.shape[0], n_outputs)
|
||||
# we should not have any collinear classes and thus nothing should sum to 1
|
||||
assert not any(np.isclose(X_trans.sum(axis=1), 1.0))
|
||||
|
||||
y_pred = stacker.predict(X_test)
|
||||
assert y_pred.shape == y_test.shape
|
||||
|
||||
|
||||
def test_stacking_classifier_multilabel_decision_function():
|
||||
"""Check the behaviour for the multilabel classification case and the
|
||||
`decision_function` stacking method. Only `RidgeClassifier` supports this
|
||||
case.
|
||||
"""
|
||||
X_train, X_test, y_train, y_test = train_test_split(
|
||||
X_multilabel, y_multilabel, stratify=y_multilabel, random_state=42
|
||||
)
|
||||
n_outputs = 3
|
||||
|
||||
estimators = [("est", RidgeClassifier())]
|
||||
stacker = StackingClassifier(
|
||||
estimators=estimators,
|
||||
final_estimator=KNeighborsClassifier(),
|
||||
stack_method="decision_function",
|
||||
).fit(X_train, y_train)
|
||||
|
||||
X_trans = stacker.transform(X_test)
|
||||
assert X_trans.shape == (X_test.shape[0], n_outputs)
|
||||
|
||||
y_pred = stacker.predict(X_test)
|
||||
assert y_pred.shape == y_test.shape
|
||||
|
||||
|
||||
@pytest.mark.parametrize("stack_method", ["auto", "predict"])
|
||||
@pytest.mark.parametrize("passthrough", [False, True])
|
||||
def test_stacking_classifier_multilabel_auto_predict(stack_method, passthrough):
|
||||
"""Check the behaviour for the multilabel classification case for stack methods
|
||||
supported for all estimators or automatically picked up.
|
||||
"""
|
||||
X_train, X_test, y_train, y_test = train_test_split(
|
||||
X_multilabel, y_multilabel, stratify=y_multilabel, random_state=42
|
||||
)
|
||||
y_train_before_fit = y_train.copy()
|
||||
n_outputs = 3
|
||||
|
||||
estimators = [
|
||||
("mlp", MLPClassifier(random_state=42)),
|
||||
("rf", RandomForestClassifier(random_state=42)),
|
||||
("ridge", RidgeClassifier()),
|
||||
]
|
||||
final_estimator = KNeighborsClassifier()
|
||||
|
||||
clf = StackingClassifier(
|
||||
estimators=estimators,
|
||||
final_estimator=final_estimator,
|
||||
passthrough=passthrough,
|
||||
stack_method=stack_method,
|
||||
).fit(X_train, y_train)
|
||||
|
||||
# make sure we don't change `y_train` inplace
|
||||
assert_array_equal(y_train_before_fit, y_train)
|
||||
|
||||
y_pred = clf.predict(X_test)
|
||||
assert y_pred.shape == y_test.shape
|
||||
|
||||
if stack_method == "auto":
|
||||
expected_stack_methods = ["predict_proba", "predict_proba", "decision_function"]
|
||||
else:
|
||||
expected_stack_methods = ["predict"] * len(estimators)
|
||||
assert clf.stack_method_ == expected_stack_methods
|
||||
|
||||
n_features_X_trans = n_outputs * len(estimators)
|
||||
if passthrough:
|
||||
n_features_X_trans += X_train.shape[1]
|
||||
X_trans = clf.transform(X_test)
|
||||
assert X_trans.shape == (X_test.shape[0], n_features_X_trans)
|
||||
|
||||
assert_array_equal(clf.classes_, [np.array([0, 1])] * n_outputs)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"stacker, feature_names, X, y, expected_names",
|
||||
[
|
||||
(
|
||||
StackingClassifier(
|
||||
estimators=[
|
||||
("lr", LogisticRegression(random_state=0)),
|
||||
("svm", LinearSVC(random_state=0)),
|
||||
]
|
||||
),
|
||||
iris.feature_names,
|
||||
X_iris,
|
||||
y_iris,
|
||||
[
|
||||
"stackingclassifier_lr0",
|
||||
"stackingclassifier_lr1",
|
||||
"stackingclassifier_lr2",
|
||||
"stackingclassifier_svm0",
|
||||
"stackingclassifier_svm1",
|
||||
"stackingclassifier_svm2",
|
||||
],
|
||||
),
|
||||
(
|
||||
StackingClassifier(
|
||||
estimators=[
|
||||
("lr", LogisticRegression(random_state=0)),
|
||||
("other", "drop"),
|
||||
("svm", LinearSVC(random_state=0)),
|
||||
]
|
||||
),
|
||||
iris.feature_names,
|
||||
X_iris[:100],
|
||||
y_iris[:100], # keep only classes 0 and 1
|
||||
[
|
||||
"stackingclassifier_lr",
|
||||
"stackingclassifier_svm",
|
||||
],
|
||||
),
|
||||
(
|
||||
StackingRegressor(
|
||||
estimators=[
|
||||
("lr", LinearRegression()),
|
||||
("svm", LinearSVR(random_state=0)),
|
||||
]
|
||||
),
|
||||
diabetes.feature_names,
|
||||
X_diabetes,
|
||||
y_diabetes,
|
||||
[
|
||||
"stackingregressor_lr",
|
||||
"stackingregressor_svm",
|
||||
],
|
||||
),
|
||||
],
|
||||
ids=[
|
||||
"StackingClassifier_multiclass",
|
||||
"StackingClassifier_binary",
|
||||
"StackingRegressor",
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("passthrough", [True, False])
|
||||
def test_get_feature_names_out(
|
||||
stacker, feature_names, X, y, expected_names, passthrough
|
||||
):
|
||||
"""Check get_feature_names_out works for stacking."""
|
||||
|
||||
stacker.set_params(passthrough=passthrough)
|
||||
stacker.fit(scale(X), y)
|
||||
|
||||
if passthrough:
|
||||
expected_names = np.concatenate((expected_names, feature_names))
|
||||
|
||||
names_out = stacker.get_feature_names_out(feature_names)
|
||||
assert_array_equal(names_out, expected_names)
|
||||
|
||||
|
||||
def test_stacking_classifier_base_regressor():
|
||||
"""Check that a regressor can be used as the first layer in `StackingClassifier`."""
|
||||
X_train, X_test, y_train, y_test = train_test_split(
|
||||
scale(X_iris), y_iris, stratify=y_iris, random_state=42
|
||||
)
|
||||
clf = StackingClassifier(estimators=[("ridge", Ridge())])
|
||||
clf.fit(X_train, y_train)
|
||||
clf.predict(X_test)
|
||||
clf.predict_proba(X_test)
|
||||
assert clf.score(X_test, y_test) > 0.8
|
||||
|
||||
|
||||
def test_stacking_final_estimator_attribute_error():
|
||||
"""Check that we raise the proper AttributeError when the final estimator
|
||||
does not implement the `decision_function` method, which is decorated with
|
||||
`available_if`.
|
||||
|
||||
Non-regression test for:
|
||||
https://github.com/scikit-learn/scikit-learn/issues/28108
|
||||
"""
|
||||
X, y = make_classification(random_state=42)
|
||||
|
||||
estimators = [
|
||||
("lr", LogisticRegression()),
|
||||
("rf", RandomForestClassifier(n_estimators=2, random_state=42)),
|
||||
]
|
||||
# RandomForestClassifier does not implement 'decision_function' and should raise
|
||||
# an AttributeError
|
||||
final_estimator = RandomForestClassifier(n_estimators=2, random_state=42)
|
||||
clf = StackingClassifier(
|
||||
estimators=estimators, final_estimator=final_estimator, cv=3
|
||||
)
|
||||
|
||||
outer_msg = "This 'StackingClassifier' has no attribute 'decision_function'"
|
||||
inner_msg = "'RandomForestClassifier' object has no attribute 'decision_function'"
|
||||
with pytest.raises(AttributeError, match=outer_msg) as exec_info:
|
||||
clf.fit(X, y).decision_function(X)
|
||||
assert isinstance(exec_info.value.__cause__, AttributeError)
|
||||
assert inner_msg in str(exec_info.value.__cause__)
|
||||
@ -0,0 +1,788 @@
|
||||
"""Testing for the VotingClassifier and VotingRegressor"""
|
||||
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn import datasets
|
||||
from sklearn.base import BaseEstimator, ClassifierMixin, clone
|
||||
from sklearn.datasets import make_multilabel_classification
|
||||
from sklearn.dummy import DummyRegressor
|
||||
from sklearn.ensemble import (
|
||||
RandomForestClassifier,
|
||||
RandomForestRegressor,
|
||||
VotingClassifier,
|
||||
VotingRegressor,
|
||||
)
|
||||
from sklearn.exceptions import NotFittedError
|
||||
from sklearn.linear_model import LinearRegression, LogisticRegression
|
||||
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
|
||||
from sklearn.multiclass import OneVsRestClassifier
|
||||
from sklearn.naive_bayes import GaussianNB
|
||||
from sklearn.neighbors import KNeighborsClassifier
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.svm import SVC
|
||||
from sklearn.tests.metadata_routing_common import (
|
||||
ConsumingClassifier,
|
||||
ConsumingRegressor,
|
||||
_Registry,
|
||||
check_recorded_metadata,
|
||||
)
|
||||
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
|
||||
from sklearn.utils._testing import (
|
||||
assert_almost_equal,
|
||||
assert_array_almost_equal,
|
||||
assert_array_equal,
|
||||
ignore_warnings,
|
||||
)
|
||||
|
||||
# Load datasets
|
||||
iris = datasets.load_iris()
|
||||
X, y = iris.data[:, 1:3], iris.target
|
||||
# Scaled to solve ConvergenceWarning throw by Logistic Regression
|
||||
X_scaled = StandardScaler().fit_transform(X)
|
||||
|
||||
X_r, y_r = datasets.load_diabetes(return_X_y=True)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"params, err_msg",
|
||||
[
|
||||
(
|
||||
{"estimators": []},
|
||||
"Invalid 'estimators' attribute, 'estimators' should be a non-empty list",
|
||||
),
|
||||
(
|
||||
{"estimators": [("lr", LogisticRegression())], "weights": [1, 2]},
|
||||
"Number of `estimators` and weights must be equal",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_voting_classifier_estimator_init(params, err_msg):
|
||||
ensemble = VotingClassifier(**params)
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
ensemble.fit(X, y)
|
||||
|
||||
|
||||
def test_predictproba_hardvoting():
|
||||
eclf = VotingClassifier(
|
||||
estimators=[("lr1", LogisticRegression()), ("lr2", LogisticRegression())],
|
||||
voting="hard",
|
||||
)
|
||||
|
||||
inner_msg = "predict_proba is not available when voting='hard'"
|
||||
outer_msg = "'VotingClassifier' has no attribute 'predict_proba'"
|
||||
with pytest.raises(AttributeError, match=outer_msg) as exec_info:
|
||||
eclf.predict_proba
|
||||
assert isinstance(exec_info.value.__cause__, AttributeError)
|
||||
assert inner_msg in str(exec_info.value.__cause__)
|
||||
|
||||
assert not hasattr(eclf, "predict_proba")
|
||||
eclf.fit(X_scaled, y)
|
||||
assert not hasattr(eclf, "predict_proba")
|
||||
|
||||
|
||||
def test_notfitted():
|
||||
eclf = VotingClassifier(
|
||||
estimators=[("lr1", LogisticRegression()), ("lr2", LogisticRegression())],
|
||||
voting="soft",
|
||||
)
|
||||
ereg = VotingRegressor([("dr", DummyRegressor())])
|
||||
msg = (
|
||||
"This %s instance is not fitted yet. Call 'fit'"
|
||||
" with appropriate arguments before using this estimator."
|
||||
)
|
||||
with pytest.raises(NotFittedError, match=msg % "VotingClassifier"):
|
||||
eclf.predict(X)
|
||||
with pytest.raises(NotFittedError, match=msg % "VotingClassifier"):
|
||||
eclf.predict_proba(X)
|
||||
with pytest.raises(NotFittedError, match=msg % "VotingClassifier"):
|
||||
eclf.transform(X)
|
||||
with pytest.raises(NotFittedError, match=msg % "VotingRegressor"):
|
||||
ereg.predict(X_r)
|
||||
with pytest.raises(NotFittedError, match=msg % "VotingRegressor"):
|
||||
ereg.transform(X_r)
|
||||
|
||||
|
||||
def test_majority_label_iris(global_random_seed):
|
||||
"""Check classification by majority label on dataset iris."""
|
||||
clf1 = LogisticRegression(solver="liblinear", random_state=global_random_seed)
|
||||
clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed)
|
||||
clf3 = GaussianNB()
|
||||
eclf = VotingClassifier(
|
||||
estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="hard"
|
||||
)
|
||||
scores = cross_val_score(eclf, X, y, scoring="accuracy")
|
||||
|
||||
assert scores.mean() >= 0.9
|
||||
|
||||
|
||||
def test_tie_situation():
|
||||
"""Check voting classifier selects smaller class label in tie situation."""
|
||||
clf1 = LogisticRegression(random_state=123, solver="liblinear")
|
||||
clf2 = RandomForestClassifier(random_state=123)
|
||||
eclf = VotingClassifier(estimators=[("lr", clf1), ("rf", clf2)], voting="hard")
|
||||
assert clf1.fit(X, y).predict(X)[73] == 2
|
||||
assert clf2.fit(X, y).predict(X)[73] == 1
|
||||
assert eclf.fit(X, y).predict(X)[73] == 1
|
||||
|
||||
|
||||
def test_weights_iris(global_random_seed):
|
||||
"""Check classification by average probabilities on dataset iris."""
|
||||
clf1 = LogisticRegression(random_state=global_random_seed)
|
||||
clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed)
|
||||
clf3 = GaussianNB()
|
||||
eclf = VotingClassifier(
|
||||
estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)],
|
||||
voting="soft",
|
||||
weights=[1, 2, 10],
|
||||
)
|
||||
scores = cross_val_score(eclf, X_scaled, y, scoring="accuracy")
|
||||
assert scores.mean() >= 0.9
|
||||
|
||||
|
||||
def test_weights_regressor():
|
||||
"""Check weighted average regression prediction on diabetes dataset."""
|
||||
reg1 = DummyRegressor(strategy="mean")
|
||||
reg2 = DummyRegressor(strategy="median")
|
||||
reg3 = DummyRegressor(strategy="quantile", quantile=0.2)
|
||||
ereg = VotingRegressor(
|
||||
[("mean", reg1), ("median", reg2), ("quantile", reg3)], weights=[1, 2, 10]
|
||||
)
|
||||
|
||||
X_r_train, X_r_test, y_r_train, y_r_test = train_test_split(
|
||||
X_r, y_r, test_size=0.25
|
||||
)
|
||||
|
||||
reg1_pred = reg1.fit(X_r_train, y_r_train).predict(X_r_test)
|
||||
reg2_pred = reg2.fit(X_r_train, y_r_train).predict(X_r_test)
|
||||
reg3_pred = reg3.fit(X_r_train, y_r_train).predict(X_r_test)
|
||||
ereg_pred = ereg.fit(X_r_train, y_r_train).predict(X_r_test)
|
||||
|
||||
avg = np.average(
|
||||
np.asarray([reg1_pred, reg2_pred, reg3_pred]), axis=0, weights=[1, 2, 10]
|
||||
)
|
||||
assert_almost_equal(ereg_pred, avg, decimal=2)
|
||||
|
||||
ereg_weights_none = VotingRegressor(
|
||||
[("mean", reg1), ("median", reg2), ("quantile", reg3)], weights=None
|
||||
)
|
||||
ereg_weights_equal = VotingRegressor(
|
||||
[("mean", reg1), ("median", reg2), ("quantile", reg3)], weights=[1, 1, 1]
|
||||
)
|
||||
ereg_weights_none.fit(X_r_train, y_r_train)
|
||||
ereg_weights_equal.fit(X_r_train, y_r_train)
|
||||
ereg_none_pred = ereg_weights_none.predict(X_r_test)
|
||||
ereg_equal_pred = ereg_weights_equal.predict(X_r_test)
|
||||
assert_almost_equal(ereg_none_pred, ereg_equal_pred, decimal=2)
|
||||
|
||||
|
||||
def test_predict_on_toy_problem(global_random_seed):
|
||||
"""Manually check predicted class labels for toy dataset."""
|
||||
clf1 = LogisticRegression(random_state=global_random_seed)
|
||||
clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed)
|
||||
clf3 = GaussianNB()
|
||||
|
||||
X = np.array(
|
||||
[[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2], [2.1, 1.4], [3.1, 2.3]]
|
||||
)
|
||||
|
||||
y = np.array([1, 1, 1, 2, 2, 2])
|
||||
|
||||
assert_array_equal(clf1.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])
|
||||
assert_array_equal(clf2.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])
|
||||
assert_array_equal(clf3.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])
|
||||
|
||||
eclf = VotingClassifier(
|
||||
estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)],
|
||||
voting="hard",
|
||||
weights=[1, 1, 1],
|
||||
)
|
||||
assert_array_equal(eclf.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])
|
||||
|
||||
eclf = VotingClassifier(
|
||||
estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)],
|
||||
voting="soft",
|
||||
weights=[1, 1, 1],
|
||||
)
|
||||
assert_array_equal(eclf.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])
|
||||
|
||||
|
||||
def test_predict_proba_on_toy_problem():
|
||||
"""Calculate predicted probabilities on toy dataset."""
|
||||
clf1 = LogisticRegression(random_state=123)
|
||||
clf2 = RandomForestClassifier(random_state=123)
|
||||
clf3 = GaussianNB()
|
||||
X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
|
||||
y = np.array([1, 1, 2, 2])
|
||||
|
||||
clf1_res = np.array(
|
||||
[
|
||||
[0.59790391, 0.40209609],
|
||||
[0.57622162, 0.42377838],
|
||||
[0.50728456, 0.49271544],
|
||||
[0.40241774, 0.59758226],
|
||||
]
|
||||
)
|
||||
|
||||
clf2_res = np.array([[0.8, 0.2], [0.8, 0.2], [0.2, 0.8], [0.3, 0.7]])
|
||||
|
||||
clf3_res = np.array(
|
||||
[[0.9985082, 0.0014918], [0.99845843, 0.00154157], [0.0, 1.0], [0.0, 1.0]]
|
||||
)
|
||||
|
||||
t00 = (2 * clf1_res[0][0] + clf2_res[0][0] + clf3_res[0][0]) / 4
|
||||
t11 = (2 * clf1_res[1][1] + clf2_res[1][1] + clf3_res[1][1]) / 4
|
||||
t21 = (2 * clf1_res[2][1] + clf2_res[2][1] + clf3_res[2][1]) / 4
|
||||
t31 = (2 * clf1_res[3][1] + clf2_res[3][1] + clf3_res[3][1]) / 4
|
||||
|
||||
eclf = VotingClassifier(
|
||||
estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)],
|
||||
voting="soft",
|
||||
weights=[2, 1, 1],
|
||||
)
|
||||
eclf_res = eclf.fit(X, y).predict_proba(X)
|
||||
|
||||
assert_almost_equal(t00, eclf_res[0][0], decimal=1)
|
||||
assert_almost_equal(t11, eclf_res[1][1], decimal=1)
|
||||
assert_almost_equal(t21, eclf_res[2][1], decimal=1)
|
||||
assert_almost_equal(t31, eclf_res[3][1], decimal=1)
|
||||
|
||||
inner_msg = "predict_proba is not available when voting='hard'"
|
||||
outer_msg = "'VotingClassifier' has no attribute 'predict_proba'"
|
||||
with pytest.raises(AttributeError, match=outer_msg) as exec_info:
|
||||
eclf = VotingClassifier(
|
||||
estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="hard"
|
||||
)
|
||||
eclf.fit(X, y).predict_proba(X)
|
||||
|
||||
assert isinstance(exec_info.value.__cause__, AttributeError)
|
||||
assert inner_msg in str(exec_info.value.__cause__)
|
||||
|
||||
|
||||
def test_multilabel():
|
||||
"""Check if error is raised for multilabel classification."""
|
||||
X, y = make_multilabel_classification(
|
||||
n_classes=2, n_labels=1, allow_unlabeled=False, random_state=123
|
||||
)
|
||||
clf = OneVsRestClassifier(SVC(kernel="linear"))
|
||||
|
||||
eclf = VotingClassifier(estimators=[("ovr", clf)], voting="hard")
|
||||
|
||||
try:
|
||||
eclf.fit(X, y)
|
||||
except NotImplementedError:
|
||||
return
|
||||
|
||||
|
||||
def test_gridsearch():
|
||||
"""Check GridSearch support."""
|
||||
clf1 = LogisticRegression(random_state=1)
|
||||
clf2 = RandomForestClassifier(random_state=1, n_estimators=3)
|
||||
clf3 = GaussianNB()
|
||||
eclf = VotingClassifier(
|
||||
estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="soft"
|
||||
)
|
||||
|
||||
params = {
|
||||
"lr__C": [1.0, 100.0],
|
||||
"voting": ["soft", "hard"],
|
||||
"weights": [[0.5, 0.5, 0.5], [1.0, 0.5, 0.5]],
|
||||
}
|
||||
|
||||
grid = GridSearchCV(estimator=eclf, param_grid=params, cv=2)
|
||||
grid.fit(X_scaled, y)
|
||||
|
||||
|
||||
def test_parallel_fit(global_random_seed):
|
||||
"""Check parallel backend of VotingClassifier on toy dataset."""
|
||||
clf1 = LogisticRegression(random_state=global_random_seed)
|
||||
clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed)
|
||||
clf3 = GaussianNB()
|
||||
X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
|
||||
y = np.array([1, 1, 2, 2])
|
||||
|
||||
eclf1 = VotingClassifier(
|
||||
estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="soft", n_jobs=1
|
||||
).fit(X, y)
|
||||
eclf2 = VotingClassifier(
|
||||
estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="soft", n_jobs=2
|
||||
).fit(X, y)
|
||||
|
||||
assert_array_equal(eclf1.predict(X), eclf2.predict(X))
|
||||
assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
|
||||
|
||||
|
||||
@ignore_warnings(category=FutureWarning)
|
||||
def test_sample_weight(global_random_seed):
|
||||
"""Tests sample_weight parameter of VotingClassifier"""
|
||||
clf1 = LogisticRegression(random_state=global_random_seed)
|
||||
clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed)
|
||||
clf3 = SVC(probability=True, random_state=global_random_seed)
|
||||
eclf1 = VotingClassifier(
|
||||
estimators=[("lr", clf1), ("rf", clf2), ("svc", clf3)], voting="soft"
|
||||
).fit(X_scaled, y, sample_weight=np.ones((len(y),)))
|
||||
eclf2 = VotingClassifier(
|
||||
estimators=[("lr", clf1), ("rf", clf2), ("svc", clf3)], voting="soft"
|
||||
).fit(X_scaled, y)
|
||||
assert_array_equal(eclf1.predict(X_scaled), eclf2.predict(X_scaled))
|
||||
assert_array_almost_equal(
|
||||
eclf1.predict_proba(X_scaled), eclf2.predict_proba(X_scaled)
|
||||
)
|
||||
sample_weight = np.random.RandomState(global_random_seed).uniform(size=(len(y),))
|
||||
eclf3 = VotingClassifier(estimators=[("lr", clf1)], voting="soft")
|
||||
eclf3.fit(X_scaled, y, sample_weight)
|
||||
clf1.fit(X_scaled, y, sample_weight)
|
||||
assert_array_equal(eclf3.predict(X_scaled), clf1.predict(X_scaled))
|
||||
assert_array_almost_equal(
|
||||
eclf3.predict_proba(X_scaled), clf1.predict_proba(X_scaled)
|
||||
)
|
||||
|
||||
# check that an error is raised and indicative if sample_weight is not
|
||||
# supported.
|
||||
clf4 = KNeighborsClassifier()
|
||||
eclf3 = VotingClassifier(
|
||||
estimators=[("lr", clf1), ("svc", clf3), ("knn", clf4)], voting="soft"
|
||||
)
|
||||
msg = "Underlying estimator KNeighborsClassifier does not support sample weights."
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
eclf3.fit(X_scaled, y, sample_weight)
|
||||
|
||||
# check that _fit_single_estimator will raise the right error
|
||||
# it should raise the original error if this is not linked to sample_weight
|
||||
class ClassifierErrorFit(ClassifierMixin, BaseEstimator):
|
||||
def fit(self, X_scaled, y, sample_weight):
|
||||
raise TypeError("Error unrelated to sample_weight.")
|
||||
|
||||
clf = ClassifierErrorFit()
|
||||
with pytest.raises(TypeError, match="Error unrelated to sample_weight"):
|
||||
clf.fit(X_scaled, y, sample_weight=sample_weight)
|
||||
|
||||
|
||||
def test_sample_weight_kwargs():
|
||||
"""Check that VotingClassifier passes sample_weight as kwargs"""
|
||||
|
||||
class MockClassifier(ClassifierMixin, BaseEstimator):
|
||||
"""Mock Classifier to check that sample_weight is received as kwargs"""
|
||||
|
||||
def fit(self, X, y, *args, **sample_weight):
|
||||
assert "sample_weight" in sample_weight
|
||||
|
||||
clf = MockClassifier()
|
||||
eclf = VotingClassifier(estimators=[("mock", clf)], voting="soft")
|
||||
|
||||
# Should not raise an error.
|
||||
eclf.fit(X, y, sample_weight=np.ones((len(y),)))
|
||||
|
||||
|
||||
def test_voting_classifier_set_params(global_random_seed):
|
||||
# check equivalence in the output when setting underlying estimators
|
||||
clf1 = LogisticRegression(random_state=global_random_seed)
|
||||
clf2 = RandomForestClassifier(
|
||||
n_estimators=10, random_state=global_random_seed, max_depth=None
|
||||
)
|
||||
clf3 = GaussianNB()
|
||||
|
||||
eclf1 = VotingClassifier(
|
||||
[("lr", clf1), ("rf", clf2)], voting="soft", weights=[1, 2]
|
||||
).fit(X_scaled, y)
|
||||
eclf2 = VotingClassifier(
|
||||
[("lr", clf1), ("nb", clf3)], voting="soft", weights=[1, 2]
|
||||
)
|
||||
eclf2.set_params(nb=clf2).fit(X_scaled, y)
|
||||
|
||||
assert_array_equal(eclf1.predict(X_scaled), eclf2.predict(X_scaled))
|
||||
assert_array_almost_equal(
|
||||
eclf1.predict_proba(X_scaled), eclf2.predict_proba(X_scaled)
|
||||
)
|
||||
assert eclf2.estimators[0][1].get_params() == clf1.get_params()
|
||||
assert eclf2.estimators[1][1].get_params() == clf2.get_params()
|
||||
|
||||
|
||||
def test_set_estimator_drop():
|
||||
# VotingClassifier set_params should be able to set estimators as drop
|
||||
# Test predict
|
||||
clf1 = LogisticRegression(random_state=123)
|
||||
clf2 = RandomForestClassifier(n_estimators=10, random_state=123)
|
||||
clf3 = GaussianNB()
|
||||
eclf1 = VotingClassifier(
|
||||
estimators=[("lr", clf1), ("rf", clf2), ("nb", clf3)],
|
||||
voting="hard",
|
||||
weights=[1, 0, 0.5],
|
||||
).fit(X, y)
|
||||
|
||||
eclf2 = VotingClassifier(
|
||||
estimators=[("lr", clf1), ("rf", clf2), ("nb", clf3)],
|
||||
voting="hard",
|
||||
weights=[1, 1, 0.5],
|
||||
)
|
||||
eclf2.set_params(rf="drop").fit(X, y)
|
||||
|
||||
assert_array_equal(eclf1.predict(X), eclf2.predict(X))
|
||||
|
||||
assert dict(eclf2.estimators)["rf"] == "drop"
|
||||
assert len(eclf2.estimators_) == 2
|
||||
assert all(
|
||||
isinstance(est, (LogisticRegression, GaussianNB)) for est in eclf2.estimators_
|
||||
)
|
||||
assert eclf2.get_params()["rf"] == "drop"
|
||||
|
||||
eclf1.set_params(voting="soft").fit(X, y)
|
||||
eclf2.set_params(voting="soft").fit(X, y)
|
||||
|
||||
assert_array_equal(eclf1.predict(X), eclf2.predict(X))
|
||||
assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
|
||||
msg = "All estimators are dropped. At least one is required"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
eclf2.set_params(lr="drop", rf="drop", nb="drop").fit(X, y)
|
||||
|
||||
# Test soft voting transform
|
||||
X1 = np.array([[1], [2]])
|
||||
y1 = np.array([1, 2])
|
||||
eclf1 = VotingClassifier(
|
||||
estimators=[("rf", clf2), ("nb", clf3)],
|
||||
voting="soft",
|
||||
weights=[0, 0.5],
|
||||
flatten_transform=False,
|
||||
).fit(X1, y1)
|
||||
|
||||
eclf2 = VotingClassifier(
|
||||
estimators=[("rf", clf2), ("nb", clf3)],
|
||||
voting="soft",
|
||||
weights=[1, 0.5],
|
||||
flatten_transform=False,
|
||||
)
|
||||
eclf2.set_params(rf="drop").fit(X1, y1)
|
||||
assert_array_almost_equal(
|
||||
eclf1.transform(X1),
|
||||
np.array([[[0.7, 0.3], [0.3, 0.7]], [[1.0, 0.0], [0.0, 1.0]]]),
|
||||
)
|
||||
assert_array_almost_equal(eclf2.transform(X1), np.array([[[1.0, 0.0], [0.0, 1.0]]]))
|
||||
eclf1.set_params(voting="hard")
|
||||
eclf2.set_params(voting="hard")
|
||||
assert_array_equal(eclf1.transform(X1), np.array([[0, 0], [1, 1]]))
|
||||
assert_array_equal(eclf2.transform(X1), np.array([[0], [1]]))
|
||||
|
||||
|
||||
def test_estimator_weights_format(global_random_seed):
|
||||
# Test estimator weights inputs as list and array
|
||||
clf1 = LogisticRegression(random_state=global_random_seed)
|
||||
clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed)
|
||||
eclf1 = VotingClassifier(
|
||||
estimators=[("lr", clf1), ("rf", clf2)], weights=[1, 2], voting="soft"
|
||||
)
|
||||
eclf2 = VotingClassifier(
|
||||
estimators=[("lr", clf1), ("rf", clf2)], weights=np.array((1, 2)), voting="soft"
|
||||
)
|
||||
eclf1.fit(X_scaled, y)
|
||||
eclf2.fit(X_scaled, y)
|
||||
assert_array_almost_equal(
|
||||
eclf1.predict_proba(X_scaled), eclf2.predict_proba(X_scaled)
|
||||
)
|
||||
|
||||
|
||||
def test_transform(global_random_seed):
|
||||
"""Check transform method of VotingClassifier on toy dataset."""
|
||||
clf1 = LogisticRegression(random_state=global_random_seed)
|
||||
clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed)
|
||||
clf3 = GaussianNB()
|
||||
X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
|
||||
y = np.array([1, 1, 2, 2])
|
||||
|
||||
eclf1 = VotingClassifier(
|
||||
estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="soft"
|
||||
).fit(X, y)
|
||||
eclf2 = VotingClassifier(
|
||||
estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)],
|
||||
voting="soft",
|
||||
flatten_transform=True,
|
||||
).fit(X, y)
|
||||
eclf3 = VotingClassifier(
|
||||
estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)],
|
||||
voting="soft",
|
||||
flatten_transform=False,
|
||||
).fit(X, y)
|
||||
|
||||
assert_array_equal(eclf1.transform(X).shape, (4, 6))
|
||||
assert_array_equal(eclf2.transform(X).shape, (4, 6))
|
||||
assert_array_equal(eclf3.transform(X).shape, (3, 4, 2))
|
||||
assert_array_almost_equal(eclf1.transform(X), eclf2.transform(X))
|
||||
assert_array_almost_equal(
|
||||
eclf3.transform(X).swapaxes(0, 1).reshape((4, 6)), eclf2.transform(X)
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"X, y, voter",
|
||||
[
|
||||
(
|
||||
X,
|
||||
y,
|
||||
VotingClassifier(
|
||||
[
|
||||
("lr", LogisticRegression()),
|
||||
("rf", RandomForestClassifier(n_estimators=5)),
|
||||
]
|
||||
),
|
||||
),
|
||||
(
|
||||
X_r,
|
||||
y_r,
|
||||
VotingRegressor(
|
||||
[
|
||||
("lr", LinearRegression()),
|
||||
("rf", RandomForestRegressor(n_estimators=5)),
|
||||
]
|
||||
),
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_none_estimator_with_weights(X, y, voter):
|
||||
# check that an estimator can be set to 'drop' and passing some weight
|
||||
# regression test for
|
||||
# https://github.com/scikit-learn/scikit-learn/issues/13777
|
||||
voter = clone(voter)
|
||||
# Scaled to solve ConvergenceWarning throw by Logistic Regression
|
||||
X_scaled = StandardScaler().fit_transform(X)
|
||||
voter.fit(X_scaled, y, sample_weight=np.ones(y.shape))
|
||||
voter.set_params(lr="drop")
|
||||
voter.fit(X_scaled, y, sample_weight=np.ones(y.shape))
|
||||
y_pred = voter.predict(X_scaled)
|
||||
assert y_pred.shape == y.shape
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "est",
    [
        VotingRegressor(
            estimators=[
                ("lr", LinearRegression()),
                ("tree", DecisionTreeRegressor(random_state=0)),
            ]
        ),
        VotingClassifier(
            estimators=[
                ("lr", LogisticRegression(random_state=0)),
                ("tree", DecisionTreeClassifier(random_state=0)),
            ]
        ),
    ],
    ids=["VotingRegressor", "VotingClassifier"],
)
def test_n_features_in(est):
    X = [[1, 2], [3, 4], [5, 6]]
    y = [0, 1, 2]

    assert not hasattr(est, "n_features_in_")
    est.fit(X, y)
    assert est.n_features_in_ == 2


@pytest.mark.parametrize(
    "estimator",
    [
        VotingRegressor(
            estimators=[
                ("lr", LinearRegression()),
                ("rf", RandomForestRegressor(random_state=123)),
            ],
            verbose=True,
        ),
        VotingClassifier(
            estimators=[
                ("lr", LogisticRegression(random_state=123)),
                ("rf", RandomForestClassifier(random_state=123)),
            ],
            verbose=True,
        ),
    ],
)
def test_voting_verbose(estimator, capsys):
    X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
    y = np.array([1, 1, 2, 2])

    pattern = (
        r"\[Voting\].*\(1 of 2\) Processing lr, total=.*\n"
        r"\[Voting\].*\(2 of 2\) Processing rf, total=.*\n$"
    )

    estimator.fit(X, y)
    assert re.match(pattern, capsys.readouterr()[0])


def test_get_features_names_out_regressor():
    """Check get_feature_names_out output for regressor."""

    X = [[1, 2], [3, 4], [5, 6]]
    y = [0, 1, 2]

    voting = VotingRegressor(
        estimators=[
            ("lr", LinearRegression()),
            ("tree", DecisionTreeRegressor(random_state=0)),
            ("ignore", "drop"),
        ]
    )
    voting.fit(X, y)

    names_out = voting.get_feature_names_out()
    expected_names = ["votingregressor_lr", "votingregressor_tree"]
    assert_array_equal(names_out, expected_names)


@pytest.mark.parametrize(
    "kwargs, expected_names",
    [
        (
            {"voting": "soft", "flatten_transform": True},
            [
                "votingclassifier_lr0",
                "votingclassifier_lr1",
                "votingclassifier_lr2",
                "votingclassifier_tree0",
                "votingclassifier_tree1",
                "votingclassifier_tree2",
            ],
        ),
        ({"voting": "hard"}, ["votingclassifier_lr", "votingclassifier_tree"]),
    ],
)
def test_get_features_names_out_classifier(kwargs, expected_names):
    """Check get_feature_names_out for classifier for different settings."""
    X = [[1, 2], [3, 4], [5, 6], [1, 1.2]]
    y = [0, 1, 2, 0]

    voting = VotingClassifier(
        estimators=[
            ("lr", LogisticRegression(random_state=0)),
            ("tree", DecisionTreeClassifier(random_state=0)),
        ],
        **kwargs,
    )
    voting.fit(X, y)
    X_trans = voting.transform(X)
    names_out = voting.get_feature_names_out()

    assert X_trans.shape[1] == len(expected_names)
    assert_array_equal(names_out, expected_names)


def test_get_features_names_out_classifier_error():
    """Check that error is raised when voting="soft" and flatten_transform=False."""
    X = [[1, 2], [3, 4], [5, 6]]
    y = [0, 1, 2]

    voting = VotingClassifier(
        estimators=[
            ("lr", LogisticRegression(random_state=0)),
            ("tree", DecisionTreeClassifier(random_state=0)),
        ],
        voting="soft",
        flatten_transform=False,
    )
    voting.fit(X, y)

    msg = (
        "get_feature_names_out is not supported when `voting='soft'` and "
        "`flatten_transform=False`"
    )
    with pytest.raises(ValueError, match=msg):
        voting.get_feature_names_out()
# Metadata Routing Tests
# ======================


@pytest.mark.parametrize(
    "Estimator, Child",
    [(VotingClassifier, ConsumingClassifier), (VotingRegressor, ConsumingRegressor)],
)
def test_routing_passed_metadata_not_supported(Estimator, Child):
    """Test that the right error message is raised when metadata is passed while
    not supported when `enable_metadata_routing=False`."""

    X = np.array([[0, 1], [2, 2], [4, 6]])
    y = [1, 2, 3]

    with pytest.raises(
        ValueError, match="is only supported if enable_metadata_routing=True"
    ):
        Estimator(["clf", Child()]).fit(X, y, sample_weight=[1, 1, 1], metadata="a")


@pytest.mark.usefixtures("enable_slep006")
@pytest.mark.parametrize(
    "Estimator, Child",
    [(VotingClassifier, ConsumingClassifier), (VotingRegressor, ConsumingRegressor)],
)
def test_get_metadata_routing_without_fit(Estimator, Child):
    # Test that get_metadata_routing() doesn't raise when called before fit.
    est = Estimator([("sub_est", Child())])
    est.get_metadata_routing()


@pytest.mark.usefixtures("enable_slep006")
@pytest.mark.parametrize(
    "Estimator, Child",
    [(VotingClassifier, ConsumingClassifier), (VotingRegressor, ConsumingRegressor)],
)
@pytest.mark.parametrize("prop", ["sample_weight", "metadata"])
def test_metadata_routing_for_voting_estimators(Estimator, Child, prop):
    """Test that metadata is routed correctly for Voting*."""
    X = np.array([[0, 1], [2, 2], [4, 6]])
    y = [1, 2, 3]
    sample_weight, metadata = [1, 1, 1], "a"

    est = Estimator(
        [
            (
                "sub_est1",
                Child(registry=_Registry()).set_fit_request(**{prop: True}),
            ),
            (
                "sub_est2",
                Child(registry=_Registry()).set_fit_request(**{prop: True}),
            ),
        ]
    )

    est.fit(X, y, **{prop: sample_weight if prop == "sample_weight" else metadata})

    for estimator in est.estimators:
        if prop == "sample_weight":
            kwargs = {prop: sample_weight}
        else:
            kwargs = {prop: metadata}
        # access sub-estimator in (name, est) with estimator[1]
        registry = estimator[1].registry
        assert len(registry)
        for sub_est in registry:
            check_recorded_metadata(obj=sub_est, method="fit", **kwargs)


@pytest.mark.usefixtures("enable_slep006")
@pytest.mark.parametrize(
    "Estimator, Child",
    [(VotingClassifier, ConsumingClassifier), (VotingRegressor, ConsumingRegressor)],
)
def test_metadata_routing_error_for_voting_estimators(Estimator, Child):
    """Test that the right error is raised when metadata is not requested."""
    X = np.array([[0, 1], [2, 2], [4, 6]])
    y = [1, 2, 3]
    sample_weight, metadata = [1, 1, 1], "a"

    est = Estimator([("sub_est", Child())])

    error_message = (
        "[sample_weight, metadata] are passed but are not explicitly set as requested"
        f" or not requested for {Child.__name__}.fit"
    )

    with pytest.raises(ValueError, match=re.escape(error_message)):
        est.fit(X, y, sample_weight=sample_weight, metadata=metadata)


# End of Metadata Routing Tests
# =============================
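For orientation, here is a minimal, self-contained sketch (illustrative only, not part of the diff above; the toy data and estimator names are arbitrary) of the soft-voting `transform` behaviour that `test_transform` exercises: with `flatten_transform=True`, the per-estimator class probabilities are concatenated column-wise, so three estimators and two classes give a `(n_samples, 6)` array.

import numpy as np
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
y = np.array([1, 1, 2, 2])

# Three probabilistic estimators, two classes: soft voting with
# flatten_transform=True concatenates the predict_proba outputs,
# giving a (4, 3 * 2) = (4, 6) array.
voter = VotingClassifier(
    estimators=[
        ("lr", LogisticRegression()),
        ("tree", DecisionTreeClassifier(random_state=0)),
        ("gnb", GaussianNB()),
    ],
    voting="soft",
    flatten_transform=True,
).fit(X, y)

assert voter.transform(X).shape == (4, 6)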
705
venv/lib/python3.11/site-packages/sklearn/ensemble/tests/test_weight_boosting.py
Executable file
@ -0,0 +1,705 @@
"""Testing for the boost module (sklearn.ensemble.boost)."""

import re

import numpy as np
import pytest

from sklearn import datasets
from sklearn.base import BaseEstimator, clone
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
from sklearn.ensemble._weight_boosting import _samme_proba
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.utils import shuffle
from sklearn.utils._mocking import NoSampleWeightWrapper
from sklearn.utils._testing import (
    assert_allclose,
    assert_array_almost_equal,
    assert_array_equal,
    assert_array_less,
)
from sklearn.utils.fixes import (
    COO_CONTAINERS,
    CSC_CONTAINERS,
    CSR_CONTAINERS,
    DOK_CONTAINERS,
    LIL_CONTAINERS,
)

# Common random state
rng = np.random.RandomState(0)

# Toy sample
X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
y_class = ["foo", "foo", "foo", 1, 1, 1]  # test string class labels
y_regr = [-1, -1, -1, 1, 1, 1]
T = [[-1, -1], [2, 2], [3, 2]]
y_t_class = ["foo", 1, 1]
y_t_regr = [-1, 1, 1]

# Load the iris dataset and randomly permute it
iris = datasets.load_iris()
perm = rng.permutation(iris.target.size)
iris.data, iris.target = shuffle(iris.data, iris.target, random_state=rng)

# Load the diabetes dataset and randomly permute it
diabetes = datasets.load_diabetes()
diabetes.data, diabetes.target = shuffle(
    diabetes.data, diabetes.target, random_state=rng
)


def test_samme_proba():
    # Test the `_samme_proba` helper function.

    # Define some example (bad) `predict_proba` output.
    probs = np.array(
        [[1, 1e-6, 0], [0.19, 0.6, 0.2], [-999, 0.51, 0.5], [1e-6, 1, 1e-9]]
    )
    probs /= np.abs(probs.sum(axis=1))[:, np.newaxis]

    # _samme_proba calls estimator.predict_proba.
    # Make a mock object so I can control what gets returned.
    class MockEstimator:
        def predict_proba(self, X):
            assert_array_equal(X.shape, probs.shape)
            return probs

    mock = MockEstimator()

    samme_proba = _samme_proba(mock, 3, np.ones_like(probs))

    assert_array_equal(samme_proba.shape, probs.shape)
    assert np.isfinite(samme_proba).all()

    # Make sure that the correct elements come out as smallest --
    # `_samme_proba` should preserve the ordering in each example.
    assert_array_equal(np.argmin(samme_proba, axis=1), [2, 0, 0, 2])
    assert_array_equal(np.argmax(samme_proba, axis=1), [0, 1, 1, 1])
def test_oneclass_adaboost_proba():
    # Test predict_proba robustness for one class label input.
    # In response to issue #7501
    # https://github.com/scikit-learn/scikit-learn/issues/7501
    y_t = np.ones(len(X))
    clf = AdaBoostClassifier(algorithm="SAMME").fit(X, y_t)
    assert_array_almost_equal(clf.predict_proba(X), np.ones((len(X), 1)))


# TODO(1.6): remove "@pytest.mark.filterwarnings" as SAMME.R will be removed
# and substituted with the SAMME algorithm as a default; also re-write test to
# only consider "SAMME"
@pytest.mark.filterwarnings("ignore:The SAMME.R algorithm")
@pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"])
def test_classification_toy(algorithm):
    # Check classification on a toy dataset.
    clf = AdaBoostClassifier(algorithm=algorithm, random_state=0)
    clf.fit(X, y_class)
    assert_array_equal(clf.predict(T), y_t_class)
    assert_array_equal(np.unique(np.asarray(y_t_class)), clf.classes_)
    assert clf.predict_proba(T).shape == (len(T), 2)
    assert clf.decision_function(T).shape == (len(T),)


def test_regression_toy():
    # Check regression on a toy dataset.
    clf = AdaBoostRegressor(random_state=0)
    clf.fit(X, y_regr)
    assert_array_equal(clf.predict(T), y_t_regr)


# TODO(1.6): remove "@pytest.mark.filterwarnings" as SAMME.R will be removed
# and substituted with the SAMME algorithm as a default; also re-write test to
# only consider "SAMME"
@pytest.mark.filterwarnings("ignore:The SAMME.R algorithm")
def test_iris():
    # Check consistency on dataset iris.
    classes = np.unique(iris.target)
    clf_samme = prob_samme = None

    for alg in ["SAMME", "SAMME.R"]:
        clf = AdaBoostClassifier(algorithm=alg)
        clf.fit(iris.data, iris.target)

        assert_array_equal(classes, clf.classes_)
        proba = clf.predict_proba(iris.data)
        if alg == "SAMME":
            clf_samme = clf
            prob_samme = proba
        assert proba.shape[1] == len(classes)
        assert clf.decision_function(iris.data).shape[1] == len(classes)

        score = clf.score(iris.data, iris.target)
        assert score > 0.9, "Failed with algorithm %s and score = %f" % (alg, score)

        # Check we used multiple estimators
        assert len(clf.estimators_) > 1
        # Check for distinct random states (see issue #7408)
        assert len(set(est.random_state for est in clf.estimators_)) == len(
            clf.estimators_
        )

    # Somewhat hacky regression test: prior to
    # ae7adc880d624615a34bafdb1d75ef67051b8200,
    # predict_proba returned SAMME.R values for SAMME.
    clf_samme.algorithm = "SAMME.R"
    assert_array_less(0, np.abs(clf_samme.predict_proba(iris.data) - prob_samme))


@pytest.mark.parametrize("loss", ["linear", "square", "exponential"])
def test_diabetes(loss):
    # Check consistency on dataset diabetes.
    reg = AdaBoostRegressor(loss=loss, random_state=0)
    reg.fit(diabetes.data, diabetes.target)
    score = reg.score(diabetes.data, diabetes.target)
    assert score > 0.55

    # Check we used multiple estimators
    assert len(reg.estimators_) > 1
    # Check for distinct random states (see issue #7408)
    assert len(set(est.random_state for est in reg.estimators_)) == len(reg.estimators_)
# TODO(1.6): remove "@pytest.mark.filterwarnings" as SAMME.R will be removed
# and substituted with the SAMME algorithm as a default; also re-write test to
# only consider "SAMME"
@pytest.mark.filterwarnings("ignore:The SAMME.R algorithm")
@pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"])
def test_staged_predict(algorithm):
    # Check staged predictions.
    rng = np.random.RandomState(0)
    iris_weights = rng.randint(10, size=iris.target.shape)
    diabetes_weights = rng.randint(10, size=diabetes.target.shape)

    clf = AdaBoostClassifier(algorithm=algorithm, n_estimators=10)
    clf.fit(iris.data, iris.target, sample_weight=iris_weights)

    predictions = clf.predict(iris.data)
    staged_predictions = [p for p in clf.staged_predict(iris.data)]
    proba = clf.predict_proba(iris.data)
    staged_probas = [p for p in clf.staged_predict_proba(iris.data)]
    score = clf.score(iris.data, iris.target, sample_weight=iris_weights)
    staged_scores = [
        s for s in clf.staged_score(iris.data, iris.target, sample_weight=iris_weights)
    ]

    assert len(staged_predictions) == 10
    assert_array_almost_equal(predictions, staged_predictions[-1])
    assert len(staged_probas) == 10
    assert_array_almost_equal(proba, staged_probas[-1])
    assert len(staged_scores) == 10
    assert_array_almost_equal(score, staged_scores[-1])

    # AdaBoost regression
    clf = AdaBoostRegressor(n_estimators=10, random_state=0)
    clf.fit(diabetes.data, diabetes.target, sample_weight=diabetes_weights)

    predictions = clf.predict(diabetes.data)
    staged_predictions = [p for p in clf.staged_predict(diabetes.data)]
    score = clf.score(diabetes.data, diabetes.target, sample_weight=diabetes_weights)
    staged_scores = [
        s
        for s in clf.staged_score(
            diabetes.data, diabetes.target, sample_weight=diabetes_weights
        )
    ]

    assert len(staged_predictions) == 10
    assert_array_almost_equal(predictions, staged_predictions[-1])
    assert len(staged_scores) == 10
    assert_array_almost_equal(score, staged_scores[-1])


def test_gridsearch():
    # Check that base trees can be grid-searched.
    # AdaBoost classification
    boost = AdaBoostClassifier(estimator=DecisionTreeClassifier())
    parameters = {
        "n_estimators": (1, 2),
        "estimator__max_depth": (1, 2),
        "algorithm": ("SAMME", "SAMME.R"),
    }
    clf = GridSearchCV(boost, parameters)
    clf.fit(iris.data, iris.target)

    # AdaBoost regression
    boost = AdaBoostRegressor(estimator=DecisionTreeRegressor(), random_state=0)
    parameters = {"n_estimators": (1, 2), "estimator__max_depth": (1, 2)}
    clf = GridSearchCV(boost, parameters)
    clf.fit(diabetes.data, diabetes.target)


# TODO(1.6): remove "@pytest.mark.filterwarnings" as SAMME.R will be removed
# and substituted with the SAMME algorithm as a default; also re-write test to
# only consider "SAMME"
@pytest.mark.filterwarnings("ignore:The SAMME.R algorithm")
def test_pickle():
    # Check picklability.
    import pickle

    # Adaboost classifier
    for alg in ["SAMME", "SAMME.R"]:
        obj = AdaBoostClassifier(algorithm=alg)
        obj.fit(iris.data, iris.target)
        score = obj.score(iris.data, iris.target)
        s = pickle.dumps(obj)

        obj2 = pickle.loads(s)
        assert type(obj2) == obj.__class__
        score2 = obj2.score(iris.data, iris.target)
        assert score == score2

    # Adaboost regressor
    obj = AdaBoostRegressor(random_state=0)
    obj.fit(diabetes.data, diabetes.target)
    score = obj.score(diabetes.data, diabetes.target)
    s = pickle.dumps(obj)

    obj2 = pickle.loads(s)
    assert type(obj2) == obj.__class__
    score2 = obj2.score(diabetes.data, diabetes.target)
    assert score == score2
# TODO(1.6): remove "@pytest.mark.filterwarnings" as SAMME.R will be removed
# and substituted with the SAMME algorithm as a default; also re-write test to
# only consider "SAMME"
@pytest.mark.filterwarnings("ignore:The SAMME.R algorithm")
def test_importances():
    # Check variable importances.
    X, y = datasets.make_classification(
        n_samples=2000,
        n_features=10,
        n_informative=3,
        n_redundant=0,
        n_repeated=0,
        shuffle=False,
        random_state=1,
    )

    for alg in ["SAMME", "SAMME.R"]:
        clf = AdaBoostClassifier(algorithm=alg)

        clf.fit(X, y)
        importances = clf.feature_importances_

        assert importances.shape[0] == 10
        assert (importances[:3, np.newaxis] >= importances[3:]).all()


def test_adaboost_classifier_sample_weight_error():
    # Test that it gives proper exception on incorrect sample weight.
    clf = AdaBoostClassifier()
    msg = re.escape("sample_weight.shape == (1,), expected (6,)")
    with pytest.raises(ValueError, match=msg):
        clf.fit(X, y_class, sample_weight=np.asarray([-1]))


def test_estimator():
    # Test different estimators.
    from sklearn.ensemble import RandomForestClassifier

    # XXX doesn't work with y_class because RF doesn't support classes_
    # Shouldn't AdaBoost run a LabelBinarizer?
    clf = AdaBoostClassifier(RandomForestClassifier(), algorithm="SAMME")
    clf.fit(X, y_regr)

    clf = AdaBoostClassifier(SVC(), algorithm="SAMME")
    clf.fit(X, y_class)

    from sklearn.ensemble import RandomForestRegressor

    clf = AdaBoostRegressor(RandomForestRegressor(), random_state=0)
    clf.fit(X, y_regr)

    clf = AdaBoostRegressor(SVR(), random_state=0)
    clf.fit(X, y_regr)

    # Check that an empty discrete ensemble fails in fit, not predict.
    X_fail = [[1, 1], [1, 1], [1, 1], [1, 1]]
    y_fail = ["foo", "bar", 1, 2]
    clf = AdaBoostClassifier(SVC(), algorithm="SAMME")
    with pytest.raises(ValueError, match="worse than random"):
        clf.fit(X_fail, y_fail)


def test_sample_weights_infinite():
    msg = "Sample weights have reached infinite values"
    clf = AdaBoostClassifier(n_estimators=30, learning_rate=23.0, algorithm="SAMME")
    with pytest.warns(UserWarning, match=msg):
        clf.fit(iris.data, iris.target)
@pytest.mark.parametrize(
    "sparse_container, expected_internal_type",
    zip(
        [
            *CSC_CONTAINERS,
            *CSR_CONTAINERS,
            *LIL_CONTAINERS,
            *COO_CONTAINERS,
            *DOK_CONTAINERS,
        ],
        CSC_CONTAINERS + 4 * CSR_CONTAINERS,
    ),
)
def test_sparse_classification(sparse_container, expected_internal_type):
    # Check classification with sparse input.

    class CustomSVC(SVC):
        """SVC variant that records the nature of the training set."""

        def fit(self, X, y, sample_weight=None):
            """Modification of fit that carries the data type for later verification."""
            super().fit(X, y, sample_weight=sample_weight)
            self.data_type_ = type(X)
            return self

    X, y = datasets.make_multilabel_classification(
        n_classes=1, n_samples=15, n_features=5, random_state=42
    )
    # Flatten y to a 1d array
    y = np.ravel(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    X_train_sparse = sparse_container(X_train)
    X_test_sparse = sparse_container(X_test)

    # Trained on sparse format
    sparse_classifier = AdaBoostClassifier(
        estimator=CustomSVC(probability=True),
        random_state=1,
        algorithm="SAMME",
    ).fit(X_train_sparse, y_train)

    # Trained on dense format
    dense_classifier = AdaBoostClassifier(
        estimator=CustomSVC(probability=True),
        random_state=1,
        algorithm="SAMME",
    ).fit(X_train, y_train)

    # predict
    sparse_clf_results = sparse_classifier.predict(X_test_sparse)
    dense_clf_results = dense_classifier.predict(X_test)
    assert_array_equal(sparse_clf_results, dense_clf_results)

    # decision_function
    sparse_clf_results = sparse_classifier.decision_function(X_test_sparse)
    dense_clf_results = dense_classifier.decision_function(X_test)
    assert_array_almost_equal(sparse_clf_results, dense_clf_results)

    # predict_log_proba
    sparse_clf_results = sparse_classifier.predict_log_proba(X_test_sparse)
    dense_clf_results = dense_classifier.predict_log_proba(X_test)
    assert_array_almost_equal(sparse_clf_results, dense_clf_results)

    # predict_proba
    sparse_clf_results = sparse_classifier.predict_proba(X_test_sparse)
    dense_clf_results = dense_classifier.predict_proba(X_test)
    assert_array_almost_equal(sparse_clf_results, dense_clf_results)

    # score
    sparse_clf_results = sparse_classifier.score(X_test_sparse, y_test)
    dense_clf_results = dense_classifier.score(X_test, y_test)
    assert_array_almost_equal(sparse_clf_results, dense_clf_results)

    # staged_decision_function
    sparse_clf_results = sparse_classifier.staged_decision_function(X_test_sparse)
    dense_clf_results = dense_classifier.staged_decision_function(X_test)
    for sparse_clf_res, dense_clf_res in zip(sparse_clf_results, dense_clf_results):
        assert_array_almost_equal(sparse_clf_res, dense_clf_res)

    # staged_predict
    sparse_clf_results = sparse_classifier.staged_predict(X_test_sparse)
    dense_clf_results = dense_classifier.staged_predict(X_test)
    for sparse_clf_res, dense_clf_res in zip(sparse_clf_results, dense_clf_results):
        assert_array_equal(sparse_clf_res, dense_clf_res)

    # staged_predict_proba
    sparse_clf_results = sparse_classifier.staged_predict_proba(X_test_sparse)
    dense_clf_results = dense_classifier.staged_predict_proba(X_test)
    for sparse_clf_res, dense_clf_res in zip(sparse_clf_results, dense_clf_results):
        assert_array_almost_equal(sparse_clf_res, dense_clf_res)

    # staged_score
    sparse_clf_results = sparse_classifier.staged_score(X_test_sparse, y_test)
    dense_clf_results = dense_classifier.staged_score(X_test, y_test)
    for sparse_clf_res, dense_clf_res in zip(sparse_clf_results, dense_clf_results):
        assert_array_equal(sparse_clf_res, dense_clf_res)

    # Verify sparsity of data is maintained during training
    types = [i.data_type_ for i in sparse_classifier.estimators_]

    assert all([t == expected_internal_type for t in types])
@pytest.mark.parametrize(
    "sparse_container, expected_internal_type",
    zip(
        [
            *CSC_CONTAINERS,
            *CSR_CONTAINERS,
            *LIL_CONTAINERS,
            *COO_CONTAINERS,
            *DOK_CONTAINERS,
        ],
        CSC_CONTAINERS + 4 * CSR_CONTAINERS,
    ),
)
def test_sparse_regression(sparse_container, expected_internal_type):
    # Check regression with sparse input.

    class CustomSVR(SVR):
        """SVR variant that records the nature of the training set."""

        def fit(self, X, y, sample_weight=None):
            """Modification of fit that carries the data type for later verification."""
            super().fit(X, y, sample_weight=sample_weight)
            self.data_type_ = type(X)
            return self

    X, y = datasets.make_regression(
        n_samples=15, n_features=50, n_targets=1, random_state=42
    )

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    X_train_sparse = sparse_container(X_train)
    X_test_sparse = sparse_container(X_test)

    # Trained on sparse format
    sparse_regressor = AdaBoostRegressor(estimator=CustomSVR(), random_state=1).fit(
        X_train_sparse, y_train
    )

    # Trained on dense format
    dense_regressor = AdaBoostRegressor(estimator=CustomSVR(), random_state=1).fit(
        X_train, y_train
    )

    # predict
    sparse_regr_results = sparse_regressor.predict(X_test_sparse)
    dense_regr_results = dense_regressor.predict(X_test)
    assert_array_almost_equal(sparse_regr_results, dense_regr_results)

    # staged_predict
    sparse_regr_results = sparse_regressor.staged_predict(X_test_sparse)
    dense_regr_results = dense_regressor.staged_predict(X_test)
    for sparse_regr_res, dense_regr_res in zip(sparse_regr_results, dense_regr_results):
        assert_array_almost_equal(sparse_regr_res, dense_regr_res)

    types = [i.data_type_ for i in sparse_regressor.estimators_]

    assert all([t == expected_internal_type for t in types])


def test_sample_weight_adaboost_regressor():
    """
    AdaBoostRegressor should work without sample_weights in the base estimator.
    The random weighted sampling is done internally in the _boost method in
    AdaBoostRegressor.
    """

    class DummyEstimator(BaseEstimator):
        def fit(self, X, y):
            pass

        def predict(self, X):
            return np.zeros(X.shape[0])

    boost = AdaBoostRegressor(DummyEstimator(), n_estimators=3)
    boost.fit(X, y_regr)
    assert len(boost.estimator_weights_) == len(boost.estimator_errors_)
def test_multidimensional_X():
    """
    Check that the AdaBoost estimators can work with n-dimensional
    data matrices.
    """
    rng = np.random.RandomState(0)

    X = rng.randn(51, 3, 3)
    yc = rng.choice([0, 1], 51)
    yr = rng.randn(51)

    boost = AdaBoostClassifier(
        DummyClassifier(strategy="most_frequent"), algorithm="SAMME"
    )
    boost.fit(X, yc)
    boost.predict(X)
    boost.predict_proba(X)

    boost = AdaBoostRegressor(DummyRegressor())
    boost.fit(X, yr)
    boost.predict(X)


# TODO(1.6): remove "@pytest.mark.filterwarnings" as SAMME.R will be removed
# and substituted with the SAMME algorithm as a default; also re-write test to
# only consider "SAMME"
@pytest.mark.filterwarnings("ignore:The SAMME.R algorithm")
@pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"])
def test_adaboostclassifier_without_sample_weight(algorithm):
    X, y = iris.data, iris.target
    estimator = NoSampleWeightWrapper(DummyClassifier())
    clf = AdaBoostClassifier(estimator=estimator, algorithm=algorithm)
    err_msg = "{} doesn't support sample_weight".format(estimator.__class__.__name__)
    with pytest.raises(ValueError, match=err_msg):
        clf.fit(X, y)


def test_adaboostregressor_sample_weight():
    # Check that giving weights has an influence on the error computed
    # for a weak learner.
    rng = np.random.RandomState(42)
    X = np.linspace(0, 100, num=1000)
    y = (0.8 * X + 0.2) + (rng.rand(X.shape[0]) * 0.0001)
    X = X.reshape(-1, 1)

    # add an arbitrary outlier
    X[-1] *= 10
    y[-1] = 10000

    # random_state=0 ensures that the underlying bootstrap will use the outlier
    regr_no_outlier = AdaBoostRegressor(
        estimator=LinearRegression(), n_estimators=1, random_state=0
    )
    regr_with_weight = clone(regr_no_outlier)
    regr_with_outlier = clone(regr_no_outlier)

    # fit 3 models:
    # - a model containing the outlier
    # - a model without the outlier
    # - a model containing the outlier but with a null sample-weight
    regr_with_outlier.fit(X, y)
    regr_no_outlier.fit(X[:-1], y[:-1])
    sample_weight = np.ones_like(y)
    sample_weight[-1] = 0
    regr_with_weight.fit(X, y, sample_weight=sample_weight)

    score_with_outlier = regr_with_outlier.score(X[:-1], y[:-1])
    score_no_outlier = regr_no_outlier.score(X[:-1], y[:-1])
    score_with_weight = regr_with_weight.score(X[:-1], y[:-1])

    assert score_with_outlier < score_no_outlier
    assert score_with_outlier < score_with_weight
    assert score_no_outlier == pytest.approx(score_with_weight)
# TODO(1.6): remove "@pytest.mark.filterwarnings" as SAMME.R will be removed
# and substituted with the SAMME algorithm as a default; also re-write test to
# only consider "SAMME"
@pytest.mark.filterwarnings("ignore:The SAMME.R algorithm")
@pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"])
def test_adaboost_consistent_predict(algorithm):
    # Check that predict_proba and predict give consistent results.
    # Regression test for:
    # https://github.com/scikit-learn/scikit-learn/issues/14084
    X_train, X_test, y_train, y_test = train_test_split(
        *datasets.load_digits(return_X_y=True), random_state=42
    )
    model = AdaBoostClassifier(algorithm=algorithm, random_state=42)
    model.fit(X_train, y_train)

    assert_array_equal(
        np.argmax(model.predict_proba(X_test), axis=1), model.predict(X_test)
    )


@pytest.mark.parametrize(
    "model, X, y",
    [
        (AdaBoostClassifier(), iris.data, iris.target),
        (AdaBoostRegressor(), diabetes.data, diabetes.target),
    ],
)
def test_adaboost_negative_weight_error(model, X, y):
    sample_weight = np.ones_like(y)
    sample_weight[-1] = -10

    err_msg = "Negative values in data passed to `sample_weight`"
    with pytest.raises(ValueError, match=err_msg):
        model.fit(X, y, sample_weight=sample_weight)


def test_adaboost_numerically_stable_feature_importance_with_small_weights():
    """Check that we don't create NaN feature importances with numerically
    unstable inputs.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/20320
    """
    rng = np.random.RandomState(42)
    X = rng.normal(size=(1000, 10))
    y = rng.choice([0, 1], size=1000)
    sample_weight = np.ones_like(y) * 1e-263
    tree = DecisionTreeClassifier(max_depth=10, random_state=12)
    ada_model = AdaBoostClassifier(
        estimator=tree, n_estimators=20, algorithm="SAMME", random_state=12
    )
    ada_model.fit(X, y, sample_weight=sample_weight)
    assert np.isnan(ada_model.feature_importances_).sum() == 0


# TODO(1.6): remove "@pytest.mark.filterwarnings" as SAMME.R will be removed
# and substituted with the SAMME algorithm as a default; also re-write test to
# only consider "SAMME"
@pytest.mark.filterwarnings("ignore:The SAMME.R algorithm")
@pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"])
def test_adaboost_decision_function(algorithm, global_random_seed):
    """Check that the decision function respects the symmetric constraint for weak
    learners.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/26520
    """
    n_classes = 3
    X, y = datasets.make_classification(
        n_classes=n_classes, n_clusters_per_class=1, random_state=global_random_seed
    )
    clf = AdaBoostClassifier(
        n_estimators=1, random_state=global_random_seed, algorithm=algorithm
    ).fit(X, y)

    y_score = clf.decision_function(X)
    assert_allclose(y_score.sum(axis=1), 0, atol=1e-8)

    if algorithm == "SAMME":
        # With a single learner, we expect to have a decision function in
        # {1, - 1 / (n_classes - 1)}.
        assert set(np.unique(y_score)) == {1, -1 / (n_classes - 1)}

    # We can assert the same for staged_decision_function since we have a single learner
    for y_score in clf.staged_decision_function(X):
        assert_allclose(y_score.sum(axis=1), 0, atol=1e-8)

        if algorithm == "SAMME":
            # With a single learner, we expect to have a decision function in
            # {1, - 1 / (n_classes - 1)}.
            assert set(np.unique(y_score)) == {1, -1 / (n_classes - 1)}

    clf.set_params(n_estimators=5).fit(X, y)

    y_score = clf.decision_function(X)
    assert_allclose(y_score.sum(axis=1), 0, atol=1e-8)

    for y_score in clf.staged_decision_function(X):
        assert_allclose(y_score.sum(axis=1), 0, atol=1e-8)


# TODO(1.6): remove
def test_deprecated_samme_r_algorithm():
    adaboost_clf = AdaBoostClassifier(n_estimators=1)
    with pytest.warns(
        FutureWarning,
        match=re.escape("The SAMME.R algorithm (the default) is deprecated"),
    ):
        adaboost_clf.fit(X, y_class)
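As a closing orientation, a minimal, self-contained sketch (illustrative only, not part of the file above; data and parameter choices are arbitrary) of the staged-prediction behaviour that test_staged_predict checks: the boosted classifier yields one prediction array per fitted boosting round, and the final stage agrees with predict.

import numpy as np
from sklearn.datasets import load_iris
from sklearn.ensemble import AdaBoostClassifier

X, y = load_iris(return_X_y=True)

# algorithm="SAMME" is passed explicitly, matching the non-deprecated path
# these tests use alongside the SAMME.R deprecation checks.
clf = AdaBoostClassifier(n_estimators=10, algorithm="SAMME", random_state=0).fit(X, y)

# One prediction array per fitted boosting round; the last stage matches predict().
stages = list(clf.staged_predict(X))
assert len(stages) == len(clf.estimators_)
assert np.array_equal(stages[-1], clf.predict(X))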