reconnect moved files to git repo

This commit is contained in:
root
2025-08-01 04:33:03 -04:00
commit 5d3c35492d
23190 changed files with 4750716 additions and 0 deletions

View File

@ -0,0 +1,413 @@
import numpy as np
from ...base import is_regressor
from ...preprocessing import LabelEncoder
from ...utils import _safe_indexing
from ...utils._optional_dependencies import check_matplotlib_support
from ...utils._response import _get_response_values
from ...utils._set_output import _get_adapter_from_container
from ...utils.validation import (
_is_arraylike_not_scalar,
_is_pandas_df,
_is_polars_df,
_num_features,
check_is_fitted,
)
def _check_boundary_response_method(estimator, response_method, class_of_interest):
    """Validate the response methods to be used with the fitted estimator.

    Parameters
    ----------
    estimator : object
        Fitted estimator to check.

    response_method : {'auto', 'predict_proba', 'decision_function', 'predict'}
        Specifies whether to use :term:`predict_proba`,
        :term:`decision_function`, :term:`predict` as the target response.
        If set to 'auto', the response method is tried in the following order:
        :term:`decision_function`, :term:`predict_proba`, :term:`predict`.

    class_of_interest : int, float, bool, str or None
        The class considered when plotting the decision. Cannot be None if
        multiclass and `response_method` is 'predict_proba' or
        'decision_function'.

        .. versionadded:: 1.4

    Returns
    -------
    prediction_method : list of str or str
        The name or list of names of the response methods to use.
    """
    if hasattr(estimator, "classes_"):
        classes = estimator.classes_
        # Multi-label / multi-output multi-class targets store an array per
        # output in `classes_`; those estimators are rejected outright.
        if _is_arraylike_not_scalar(classes[0]):
            raise ValueError(
                "Multi-label and multi-output multi-class classifiers are not"
                " supported"
            )
        if len(classes) > 2:
            # Multiclass: only `predict` can be plotted without singling out a
            # class via `class_of_interest`.
            if response_method not in {"auto", "predict"} and class_of_interest is None:
                raise ValueError(
                    "Multiclass classifiers are only supported when `response_method`"
                    " is 'predict' or 'auto'. Else you must provide"
                    " `class_of_interest` to plot the decision boundary of a specific"
                    " class."
                )
            return "predict" if response_method == "auto" else response_method

    # Binary classifiers, regressors and outlier detectors end up here.
    if response_method != "auto":
        return response_method
    if is_regressor(estimator):
        return "predict"
    # Classifier with 'auto': try the response methods in preference order.
    return ["decision_function", "predict_proba", "predict"]
class DecisionBoundaryDisplay:
    """Decisions boundary visualization.

    It is recommended to use
    :func:`~sklearn.inspection.DecisionBoundaryDisplay.from_estimator`
    to create a :class:`DecisionBoundaryDisplay`. All parameters are stored as
    attributes.

    Read more in the :ref:`User Guide <visualizations>`.

    .. versionadded:: 1.1

    Parameters
    ----------
    xx0 : ndarray of shape (grid_resolution, grid_resolution)
        First output of :func:`meshgrid <numpy.meshgrid>`.

    xx1 : ndarray of shape (grid_resolution, grid_resolution)
        Second output of :func:`meshgrid <numpy.meshgrid>`.

    response : ndarray of shape (grid_resolution, grid_resolution)
        Values of the response function.

    xlabel : str, default=None
        Default label to place on x axis.

    ylabel : str, default=None
        Default label to place on y axis.

    Attributes
    ----------
    surface_ : matplotlib `QuadContourSet` or `QuadMesh`
        If `plot_method` is 'contour' or 'contourf', `surface_` is a
        :class:`QuadContourSet <matplotlib.contour.QuadContourSet>`. If
        `plot_method` is 'pcolormesh', `surface_` is a
        :class:`QuadMesh <matplotlib.collections.QuadMesh>`.

    ax_ : matplotlib Axes
        Axes with decision boundary.

    figure_ : matplotlib Figure
        Figure containing the decision boundary.

    See Also
    --------
    DecisionBoundaryDisplay.from_estimator : Plot decision boundary given an estimator.

    Examples
    --------
    >>> import matplotlib.pyplot as plt
    >>> import numpy as np
    >>> from sklearn.datasets import load_iris
    >>> from sklearn.inspection import DecisionBoundaryDisplay
    >>> from sklearn.tree import DecisionTreeClassifier
    >>> iris = load_iris()
    >>> feature_1, feature_2 = np.meshgrid(
    ...     np.linspace(iris.data[:, 0].min(), iris.data[:, 0].max()),
    ...     np.linspace(iris.data[:, 1].min(), iris.data[:, 1].max())
    ... )
    >>> grid = np.vstack([feature_1.ravel(), feature_2.ravel()]).T
    >>> tree = DecisionTreeClassifier().fit(iris.data[:, :2], iris.target)
    >>> y_pred = np.reshape(tree.predict(grid), feature_1.shape)
    >>> display = DecisionBoundaryDisplay(
    ...     xx0=feature_1, xx1=feature_2, response=y_pred
    ... )
    >>> display.plot()
    <...>
    >>> display.ax_.scatter(
    ...     iris.data[:, 0], iris.data[:, 1], c=iris.target, edgecolor="black"
    ... )
    <...>
    >>> plt.show()
    """

    def __init__(self, *, xx0, xx1, response, xlabel=None, ylabel=None):
        self.xx0 = xx0
        self.xx1 = xx1
        self.response = response
        self.xlabel = xlabel
        self.ylabel = ylabel

    def plot(self, plot_method="contourf", ax=None, xlabel=None, ylabel=None, **kwargs):
        """Plot visualization.

        Parameters
        ----------
        plot_method : {'contourf', 'contour', 'pcolormesh'}, default='contourf'
            Plotting method to call when plotting the response. Please refer
            to the following matplotlib documentation for details:
            :func:`contourf <matplotlib.pyplot.contourf>`,
            :func:`contour <matplotlib.pyplot.contour>`,
            :func:`pcolormesh <matplotlib.pyplot.pcolormesh>`.

        ax : Matplotlib axes, default=None
            Axes object to plot on. If `None`, a new figure and axes is
            created.

        xlabel : str, default=None
            Overwrite the x-axis label.

        ylabel : str, default=None
            Overwrite the y-axis label.

        **kwargs : dict
            Additional keyword arguments to be passed to the `plot_method`.

        Returns
        -------
        display: :class:`~sklearn.inspection.DecisionBoundaryDisplay`
            Object that stores computed values.
        """
        check_matplotlib_support("DecisionBoundaryDisplay.plot")
        # Imported lazily so matplotlib stays an optional dependency.
        import matplotlib.pyplot as plt  # noqa

        if plot_method not in ("contourf", "contour", "pcolormesh"):
            raise ValueError(
                "plot_method must be 'contourf', 'contour', or 'pcolormesh'"
            )

        if ax is None:
            _, ax = plt.subplots()

        # Dispatch to the requested matplotlib Axes method (e.g. ax.contourf).
        plot_func = getattr(ax, plot_method)
        self.surface_ = plot_func(self.xx0, self.xx1, self.response, **kwargs)

        # Only set an axis label when one is explicitly passed or the axis is
        # currently unlabeled, so labels already on the axes are preserved.
        if xlabel is not None or not ax.get_xlabel():
            xlabel = self.xlabel if xlabel is None else xlabel
            ax.set_xlabel(xlabel)
        if ylabel is not None or not ax.get_ylabel():
            ylabel = self.ylabel if ylabel is None else ylabel
            ax.set_ylabel(ylabel)

        self.ax_ = ax
        self.figure_ = ax.figure
        return self

    @classmethod
    def from_estimator(
        cls,
        estimator,
        X,
        *,
        grid_resolution=100,
        eps=1.0,
        plot_method="contourf",
        response_method="auto",
        class_of_interest=None,
        xlabel=None,
        ylabel=None,
        ax=None,
        **kwargs,
    ):
        """Plot decision boundary given an estimator.

        Read more in the :ref:`User Guide <visualizations>`.

        Parameters
        ----------
        estimator : object
            Trained estimator used to plot the decision boundary.

        X : {array-like, sparse matrix, dataframe} of shape (n_samples, 2)
            Input data that should be only 2-dimensional.

        grid_resolution : int, default=100
            Number of grid points to use for plotting decision boundary.
            Higher values will make the plot look nicer but be slower to
            render.

        eps : float, default=1.0
            Extends the minimum and maximum values of X for evaluating the
            response function.

        plot_method : {'contourf', 'contour', 'pcolormesh'}, default='contourf'
            Plotting method to call when plotting the response. Please refer
            to the following matplotlib documentation for details:
            :func:`contourf <matplotlib.pyplot.contourf>`,
            :func:`contour <matplotlib.pyplot.contour>`,
            :func:`pcolormesh <matplotlib.pyplot.pcolormesh>`.

        response_method : {'auto', 'predict_proba', 'decision_function', \
                'predict'}, default='auto'
            Specifies whether to use :term:`predict_proba`,
            :term:`decision_function`, :term:`predict` as the target response.
            If set to 'auto', the response method is tried in the following order:
            :term:`decision_function`, :term:`predict_proba`, :term:`predict`.
            For multiclass problems, :term:`predict` is selected when
            `response_method="auto"`.

        class_of_interest : int, float, bool or str, default=None
            The class considered when plotting the decision. If None,
            `estimator.classes_[1]` is considered as the positive class
            for binary classifiers. Must have an explicit value for
            multiclass classifiers when `response_method` is 'predict_proba'
            or 'decision_function'.

            .. versionadded:: 1.4

        xlabel : str, default=None
            The label used for the x-axis. If `None`, an attempt is made to
            extract a label from `X` if it is a dataframe, otherwise an empty
            string is used.

        ylabel : str, default=None
            The label used for the y-axis. If `None`, an attempt is made to
            extract a label from `X` if it is a dataframe, otherwise an empty
            string is used.

        ax : Matplotlib axes, default=None
            Axes object to plot on. If `None`, a new figure and axes is
            created.

        **kwargs : dict
            Additional keyword arguments to be passed to the
            `plot_method`.

        Returns
        -------
        display : :class:`~sklearn.inspection.DecisionBoundaryDisplay`
            Object that stores the result.

        See Also
        --------
        DecisionBoundaryDisplay : Decision boundary visualization.
        sklearn.metrics.ConfusionMatrixDisplay.from_estimator : Plot the
            confusion matrix given an estimator, the data, and the label.
        sklearn.metrics.ConfusionMatrixDisplay.from_predictions : Plot the
            confusion matrix given the true and predicted labels.

        Examples
        --------
        >>> import matplotlib.pyplot as plt
        >>> from sklearn.datasets import load_iris
        >>> from sklearn.linear_model import LogisticRegression
        >>> from sklearn.inspection import DecisionBoundaryDisplay
        >>> iris = load_iris()
        >>> X = iris.data[:, :2]
        >>> classifier = LogisticRegression().fit(X, iris.target)
        >>> disp = DecisionBoundaryDisplay.from_estimator(
        ...     classifier, X, response_method="predict",
        ...     xlabel=iris.feature_names[0], ylabel=iris.feature_names[1],
        ...     alpha=0.5,
        ... )
        >>> disp.ax_.scatter(X[:, 0], X[:, 1], c=iris.target, edgecolor="k")
        <...>
        >>> plt.show()
        """
        check_matplotlib_support(f"{cls.__name__}.from_estimator")
        check_is_fitted(estimator)

        if not grid_resolution > 1:
            raise ValueError(
                "grid_resolution must be greater than 1. Got"
                f" {grid_resolution} instead."
            )

        if not eps >= 0:
            raise ValueError(
                f"eps must be greater than or equal to 0. Got {eps} instead."
            )

        possible_plot_methods = ("contourf", "contour", "pcolormesh")
        if plot_method not in possible_plot_methods:
            available_methods = ", ".join(possible_plot_methods)
            raise ValueError(
                f"plot_method must be one of {available_methods}. "
                f"Got {plot_method} instead."
            )

        num_features = _num_features(X)
        if num_features != 2:
            raise ValueError(
                f"n_features must be equal to 2. Got {num_features} instead."
            )

        # Build a regular evaluation grid spanning the data range of each of
        # the two features, extended by `eps` on each side.
        x0, x1 = _safe_indexing(X, 0, axis=1), _safe_indexing(X, 1, axis=1)

        x0_min, x0_max = x0.min() - eps, x0.max() + eps
        x1_min, x1_max = x1.min() - eps, x1.max() + eps

        xx0, xx1 = np.meshgrid(
            np.linspace(x0_min, x0_max, grid_resolution),
            np.linspace(x1_min, x1_max, grid_resolution),
        )

        X_grid = np.c_[xx0.ravel(), xx1.ravel()]
        if _is_pandas_df(X) or _is_polars_df(X):
            # Wrap the grid in the same dataframe container as `X` so the
            # estimator does not warn about missing feature names.
            adapter = _get_adapter_from_container(X)
            X_grid = adapter.create_container(
                X_grid,
                X_grid,
                columns=X.columns,
            )

        prediction_method = _check_boundary_response_method(
            estimator, response_method, class_of_interest
        )
        try:
            response, _, response_method_used = _get_response_values(
                estimator,
                X_grid,
                response_method=prediction_method,
                pos_label=class_of_interest,
                return_response_method_used=True,
            )
        except ValueError as exc:
            if "is not a valid label" in str(exc):
                # re-raise a more informative error message since `pos_label` is unknown
                # to our user when interacting with
                # `DecisionBoundaryDisplay.from_estimator`
                raise ValueError(
                    f"class_of_interest={class_of_interest} is not a valid label: It "
                    f"should be one of {estimator.classes_}"
                ) from exc
            raise

        # convert classes predictions into integers
        if response_method_used == "predict" and hasattr(estimator, "classes_"):
            encoder = LabelEncoder()
            encoder.classes_ = estimator.classes_
            response = encoder.transform(response)

        if response.ndim != 1:
            if is_regressor(estimator):
                raise ValueError("Multi-output regressors are not supported")

            # For the multiclass case, `_get_response_values` returns the response
            # as-is. Thus, we have a column per class and we need to select the column
            # corresponding to the positive class.
            col_idx = np.flatnonzero(estimator.classes_ == class_of_interest)[0]
            response = response[:, col_idx]

        # Fall back to dataframe column names for the axis labels when available.
        if xlabel is None:
            xlabel = X.columns[0] if hasattr(X, "columns") else ""
        if ylabel is None:
            ylabel = X.columns[1] if hasattr(X, "columns") else ""

        display = cls(
            xx0=xx0,
            xx1=xx1,
            response=response.reshape(xx0.shape),
            xlabel=xlabel,
            ylabel=ylabel,
        )
        return display.plot(ax=ax, plot_method=plot_method, **kwargs)

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,613 @@
import warnings
import numpy as np
import pytest
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.datasets import (
load_diabetes,
load_iris,
make_classification,
make_multilabel_classification,
)
from sklearn.ensemble import IsolationForest
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.inspection._plot.decision_boundary import _check_boundary_response_method
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import scale
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.utils._testing import (
_convert_container,
assert_allclose,
assert_array_equal,
)
# TODO: Remove when https://github.com/numpy/numpy/issues/14397 is resolved
pytestmark = pytest.mark.filterwarnings(
    "ignore:In future, it will be an error for 'np.bool_':DeprecationWarning:"
    "matplotlib.*"
)

# Shared 2-feature binary classification dataset reused by most tests below.
X, y = make_classification(
    n_informative=1,
    n_redundant=1,
    n_clusters_per_class=1,
    n_features=2,
    random_state=42,
)
def load_iris_2d_scaled():
    """Return the standardized iris data restricted to its first two features."""
    features, target = load_iris(return_X_y=True)
    return scale(features)[:, :2], target
@pytest.fixture(scope="module")
def fitted_clf():
    """Module-scoped logistic regression fitted on the shared toy dataset."""
    clf = LogisticRegression()
    return clf.fit(X, y)
def test_input_data_dimension(pyplot):
    """Check that we raise an error when `X` does not have exactly 2 features."""
    X_4d, y_4d = make_classification(n_samples=10, n_features=4, random_state=0)
    estimator = LogisticRegression().fit(X_4d, y_4d)

    expected_msg = "n_features must be equal to 2. Got 4 instead."
    with pytest.raises(ValueError, match=expected_msg):
        DecisionBoundaryDisplay.from_estimator(estimator=estimator, X=X_4d)
def test_check_boundary_response_method_error():
    """Check that we raise an error for the cases not supported by
    `_check_boundary_response_method`.
    """

    class MultiLabelClassifier:
        classes_ = [np.array([0, 1]), np.array([0, 1])]

    expected = "Multi-label and multi-output multi-class classifiers are not supported"
    with pytest.raises(ValueError, match=expected):
        _check_boundary_response_method(MultiLabelClassifier(), "predict", None)

    class MulticlassClassifier:
        classes_ = [0, 1, 2]

    expected = "Multiclass classifiers are only supported when `response_method` is"
    for requested_method in ("predict_proba", "decision_function"):
        with pytest.raises(ValueError, match=expected):
            _check_boundary_response_method(
                MulticlassClassifier(), requested_method, None
            )
@pytest.mark.parametrize(
    "estimator, response_method, class_of_interest, expected_prediction_method",
    [
        (DecisionTreeRegressor(), "predict", None, "predict"),
        (DecisionTreeRegressor(), "auto", None, "predict"),
        (LogisticRegression().fit(*load_iris_2d_scaled()), "predict", None, "predict"),
        (LogisticRegression().fit(*load_iris_2d_scaled()), "auto", None, "predict"),
        (
            LogisticRegression().fit(*load_iris_2d_scaled()),
            "predict_proba",
            0,
            "predict_proba",
        ),
        (
            LogisticRegression().fit(*load_iris_2d_scaled()),
            "decision_function",
            0,
            "decision_function",
        ),
        (
            LogisticRegression().fit(X, y),
            "auto",
            None,
            ["decision_function", "predict_proba", "predict"],
        ),
        (LogisticRegression().fit(X, y), "predict", None, "predict"),
        (
            LogisticRegression().fit(X, y),
            ["predict_proba", "decision_function"],
            None,
            ["predict_proba", "decision_function"],
        ),
    ],
)
def test_check_boundary_response_method(
    estimator, response_method, class_of_interest, expected_prediction_method
):
    """Check the behaviour of `_check_boundary_response_method` for the supported
    cases.
    """
    assert (
        _check_boundary_response_method(estimator, response_method, class_of_interest)
        == expected_prediction_method
    )
@pytest.mark.parametrize("response_method", ["predict_proba", "decision_function"])
def test_multiclass_error(pyplot, response_method):
    """Check multiclass errors."""
    X_mc, y_mc = make_classification(n_classes=3, n_informative=3, random_state=0)
    X_mc = X_mc[:, [0, 1]]
    clf = LogisticRegression().fit(X_mc, y_mc)

    expected_msg = (
        "Multiclass classifiers are only supported when `response_method` is 'predict'"
        " or 'auto'"
    )
    with pytest.raises(ValueError, match=expected_msg):
        DecisionBoundaryDisplay.from_estimator(
            clf, X_mc, response_method=response_method
        )
@pytest.mark.parametrize("response_method", ["auto", "predict"])
def test_multiclass(pyplot, response_method):
    """Check multiclass gives expected results."""
    grid_resolution = 10
    eps = 1.0
    X_mc, y_mc = make_classification(n_classes=3, n_informative=3, random_state=0)
    X_mc = X_mc[:, [0, 1]]
    clf = LogisticRegression(random_state=0).fit(X_mc, y_mc)

    disp = DecisionBoundaryDisplay.from_estimator(
        clf,
        X_mc,
        response_method=response_method,
        grid_resolution=grid_resolution,
        eps=eps,
    )

    # Rebuild the evaluation grid exactly as `from_estimator` does internally.
    xx0, xx1 = np.meshgrid(
        np.linspace(X_mc[:, 0].min() - eps, X_mc[:, 0].max() + eps, grid_resolution),
        np.linspace(X_mc[:, 1].min() - eps, X_mc[:, 1].max() + eps, grid_resolution),
    )
    expected_response = clf.predict(np.c_[xx0.ravel(), xx1.ravel()])

    assert_allclose(disp.xx0, xx0)
    assert_allclose(disp.xx1, xx1)
    assert_allclose(disp.response, expected_response.reshape(xx0.shape))
@pytest.mark.parametrize(
    "kwargs, error_msg",
    [
        (
            {"plot_method": "hello_world"},
            r"plot_method must be one of contourf, contour, pcolormesh. Got hello_world"
            r" instead.",
        ),
        (
            {"grid_resolution": 1},
            r"grid_resolution must be greater than 1. Got 1 instead",
        ),
        (
            {"grid_resolution": -1},
            r"grid_resolution must be greater than 1. Got -1 instead",
        ),
        ({"eps": -1.1}, r"eps must be greater than or equal to 0. Got -1.1 instead"),
    ],
)
def test_input_validation_errors(pyplot, kwargs, error_msg, fitted_clf):
    """Check input validation from_estimator."""
    # Each invalid keyword must raise a ValueError whose message matches the
    # expected regex.
    with pytest.raises(ValueError, match=error_msg):
        DecisionBoundaryDisplay.from_estimator(fitted_clf, X, **kwargs)
def test_display_plot_input_error(pyplot, fitted_clf):
    """Check input validation for `plot`."""
    display = DecisionBoundaryDisplay.from_estimator(fitted_clf, X, grid_resolution=5)

    with pytest.raises(ValueError, match="plot_method must be 'contourf'"):
        display.plot(plot_method="hello_world")
@pytest.mark.parametrize(
    "response_method", ["auto", "predict", "predict_proba", "decision_function"]
)
@pytest.mark.parametrize("plot_method", ["contourf", "contour"])
def test_decision_boundary_display_classifier(
    pyplot, fitted_clf, response_method, plot_method
):
    """Check that decision boundary is correct."""
    fig, ax = pyplot.subplots()
    eps = 2.0
    disp = DecisionBoundaryDisplay.from_estimator(
        fitted_clf,
        X,
        grid_resolution=5,
        response_method=response_method,
        plot_method=plot_method,
        eps=eps,
        ax=ax,
    )
    assert isinstance(disp.surface_, pyplot.matplotlib.contour.QuadContourSet)
    assert disp.ax_ == ax
    assert disp.figure_ == fig

    # The evaluation grid must span the data range extended by `eps` per side.
    for grid_axis, feature in ((disp.xx0, X[:, 0]), (disp.xx1, X[:, 1])):
        assert grid_axis.min() == pytest.approx(feature.min() - eps)
        assert grid_axis.max() == pytest.approx(feature.max() + eps)

    # Plot again with another method: the stored grid/response are reused.
    fig2, ax2 = pyplot.subplots()
    disp.plot(plot_method="pcolormesh", ax=ax2, shading="auto")
    assert isinstance(disp.surface_, pyplot.matplotlib.collections.QuadMesh)
    assert disp.ax_ == ax2
    assert disp.figure_ == fig2
@pytest.mark.parametrize("response_method", ["auto", "predict", "decision_function"])
@pytest.mark.parametrize("plot_method", ["contourf", "contour"])
def test_decision_boundary_display_outlier_detector(
    pyplot, response_method, plot_method
):
    """Check that decision boundary is correct for outlier detector."""
    fig, ax = pyplot.subplots()
    eps = 2.0
    detector = IsolationForest(random_state=0).fit(X, y)
    disp = DecisionBoundaryDisplay.from_estimator(
        detector,
        X,
        grid_resolution=5,
        response_method=response_method,
        plot_method=plot_method,
        eps=eps,
        ax=ax,
    )
    assert isinstance(disp.surface_, pyplot.matplotlib.contour.QuadContourSet)
    assert disp.ax_ == ax
    assert disp.figure_ == fig

    # The evaluation grid must span the data range extended by `eps` per side.
    for grid_axis, feature in ((disp.xx0, X[:, 0]), (disp.xx1, X[:, 1])):
        assert grid_axis.min() == pytest.approx(feature.min() - eps)
        assert grid_axis.max() == pytest.approx(feature.max() + eps)
@pytest.mark.parametrize("response_method", ["auto", "predict"])
@pytest.mark.parametrize("plot_method", ["contourf", "contour"])
def test_decision_boundary_display_regressor(pyplot, response_method, plot_method):
    """Check that we can display the decision boundary for a regressor."""
    X_reg, y_reg = load_diabetes(return_X_y=True)
    X_reg = X_reg[:, :2]
    regressor = DecisionTreeRegressor().fit(X_reg, y_reg)

    fig, ax = pyplot.subplots()
    eps = 2.0
    disp = DecisionBoundaryDisplay.from_estimator(
        regressor,
        X_reg,
        response_method=response_method,
        ax=ax,
        eps=eps,
        plot_method=plot_method,
    )
    assert isinstance(disp.surface_, pyplot.matplotlib.contour.QuadContourSet)
    assert disp.ax_ == ax
    assert disp.figure_ == fig

    # The evaluation grid must span the data range extended by `eps` per side.
    for grid_axis, feature in ((disp.xx0, X_reg[:, 0]), (disp.xx1, X_reg[:, 1])):
        assert grid_axis.min() == pytest.approx(feature.min() - eps)
        assert grid_axis.max() == pytest.approx(feature.max() + eps)

    # Plot again with another method: the stored grid/response are reused.
    fig2, ax2 = pyplot.subplots()
    disp.plot(plot_method="pcolormesh", ax=ax2, shading="auto")
    assert isinstance(disp.surface_, pyplot.matplotlib.collections.QuadMesh)
    assert disp.ax_ == ax2
    assert disp.figure_ == fig2
@pytest.mark.parametrize(
    "response_method, msg",
    [
        (
            "predict_proba",
            "MyClassifier has none of the following attributes: predict_proba",
        ),
        (
            "decision_function",
            "MyClassifier has none of the following attributes: decision_function",
        ),
        (
            "auto",
            (
                "MyClassifier has none of the following attributes: decision_function, "
                "predict_proba, predict"
            ),
        ),
        (
            "bad_method",
            "MyClassifier has none of the following attributes: bad_method",
        ),
    ],
)
def test_error_bad_response(pyplot, response_method, msg):
    """Check errors for bad response."""

    # Minimal classifier exposing no response method at all; the expected
    # error messages reference this class name, so it must stay `MyClassifier`.
    class MyClassifier(BaseEstimator, ClassifierMixin):
        def fit(self, X, y):
            self.fitted_ = True
            self.classes_ = [0, 1]
            return self

    estimator = MyClassifier().fit(X, y)

    with pytest.raises(AttributeError, match=msg):
        DecisionBoundaryDisplay.from_estimator(
            estimator, X, response_method=response_method
        )
@pytest.mark.parametrize("response_method", ["auto", "predict", "predict_proba"])
def test_multilabel_classifier_error(pyplot, response_method):
    """Check that multilabel classifier raises correct error."""
    X_ml, y_ml = make_multilabel_classification(random_state=0)
    X_ml = X_ml[:, :2]
    estimator = DecisionTreeClassifier().fit(X_ml, y_ml)

    expected_msg = (
        "Multi-label and multi-output multi-class classifiers are not supported"
    )
    with pytest.raises(ValueError, match=expected_msg):
        DecisionBoundaryDisplay.from_estimator(
            estimator,
            X_ml,
            response_method=response_method,
        )
@pytest.mark.parametrize("response_method", ["auto", "predict", "predict_proba"])
def test_multi_output_multi_class_classifier_error(pyplot, response_method):
    """Check that multi-output multi-class classifier raises correct error."""
    X_mo = np.asarray([[0, 1], [1, 2]])
    y_mo = np.asarray([["tree", "cat"], ["cat", "tree"]])
    estimator = DecisionTreeClassifier().fit(X_mo, y_mo)

    expected_msg = (
        "Multi-label and multi-output multi-class classifiers are not supported"
    )
    with pytest.raises(ValueError, match=expected_msg):
        DecisionBoundaryDisplay.from_estimator(
            estimator,
            X_mo,
            response_method=response_method,
        )
def test_multioutput_regressor_error(pyplot):
    """Check that multioutput regressor raises correct error."""
    X_mo = np.asarray([[0, 1], [1, 2]])
    y_mo = np.asarray([[0, 1], [4, 1]])
    regressor = DecisionTreeRegressor().fit(X_mo, y_mo)

    with pytest.raises(ValueError, match="Multi-output regressors are not supported"):
        DecisionBoundaryDisplay.from_estimator(
            regressor, X_mo, response_method="predict"
        )
@pytest.mark.parametrize(
    "response_method",
    ["predict_proba", "decision_function", ["predict_proba", "predict"]],
)
def test_regressor_unsupported_response(pyplot, response_method):
    """Check that requesting a classifier-only `response_method` for a regressor
    raises an informative error.

    The docstring previously claimed this test checked that the boundary could
    be displayed; it actually asserts the ValueError raised for unsupported
    response methods.
    """
    X, y = load_diabetes(return_X_y=True)
    X = X[:, :2]
    tree = DecisionTreeRegressor().fit(X, y)
    err_msg = "should either be a classifier to be used with response_method"
    with pytest.raises(ValueError, match=err_msg):
        DecisionBoundaryDisplay.from_estimator(tree, X, response_method=response_method)
@pytest.mark.filterwarnings(
    # We expect to raise the following warning because the classifier is fit on a
    # NumPy array
    "ignore:X has feature names, but LogisticRegression was fitted without"
)
def test_dataframe_labels_used(pyplot, fitted_clf):
    """Check that column names are used for pandas."""
    pd = pytest.importorskip("pandas")
    df = pd.DataFrame(X, columns=["col_x", "col_y"])

    # pandas column names are used by default
    _, ax = pyplot.subplots()
    disp = DecisionBoundaryDisplay.from_estimator(fitted_clf, df, ax=ax)
    assert ax.get_xlabel() == "col_x"
    assert ax.get_ylabel() == "col_y"

    # second call to plot will have the names
    fig, ax = pyplot.subplots()
    disp.plot(ax=ax)
    assert ax.get_xlabel() == "col_x"
    assert ax.get_ylabel() == "col_y"

    # axes with a label will not get overridden
    fig, ax = pyplot.subplots()
    ax.set(xlabel="hello", ylabel="world")
    disp.plot(ax=ax)
    assert ax.get_xlabel() == "hello"
    assert ax.get_ylabel() == "world"

    # labels get overridden only if provided to the `plot` method
    disp.plot(ax=ax, xlabel="overwritten_x", ylabel="overwritten_y")
    assert ax.get_xlabel() == "overwritten_x"
    assert ax.get_ylabel() == "overwritten_y"

    # labels do not get inferred if provided to `from_estimator`
    _, ax = pyplot.subplots()
    disp = DecisionBoundaryDisplay.from_estimator(
        fitted_clf, df, ax=ax, xlabel="overwritten_x", ylabel="overwritten_y"
    )
    assert ax.get_xlabel() == "overwritten_x"
    assert ax.get_ylabel() == "overwritten_y"
def test_string_target(pyplot):
    """Check that decision boundary works with classifiers trained on string labels."""
    iris = load_iris()
    X_iris = iris.data[:, [0, 1]]

    # Train on the class names (strings) instead of the integer targets.
    y_str = iris.target_names[iris.target]
    clf = LogisticRegression().fit(X_iris, y_str)

    # Should not raise.
    DecisionBoundaryDisplay.from_estimator(
        clf,
        X_iris,
        grid_resolution=5,
        response_method="predict",
    )
@pytest.mark.parametrize("constructor_name", ["pandas", "polars"])
def test_dataframe_support(pyplot, constructor_name):
    """Check that passing a dataframe at fit and to the Display does not
    raise warnings.

    Non-regression test for:
    * https://github.com/scikit-learn/scikit-learn/issues/23311
    * https://github.com/scikit-learn/scikit-learn/issues/28717
    """
    frame = _convert_container(
        X, constructor_name=constructor_name, columns_name=["col_x", "col_y"]
    )
    clf = LogisticRegression().fit(frame, y)

    with warnings.catch_warnings():
        # Promote any feature-name validation warning to an error.
        warnings.simplefilter("error", UserWarning)
        DecisionBoundaryDisplay.from_estimator(clf, frame, response_method="predict")
@pytest.mark.parametrize("response_method", ["predict_proba", "decision_function"])
def test_class_of_interest_binary(pyplot, response_method):
    """Check the behaviour of passing `class_of_interest` for plotting the output of
    `predict_proba` and `decision_function` in the binary case.
    """
    iris = load_iris()
    X_bin = iris.data[:100, :2]
    y_bin = iris.target[:100]
    assert_array_equal(np.unique(y_bin), [0, 1])

    clf = LogisticRegression().fit(X_bin, y_bin)

    # `class_of_interest=None` must be equivalent to targeting the positive
    # class `clf.classes_[1]`.
    disp_none = DecisionBoundaryDisplay.from_estimator(
        clf,
        X_bin,
        response_method=response_method,
        class_of_interest=None,
    )
    disp_positive = DecisionBoundaryDisplay.from_estimator(
        clf,
        X_bin,
        response_method=response_method,
        class_of_interest=clf.classes_[1],
    )
    assert_allclose(disp_none.response, disp_positive.response)

    # Targeting the other class flips the response: 1 - p(y=1|x) for
    # `predict_proba` and the negated value for `decision_function`.
    disp_negative = DecisionBoundaryDisplay.from_estimator(
        clf,
        X_bin,
        response_method=response_method,
        class_of_interest=clf.classes_[0],
    )
    if response_method == "predict_proba":
        assert_allclose(disp_none.response, 1 - disp_negative.response)
    else:
        assert response_method == "decision_function"
        assert_allclose(disp_none.response, -disp_negative.response)
@pytest.mark.parametrize("response_method", ["predict_proba", "decision_function"])
def test_class_of_interest_multiclass(pyplot, response_method):
    """Check the behaviour of passing `class_of_interest` for plotting the output of
    `predict_proba` and `decision_function` in the multiclass case.
    """
    iris = load_iris()
    X = iris.data[:, :2]
    y = iris.target  # the target are numerical labels
    class_of_interest_idx = 2

    estimator = LogisticRegression().fit(X, y)
    disp = DecisionBoundaryDisplay.from_estimator(
        estimator,
        X,
        response_method=response_method,
        class_of_interest=class_of_interest_idx,
    )

    # we will check that we plot the expected values as response
    grid = np.concatenate([disp.xx0.reshape(-1, 1), disp.xx1.reshape(-1, 1)], axis=1)
    response = getattr(estimator, response_method)(grid)[:, class_of_interest_idx]
    assert_allclose(response.reshape(*disp.response.shape), disp.response)

    # make the same test but this time using target as strings
    y = iris.target_names[iris.target]
    estimator = LogisticRegression().fit(X, y)

    disp = DecisionBoundaryDisplay.from_estimator(
        estimator,
        X,
        response_method=response_method,
        class_of_interest=iris.target_names[class_of_interest_idx],
    )

    grid = np.concatenate([disp.xx0.reshape(-1, 1), disp.xx1.reshape(-1, 1)], axis=1)
    response = getattr(estimator, response_method)(grid)[:, class_of_interest_idx]
    assert_allclose(response.reshape(*disp.response.shape), disp.response)

    # check that we raise an error for unknown labels
    # this test should already be handled in `_get_response_values` but we can have this
    # test here as well
    # NOTE: `estimator` is now fitted on string labels, so the integer index 2
    # is no longer a valid label here.
    err_msg = "class_of_interest=2 is not a valid label: It should be one of"
    with pytest.raises(ValueError, match=err_msg):
        DecisionBoundaryDisplay.from_estimator(
            estimator,
            X,
            response_method=response_method,
            class_of_interest=class_of_interest_idx,
        )

    # TODO: remove this test when we handle multiclass with class_of_interest=None
    # by showing the max of the decision function or the max of the predicted
    # probabilities.
    err_msg = "Multiclass classifiers are only supported"
    with pytest.raises(ValueError, match=err_msg):
        DecisionBoundaryDisplay.from_estimator(
            estimator,
            X,
            response_method=response_method,
            class_of_interest=None,
        )
def test_subclass_named_constructors_return_type_is_subclass(pyplot):
    """Check that named constructors return the correct type when subclassed.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/pull/27675
    """
    estimator = LogisticRegression().fit(X, y)

    class SubclassOfDisplay(DecisionBoundaryDisplay):
        pass

    display = SubclassOfDisplay.from_estimator(estimator=estimator, X=X)
    assert isinstance(display, SubclassOfDisplay)