some new features
This commit is contained in:
@ -0,0 +1,613 @@
|
||||
from itertools import product
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from scipy.sparse import issparse
|
||||
|
||||
from sklearn import config_context, datasets
|
||||
from sklearn.model_selection import ShuffleSplit
|
||||
from sklearn.svm import SVC
|
||||
from sklearn.utils._array_api import yield_namespace_device_dtype_combinations
|
||||
from sklearn.utils._testing import (
|
||||
_array_api_for_tests,
|
||||
_convert_container,
|
||||
assert_allclose,
|
||||
assert_array_almost_equal,
|
||||
assert_array_equal,
|
||||
)
|
||||
from sklearn.utils.estimator_checks import _NotAnArray
|
||||
from sklearn.utils.fixes import (
|
||||
COO_CONTAINERS,
|
||||
CSC_CONTAINERS,
|
||||
CSR_CONTAINERS,
|
||||
DOK_CONTAINERS,
|
||||
LIL_CONTAINERS,
|
||||
)
|
||||
from sklearn.utils.metaestimators import _safe_split
|
||||
from sklearn.utils.multiclass import (
|
||||
_ovr_decision_function,
|
||||
check_classification_targets,
|
||||
class_distribution,
|
||||
is_multilabel,
|
||||
type_of_target,
|
||||
unique_labels,
|
||||
)
|
||||
|
||||
multilabel_explicit_zero = np.array([[0, 1], [1, 0]])
|
||||
multilabel_explicit_zero[:, 0] = 0
|
||||
|
||||
|
||||
def _generate_sparse(
|
||||
data,
|
||||
sparse_containers=tuple(
|
||||
COO_CONTAINERS
|
||||
+ CSC_CONTAINERS
|
||||
+ CSR_CONTAINERS
|
||||
+ DOK_CONTAINERS
|
||||
+ LIL_CONTAINERS
|
||||
),
|
||||
dtypes=(bool, int, np.int8, np.uint8, float, np.float32),
|
||||
):
|
||||
return [
|
||||
sparse_container(data, dtype=dtype)
|
||||
for sparse_container in sparse_containers
|
||||
for dtype in dtypes
|
||||
]
|
||||
|
||||
|
||||
EXAMPLES = {
|
||||
"multilabel-indicator": [
|
||||
# valid when the data is formatted as sparse or dense, identified
|
||||
# by CSR format when the testing takes place
|
||||
*_generate_sparse(
|
||||
np.random.RandomState(42).randint(2, size=(10, 10)),
|
||||
sparse_containers=CSR_CONTAINERS,
|
||||
dtypes=(int,),
|
||||
),
|
||||
[[0, 1], [1, 0]],
|
||||
[[0, 1]],
|
||||
*_generate_sparse(
|
||||
multilabel_explicit_zero, sparse_containers=CSC_CONTAINERS, dtypes=(int,)
|
||||
),
|
||||
*_generate_sparse([[0, 1], [1, 0]]),
|
||||
*_generate_sparse([[0, 0], [0, 0]]),
|
||||
*_generate_sparse([[0, 1]]),
|
||||
# Only valid when data is dense
|
||||
[[-1, 1], [1, -1]],
|
||||
np.array([[-1, 1], [1, -1]]),
|
||||
np.array([[-3, 3], [3, -3]]),
|
||||
_NotAnArray(np.array([[-3, 3], [3, -3]])),
|
||||
],
|
||||
"multiclass": [
|
||||
[1, 0, 2, 2, 1, 4, 2, 4, 4, 4],
|
||||
np.array([1, 0, 2]),
|
||||
np.array([1, 0, 2], dtype=np.int8),
|
||||
np.array([1, 0, 2], dtype=np.uint8),
|
||||
np.array([1, 0, 2], dtype=float),
|
||||
np.array([1, 0, 2], dtype=np.float32),
|
||||
np.array([[1], [0], [2]]),
|
||||
_NotAnArray(np.array([1, 0, 2])),
|
||||
[0, 1, 2],
|
||||
["a", "b", "c"],
|
||||
np.array(["a", "b", "c"]),
|
||||
np.array(["a", "b", "c"], dtype=object),
|
||||
np.array(["a", "b", "c"], dtype=object),
|
||||
],
|
||||
"multiclass-multioutput": [
|
||||
[[1, 0, 2, 2], [1, 4, 2, 4]],
|
||||
[["a", "b"], ["c", "d"]],
|
||||
np.array([[1, 0, 2, 2], [1, 4, 2, 4]]),
|
||||
np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.int8),
|
||||
np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.uint8),
|
||||
np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=float),
|
||||
np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.float32),
|
||||
*_generate_sparse(
|
||||
[[1, 0, 2, 2], [1, 4, 2, 4]],
|
||||
sparse_containers=CSC_CONTAINERS + CSR_CONTAINERS,
|
||||
dtypes=(int, np.int8, np.uint8, float, np.float32),
|
||||
),
|
||||
np.array([["a", "b"], ["c", "d"]]),
|
||||
np.array([["a", "b"], ["c", "d"]]),
|
||||
np.array([["a", "b"], ["c", "d"]], dtype=object),
|
||||
np.array([[1, 0, 2]]),
|
||||
_NotAnArray(np.array([[1, 0, 2]])),
|
||||
],
|
||||
"binary": [
|
||||
[0, 1],
|
||||
[1, 1],
|
||||
[],
|
||||
[0],
|
||||
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1]),
|
||||
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=bool),
|
||||
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.int8),
|
||||
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.uint8),
|
||||
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=float),
|
||||
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.float32),
|
||||
np.array([[0], [1]]),
|
||||
_NotAnArray(np.array([[0], [1]])),
|
||||
[1, -1],
|
||||
[3, 5],
|
||||
["a"],
|
||||
["a", "b"],
|
||||
["abc", "def"],
|
||||
np.array(["abc", "def"]),
|
||||
["a", "b"],
|
||||
np.array(["abc", "def"], dtype=object),
|
||||
],
|
||||
"continuous": [
|
||||
[1e-5],
|
||||
[0, 0.5],
|
||||
np.array([[0], [0.5]]),
|
||||
np.array([[0], [0.5]], dtype=np.float32),
|
||||
],
|
||||
"continuous-multioutput": [
|
||||
np.array([[0, 0.5], [0.5, 0]]),
|
||||
np.array([[0, 0.5], [0.5, 0]], dtype=np.float32),
|
||||
np.array([[0, 0.5]]),
|
||||
*_generate_sparse(
|
||||
[[0, 0.5], [0.5, 0]],
|
||||
sparse_containers=CSC_CONTAINERS + CSR_CONTAINERS,
|
||||
dtypes=(float, np.float32),
|
||||
),
|
||||
*_generate_sparse(
|
||||
[[0, 0.5]],
|
||||
sparse_containers=CSC_CONTAINERS + CSR_CONTAINERS,
|
||||
dtypes=(float, np.float32),
|
||||
),
|
||||
],
|
||||
"unknown": [
|
||||
[[]],
|
||||
np.array([[]], dtype=object),
|
||||
[()],
|
||||
# sequence of sequences that weren't supported even before deprecation
|
||||
np.array([np.array([]), np.array([1, 2, 3])], dtype=object),
|
||||
[np.array([]), np.array([1, 2, 3])],
|
||||
[{1, 2, 3}, {1, 2}],
|
||||
[frozenset([1, 2, 3]), frozenset([1, 2])],
|
||||
# and also confusable as sequences of sequences
|
||||
[{0: "a", 1: "b"}, {0: "a"}],
|
||||
# ndim 0
|
||||
np.array(0),
|
||||
# empty second dimension
|
||||
np.array([[], []]),
|
||||
# 3d
|
||||
np.array([[[0, 1], [2, 3]], [[4, 5], [6, 7]]]),
|
||||
],
|
||||
}
|
||||
|
||||
ARRAY_API_EXAMPLES = {
|
||||
"multilabel-indicator": [
|
||||
np.random.RandomState(42).randint(2, size=(10, 10)),
|
||||
[[0, 1], [1, 0]],
|
||||
[[0, 1]],
|
||||
multilabel_explicit_zero,
|
||||
[[0, 0], [0, 0]],
|
||||
[[-1, 1], [1, -1]],
|
||||
np.array([[-1, 1], [1, -1]]),
|
||||
np.array([[-3, 3], [3, -3]]),
|
||||
_NotAnArray(np.array([[-3, 3], [3, -3]])),
|
||||
],
|
||||
"multiclass": [
|
||||
[1, 0, 2, 2, 1, 4, 2, 4, 4, 4],
|
||||
np.array([1, 0, 2]),
|
||||
np.array([1, 0, 2], dtype=np.int8),
|
||||
np.array([1, 0, 2], dtype=np.uint8),
|
||||
np.array([1, 0, 2], dtype=float),
|
||||
np.array([1, 0, 2], dtype=np.float32),
|
||||
np.array([[1], [0], [2]]),
|
||||
_NotAnArray(np.array([1, 0, 2])),
|
||||
[0, 1, 2],
|
||||
],
|
||||
"multiclass-multioutput": [
|
||||
[[1, 0, 2, 2], [1, 4, 2, 4]],
|
||||
np.array([[1, 0, 2, 2], [1, 4, 2, 4]]),
|
||||
np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.int8),
|
||||
np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.uint8),
|
||||
np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=float),
|
||||
np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.float32),
|
||||
np.array([[1, 0, 2]]),
|
||||
_NotAnArray(np.array([[1, 0, 2]])),
|
||||
],
|
||||
"binary": [
|
||||
[0, 1],
|
||||
[1, 1],
|
||||
[],
|
||||
[0],
|
||||
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1]),
|
||||
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=bool),
|
||||
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.int8),
|
||||
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.uint8),
|
||||
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=float),
|
||||
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.float32),
|
||||
np.array([[0], [1]]),
|
||||
_NotAnArray(np.array([[0], [1]])),
|
||||
[1, -1],
|
||||
[3, 5],
|
||||
],
|
||||
"continuous": [
|
||||
[1e-5],
|
||||
[0, 0.5],
|
||||
np.array([[0], [0.5]]),
|
||||
np.array([[0], [0.5]], dtype=np.float32),
|
||||
],
|
||||
"continuous-multioutput": [
|
||||
np.array([[0, 0.5], [0.5, 0]]),
|
||||
np.array([[0, 0.5], [0.5, 0]], dtype=np.float32),
|
||||
np.array([[0, 0.5]]),
|
||||
],
|
||||
"unknown": [
|
||||
[[]],
|
||||
[()],
|
||||
np.array(0),
|
||||
np.array([[[0, 1], [2, 3]], [[4, 5], [6, 7]]]),
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
NON_ARRAY_LIKE_EXAMPLES = [
|
||||
{1, 2, 3},
|
||||
{0: "a", 1: "b"},
|
||||
{0: [5], 1: [5]},
|
||||
"abc",
|
||||
frozenset([1, 2, 3]),
|
||||
None,
|
||||
]
|
||||
|
||||
MULTILABEL_SEQUENCES = [
|
||||
[[1], [2], [0, 1]],
|
||||
[(), (2), (0, 1)],
|
||||
np.array([[], [1, 2]], dtype="object"),
|
||||
_NotAnArray(np.array([[], [1, 2]], dtype="object")),
|
||||
]
|
||||
|
||||
|
||||
def test_unique_labels():
|
||||
# Empty iterable
|
||||
with pytest.raises(ValueError):
|
||||
unique_labels()
|
||||
|
||||
# Multiclass problem
|
||||
assert_array_equal(unique_labels(range(10)), np.arange(10))
|
||||
assert_array_equal(unique_labels(np.arange(10)), np.arange(10))
|
||||
assert_array_equal(unique_labels([4, 0, 2]), np.array([0, 2, 4]))
|
||||
|
||||
# Multilabel indicator
|
||||
assert_array_equal(
|
||||
unique_labels(np.array([[0, 0, 1], [1, 0, 1], [0, 0, 0]])), np.arange(3)
|
||||
)
|
||||
|
||||
assert_array_equal(unique_labels(np.array([[0, 0, 1], [0, 0, 0]])), np.arange(3))
|
||||
|
||||
# Several arrays passed
|
||||
assert_array_equal(unique_labels([4, 0, 2], range(5)), np.arange(5))
|
||||
assert_array_equal(unique_labels((0, 1, 2), (0,), (2, 1)), np.arange(3))
|
||||
|
||||
# Border line case with binary indicator matrix
|
||||
with pytest.raises(ValueError):
|
||||
unique_labels([4, 0, 2], np.ones((5, 5)))
|
||||
with pytest.raises(ValueError):
|
||||
unique_labels(np.ones((5, 4)), np.ones((5, 5)))
|
||||
|
||||
assert_array_equal(unique_labels(np.ones((4, 5)), np.ones((5, 5))), np.arange(5))
|
||||
|
||||
|
||||
def test_unique_labels_non_specific():
|
||||
# Test unique_labels with a variety of collected examples
|
||||
|
||||
# Smoke test for all supported format
|
||||
for format in ["binary", "multiclass", "multilabel-indicator"]:
|
||||
for y in EXAMPLES[format]:
|
||||
unique_labels(y)
|
||||
|
||||
# We don't support those format at the moment
|
||||
for example in NON_ARRAY_LIKE_EXAMPLES:
|
||||
with pytest.raises(ValueError):
|
||||
unique_labels(example)
|
||||
|
||||
for y_type in [
|
||||
"unknown",
|
||||
"continuous",
|
||||
"continuous-multioutput",
|
||||
"multiclass-multioutput",
|
||||
]:
|
||||
for example in EXAMPLES[y_type]:
|
||||
with pytest.raises(ValueError):
|
||||
unique_labels(example)
|
||||
|
||||
|
||||
def test_unique_labels_mixed_types():
|
||||
# Mix with binary or multiclass and multilabel
|
||||
mix_clf_format = product(
|
||||
EXAMPLES["multilabel-indicator"], EXAMPLES["multiclass"] + EXAMPLES["binary"]
|
||||
)
|
||||
|
||||
for y_multilabel, y_multiclass in mix_clf_format:
|
||||
with pytest.raises(ValueError):
|
||||
unique_labels(y_multiclass, y_multilabel)
|
||||
with pytest.raises(ValueError):
|
||||
unique_labels(y_multilabel, y_multiclass)
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
unique_labels([[1, 2]], [["a", "d"]])
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
unique_labels(["1", 2])
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
unique_labels([["1", 2], [1, 3]])
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
unique_labels([["1", "2"], [2, 3]])
|
||||
|
||||
|
||||
def test_is_multilabel():
|
||||
for group, group_examples in EXAMPLES.items():
|
||||
dense_exp = group == "multilabel-indicator"
|
||||
|
||||
for example in group_examples:
|
||||
# Only mark explicitly defined sparse examples as valid sparse
|
||||
# multilabel-indicators
|
||||
sparse_exp = dense_exp and issparse(example)
|
||||
|
||||
if issparse(example) or (
|
||||
hasattr(example, "__array__")
|
||||
and np.asarray(example).ndim == 2
|
||||
and np.asarray(example).dtype.kind in "biuf"
|
||||
and np.asarray(example).shape[1] > 0
|
||||
):
|
||||
examples_sparse = [
|
||||
sparse_container(example)
|
||||
for sparse_container in (
|
||||
COO_CONTAINERS
|
||||
+ CSC_CONTAINERS
|
||||
+ CSR_CONTAINERS
|
||||
+ DOK_CONTAINERS
|
||||
+ LIL_CONTAINERS
|
||||
)
|
||||
]
|
||||
for exmpl_sparse in examples_sparse:
|
||||
assert sparse_exp == is_multilabel(
|
||||
exmpl_sparse
|
||||
), f"is_multilabel({exmpl_sparse!r}) should be {sparse_exp}"
|
||||
|
||||
# Densify sparse examples before testing
|
||||
if issparse(example):
|
||||
example = example.toarray()
|
||||
|
||||
assert dense_exp == is_multilabel(
|
||||
example
|
||||
), f"is_multilabel({example!r}) should be {dense_exp}"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"array_namespace, device, dtype_name",
|
||||
yield_namespace_device_dtype_combinations(),
|
||||
)
|
||||
def test_is_multilabel_array_api_compliance(array_namespace, device, dtype_name):
|
||||
xp = _array_api_for_tests(array_namespace, device)
|
||||
|
||||
for group, group_examples in ARRAY_API_EXAMPLES.items():
|
||||
dense_exp = group == "multilabel-indicator"
|
||||
for example in group_examples:
|
||||
if np.asarray(example).dtype.kind == "f":
|
||||
example = np.asarray(example, dtype=dtype_name)
|
||||
else:
|
||||
example = np.asarray(example)
|
||||
example = xp.asarray(example, device=device)
|
||||
|
||||
with config_context(array_api_dispatch=True):
|
||||
assert dense_exp == is_multilabel(
|
||||
example
|
||||
), f"is_multilabel({example!r}) should be {dense_exp}"
|
||||
|
||||
|
||||
def test_check_classification_targets():
|
||||
for y_type in EXAMPLES.keys():
|
||||
if y_type in ["unknown", "continuous", "continuous-multioutput"]:
|
||||
for example in EXAMPLES[y_type]:
|
||||
msg = "Unknown label type: "
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
check_classification_targets(example)
|
||||
else:
|
||||
for example in EXAMPLES[y_type]:
|
||||
check_classification_targets(example)
|
||||
|
||||
|
||||
# @ignore_warnings
|
||||
def test_type_of_target():
|
||||
for group, group_examples in EXAMPLES.items():
|
||||
for example in group_examples:
|
||||
assert (
|
||||
type_of_target(example) == group
|
||||
), "type_of_target(%r) should be %r, got %r" % (
|
||||
example,
|
||||
group,
|
||||
type_of_target(example),
|
||||
)
|
||||
|
||||
for example in NON_ARRAY_LIKE_EXAMPLES:
|
||||
msg_regex = r"Expected array-like \(array or non-string sequence\).*"
|
||||
with pytest.raises(ValueError, match=msg_regex):
|
||||
type_of_target(example)
|
||||
|
||||
for example in MULTILABEL_SEQUENCES:
|
||||
msg = (
|
||||
"You appear to be using a legacy multi-label data "
|
||||
"representation. Sequence of sequences are no longer supported;"
|
||||
" use a binary array or sparse matrix instead."
|
||||
)
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
type_of_target(example)
|
||||
|
||||
|
||||
def test_type_of_target_pandas_sparse():
|
||||
pd = pytest.importorskip("pandas")
|
||||
|
||||
y = pd.arrays.SparseArray([1, np.nan, np.nan, 1, np.nan])
|
||||
msg = "y cannot be class 'SparseSeries' or 'SparseArray'"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
type_of_target(y)
|
||||
|
||||
|
||||
def test_type_of_target_pandas_nullable():
|
||||
"""Check that type_of_target works with pandas nullable dtypes."""
|
||||
pd = pytest.importorskip("pandas")
|
||||
|
||||
for dtype in ["Int32", "Float32"]:
|
||||
y_true = pd.Series([1, 0, 2, 3, 4], dtype=dtype)
|
||||
assert type_of_target(y_true) == "multiclass"
|
||||
|
||||
y_true = pd.Series([1, 0, 1, 0], dtype=dtype)
|
||||
assert type_of_target(y_true) == "binary"
|
||||
|
||||
y_true = pd.DataFrame([[1.4, 3.1], [3.1, 1.4]], dtype="Float32")
|
||||
assert type_of_target(y_true) == "continuous-multioutput"
|
||||
|
||||
y_true = pd.DataFrame([[0, 1], [1, 1]], dtype="Int32")
|
||||
assert type_of_target(y_true) == "multilabel-indicator"
|
||||
|
||||
y_true = pd.DataFrame([[1, 2], [3, 1]], dtype="Int32")
|
||||
assert type_of_target(y_true) == "multiclass-multioutput"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"])
|
||||
def test_unique_labels_pandas_nullable(dtype):
|
||||
"""Checks that unique_labels work with pandas nullable dtypes.
|
||||
|
||||
Non-regression test for gh-25634.
|
||||
"""
|
||||
pd = pytest.importorskip("pandas")
|
||||
|
||||
y_true = pd.Series([1, 0, 0, 1, 0, 1, 1, 0, 1], dtype=dtype)
|
||||
y_predicted = pd.Series([0, 0, 1, 1, 0, 1, 1, 1, 1], dtype="int64")
|
||||
|
||||
labels = unique_labels(y_true, y_predicted)
|
||||
assert_array_equal(labels, [0, 1])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
|
||||
def test_class_distribution(csc_container):
|
||||
y = np.array(
|
||||
[
|
||||
[1, 0, 0, 1],
|
||||
[2, 2, 0, 1],
|
||||
[1, 3, 0, 1],
|
||||
[4, 2, 0, 1],
|
||||
[2, 0, 0, 1],
|
||||
[1, 3, 0, 1],
|
||||
]
|
||||
)
|
||||
# Define the sparse matrix with a mix of implicit and explicit zeros
|
||||
data = np.array([1, 2, 1, 4, 2, 1, 0, 2, 3, 2, 3, 1, 1, 1, 1, 1, 1])
|
||||
indices = np.array([0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 5, 0, 1, 2, 3, 4, 5])
|
||||
indptr = np.array([0, 6, 11, 11, 17])
|
||||
y_sp = csc_container((data, indices, indptr), shape=(6, 4))
|
||||
|
||||
classes, n_classes, class_prior = class_distribution(y)
|
||||
classes_sp, n_classes_sp, class_prior_sp = class_distribution(y_sp)
|
||||
classes_expected = [[1, 2, 4], [0, 2, 3], [0], [1]]
|
||||
n_classes_expected = [3, 3, 1, 1]
|
||||
class_prior_expected = [[3 / 6, 2 / 6, 1 / 6], [1 / 3, 1 / 3, 1 / 3], [1.0], [1.0]]
|
||||
|
||||
for k in range(y.shape[1]):
|
||||
assert_array_almost_equal(classes[k], classes_expected[k])
|
||||
assert_array_almost_equal(n_classes[k], n_classes_expected[k])
|
||||
assert_array_almost_equal(class_prior[k], class_prior_expected[k])
|
||||
|
||||
assert_array_almost_equal(classes_sp[k], classes_expected[k])
|
||||
assert_array_almost_equal(n_classes_sp[k], n_classes_expected[k])
|
||||
assert_array_almost_equal(class_prior_sp[k], class_prior_expected[k])
|
||||
|
||||
# Test again with explicit sample weights
|
||||
(classes, n_classes, class_prior) = class_distribution(
|
||||
y, [1.0, 2.0, 1.0, 2.0, 1.0, 2.0]
|
||||
)
|
||||
(classes_sp, n_classes_sp, class_prior_sp) = class_distribution(
|
||||
y, [1.0, 2.0, 1.0, 2.0, 1.0, 2.0]
|
||||
)
|
||||
class_prior_expected = [[4 / 9, 3 / 9, 2 / 9], [2 / 9, 4 / 9, 3 / 9], [1.0], [1.0]]
|
||||
|
||||
for k in range(y.shape[1]):
|
||||
assert_array_almost_equal(classes[k], classes_expected[k])
|
||||
assert_array_almost_equal(n_classes[k], n_classes_expected[k])
|
||||
assert_array_almost_equal(class_prior[k], class_prior_expected[k])
|
||||
|
||||
assert_array_almost_equal(classes_sp[k], classes_expected[k])
|
||||
assert_array_almost_equal(n_classes_sp[k], n_classes_expected[k])
|
||||
assert_array_almost_equal(class_prior_sp[k], class_prior_expected[k])
|
||||
|
||||
|
||||
def test_safe_split_with_precomputed_kernel():
|
||||
clf = SVC()
|
||||
clfp = SVC(kernel="precomputed")
|
||||
|
||||
iris = datasets.load_iris()
|
||||
X, y = iris.data, iris.target
|
||||
K = np.dot(X, X.T)
|
||||
|
||||
cv = ShuffleSplit(test_size=0.25, random_state=0)
|
||||
train, test = list(cv.split(X))[0]
|
||||
|
||||
X_train, y_train = _safe_split(clf, X, y, train)
|
||||
K_train, y_train2 = _safe_split(clfp, K, y, train)
|
||||
assert_array_almost_equal(K_train, np.dot(X_train, X_train.T))
|
||||
assert_array_almost_equal(y_train, y_train2)
|
||||
|
||||
X_test, y_test = _safe_split(clf, X, y, test, train)
|
||||
K_test, y_test2 = _safe_split(clfp, K, y, test, train)
|
||||
assert_array_almost_equal(K_test, np.dot(X_test, X_train.T))
|
||||
assert_array_almost_equal(y_test, y_test2)
|
||||
|
||||
|
||||
def test_ovr_decision_function():
|
||||
# test properties for ovr decision function
|
||||
|
||||
predictions = np.array([[0, 1, 1], [0, 1, 0], [0, 1, 1], [0, 1, 1]])
|
||||
|
||||
confidences = np.array(
|
||||
[[-1e16, 0, -1e16], [1.0, 2.0, -3.0], [-5.0, 2.0, 5.0], [-0.5, 0.2, 0.5]]
|
||||
)
|
||||
|
||||
n_classes = 3
|
||||
|
||||
dec_values = _ovr_decision_function(predictions, confidences, n_classes)
|
||||
|
||||
# check that the decision values are within 0.5 range of the votes
|
||||
votes = np.array([[1, 0, 2], [1, 1, 1], [1, 0, 2], [1, 0, 2]])
|
||||
|
||||
assert_allclose(votes, dec_values, atol=0.5)
|
||||
|
||||
# check that the prediction are what we expect
|
||||
# highest vote or highest confidence if there is a tie.
|
||||
# for the second sample we have a tie (should be won by 1)
|
||||
expected_prediction = np.array([2, 1, 2, 2])
|
||||
assert_array_equal(np.argmax(dec_values, axis=1), expected_prediction)
|
||||
|
||||
# third and fourth sample have the same vote but third sample
|
||||
# has higher confidence, this should reflect on the decision values
|
||||
assert dec_values[2, 2] > dec_values[3, 2]
|
||||
|
||||
# assert subset invariance.
|
||||
dec_values_one = [
|
||||
_ovr_decision_function(
|
||||
np.array([predictions[i]]), np.array([confidences[i]]), n_classes
|
||||
)[0]
|
||||
for i in range(4)
|
||||
]
|
||||
|
||||
assert_allclose(dec_values, dec_values_one, atol=1e-6)
|
||||
|
||||
|
||||
# TODO(1.7): Change to ValueError when byte labels is deprecated.
|
||||
@pytest.mark.parametrize("input_type", ["list", "array"])
|
||||
def test_labels_in_bytes_format(input_type):
|
||||
# check that we raise an error with bytes encoded labels
|
||||
# non-regression test for:
|
||||
# https://github.com/scikit-learn/scikit-learn/issues/16980
|
||||
target = _convert_container([b"a", b"b"], input_type)
|
||||
err_msg = (
|
||||
"Support for labels represented as bytes is deprecated in v1.5 and will"
|
||||
" error in v1.7. Convert the labels to a string or integer format."
|
||||
)
|
||||
with pytest.warns(FutureWarning, match=err_msg):
|
||||
type_of_target(target)
|
||||
Reference in New Issue
Block a user