some new features
This commit is contained in:
204
.venv/lib/python3.12/site-packages/scipy/stats/_crosstab.py
Normal file
204
.venv/lib/python3.12/site-packages/scipy/stats/_crosstab.py
Normal file
@ -0,0 +1,204 @@
|
||||
import numpy as np
|
||||
from scipy.sparse import coo_matrix
|
||||
from scipy._lib._bunch import _make_tuple_bunch
|
||||
|
||||
|
||||
CrosstabResult = _make_tuple_bunch(
|
||||
"CrosstabResult", ["elements", "count"]
|
||||
)
|
||||
|
||||
|
||||
def crosstab(*args, levels=None, sparse=False):
|
||||
"""
|
||||
Return table of counts for each possible unique combination in ``*args``.
|
||||
|
||||
When ``len(args) > 1``, the array computed by this function is
|
||||
often referred to as a *contingency table* [1]_.
|
||||
|
||||
The arguments must be sequences with the same length. The second return
|
||||
value, `count`, is an integer array with ``len(args)`` dimensions. If
|
||||
`levels` is None, the shape of `count` is ``(n0, n1, ...)``, where ``nk``
|
||||
is the number of unique elements in ``args[k]``.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
*args : sequences
|
||||
A sequence of sequences whose unique aligned elements are to be
|
||||
counted. The sequences in args must all be the same length.
|
||||
levels : sequence, optional
|
||||
If `levels` is given, it must be a sequence that is the same length as
|
||||
`args`. Each element in `levels` is either a sequence or None. If it
|
||||
is a sequence, it gives the values in the corresponding sequence in
|
||||
`args` that are to be counted. If any value in the sequences in `args`
|
||||
does not occur in the corresponding sequence in `levels`, that value
|
||||
is ignored and not counted in the returned array `count`. The default
|
||||
value of `levels` for ``args[i]`` is ``np.unique(args[i])``
|
||||
sparse : bool, optional
|
||||
If True, return a sparse matrix. The matrix will be an instance of
|
||||
the `scipy.sparse.coo_matrix` class. Because SciPy's sparse matrices
|
||||
must be 2-d, only two input sequences are allowed when `sparse` is
|
||||
True. Default is False.
|
||||
|
||||
Returns
|
||||
-------
|
||||
res : CrosstabResult
|
||||
An object containing the following attributes:
|
||||
|
||||
elements : tuple of numpy.ndarrays.
|
||||
Tuple of length ``len(args)`` containing the arrays of elements
|
||||
that are counted in `count`. These can be interpreted as the
|
||||
labels of the corresponding dimensions of `count`. If `levels` was
|
||||
given, then if ``levels[i]`` is not None, ``elements[i]`` will
|
||||
hold the values given in ``levels[i]``.
|
||||
count : numpy.ndarray or scipy.sparse.coo_matrix
|
||||
Counts of the unique elements in ``zip(*args)``, stored in an
|
||||
array. Also known as a *contingency table* when ``len(args) > 1``.
|
||||
|
||||
See Also
|
||||
--------
|
||||
numpy.unique
|
||||
|
||||
Notes
|
||||
-----
|
||||
.. versionadded:: 1.7.0
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] "Contingency table", http://en.wikipedia.org/wiki/Contingency_table
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from scipy.stats.contingency import crosstab
|
||||
|
||||
Given the lists `a` and `x`, create a contingency table that counts the
|
||||
frequencies of the corresponding pairs.
|
||||
|
||||
>>> a = ['A', 'B', 'A', 'A', 'B', 'B', 'A', 'A', 'B', 'B']
|
||||
>>> x = ['X', 'X', 'X', 'Y', 'Z', 'Z', 'Y', 'Y', 'Z', 'Z']
|
||||
>>> res = crosstab(a, x)
|
||||
>>> avals, xvals = res.elements
|
||||
>>> avals
|
||||
array(['A', 'B'], dtype='<U1')
|
||||
>>> xvals
|
||||
array(['X', 'Y', 'Z'], dtype='<U1')
|
||||
>>> res.count
|
||||
array([[2, 3, 0],
|
||||
[1, 0, 4]])
|
||||
|
||||
So `('A', 'X')` occurs twice, `('A', 'Y')` occurs three times, etc.
|
||||
|
||||
Higher dimensional contingency tables can be created.
|
||||
|
||||
>>> p = [0, 0, 0, 0, 1, 1, 1, 0, 0, 1]
|
||||
>>> res = crosstab(a, x, p)
|
||||
>>> res.count
|
||||
array([[[2, 0],
|
||||
[2, 1],
|
||||
[0, 0]],
|
||||
[[1, 0],
|
||||
[0, 0],
|
||||
[1, 3]]])
|
||||
>>> res.count.shape
|
||||
(2, 3, 2)
|
||||
|
||||
The values to be counted can be set by using the `levels` argument.
|
||||
It allows the elements of interest in each input sequence to be
|
||||
given explicitly instead finding the unique elements of the sequence.
|
||||
|
||||
For example, suppose one of the arguments is an array containing the
|
||||
answers to a survey question, with integer values 1 to 4. Even if the
|
||||
value 1 does not occur in the data, we want an entry for it in the table.
|
||||
|
||||
>>> q1 = [2, 3, 3, 2, 4, 4, 2, 3, 4, 4, 4, 3, 3, 3, 4] # 1 does not occur.
|
||||
>>> q2 = [4, 4, 2, 2, 2, 4, 1, 1, 2, 2, 4, 2, 2, 2, 4] # 3 does not occur.
|
||||
>>> options = [1, 2, 3, 4]
|
||||
>>> res = crosstab(q1, q2, levels=(options, options))
|
||||
>>> res.count
|
||||
array([[0, 0, 0, 0],
|
||||
[1, 1, 0, 1],
|
||||
[1, 4, 0, 1],
|
||||
[0, 3, 0, 3]])
|
||||
|
||||
If `levels` is given, but an element of `levels` is None, the unique values
|
||||
of the corresponding argument are used. For example,
|
||||
|
||||
>>> res = crosstab(q1, q2, levels=(None, options))
|
||||
>>> res.elements
|
||||
[array([2, 3, 4]), [1, 2, 3, 4]]
|
||||
>>> res.count
|
||||
array([[1, 1, 0, 1],
|
||||
[1, 4, 0, 1],
|
||||
[0, 3, 0, 3]])
|
||||
|
||||
If we want to ignore the pairs where 4 occurs in ``q2``, we can
|
||||
give just the values [1, 2] to `levels`, and the 4 will be ignored:
|
||||
|
||||
>>> res = crosstab(q1, q2, levels=(None, [1, 2]))
|
||||
>>> res.elements
|
||||
[array([2, 3, 4]), [1, 2]]
|
||||
>>> res.count
|
||||
array([[1, 1],
|
||||
[1, 4],
|
||||
[0, 3]])
|
||||
|
||||
Finally, let's repeat the first example, but return a sparse matrix:
|
||||
|
||||
>>> res = crosstab(a, x, sparse=True)
|
||||
>>> res.count
|
||||
<COOrdinate sparse matrix of dtype 'int64'
|
||||
with 4 stored elements and shape (2, 3)>
|
||||
>>> res.count.toarray()
|
||||
array([[2, 3, 0],
|
||||
[1, 0, 4]])
|
||||
|
||||
"""
|
||||
nargs = len(args)
|
||||
if nargs == 0:
|
||||
raise TypeError("At least one input sequence is required.")
|
||||
|
||||
len0 = len(args[0])
|
||||
if not all(len(a) == len0 for a in args[1:]):
|
||||
raise ValueError("All input sequences must have the same length.")
|
||||
|
||||
if sparse and nargs != 2:
|
||||
raise ValueError("When `sparse` is True, only two input sequences "
|
||||
"are allowed.")
|
||||
|
||||
if levels is None:
|
||||
# Call np.unique with return_inverse=True on each argument.
|
||||
actual_levels, indices = zip(*[np.unique(a, return_inverse=True)
|
||||
for a in args])
|
||||
else:
|
||||
# `levels` is not None...
|
||||
if len(levels) != nargs:
|
||||
raise ValueError('len(levels) must equal the number of input '
|
||||
'sequences')
|
||||
|
||||
args = [np.asarray(arg) for arg in args]
|
||||
mask = np.zeros((nargs, len0), dtype=np.bool_)
|
||||
inv = np.zeros((nargs, len0), dtype=np.intp)
|
||||
actual_levels = []
|
||||
for k, (levels_list, arg) in enumerate(zip(levels, args)):
|
||||
if levels_list is None:
|
||||
levels_list, inv[k, :] = np.unique(arg, return_inverse=True)
|
||||
mask[k, :] = True
|
||||
else:
|
||||
q = arg == np.asarray(levels_list).reshape(-1, 1)
|
||||
mask[k, :] = np.any(q, axis=0)
|
||||
qnz = q.T.nonzero()
|
||||
inv[k, qnz[0]] = qnz[1]
|
||||
actual_levels.append(levels_list)
|
||||
|
||||
mask_all = mask.all(axis=0)
|
||||
indices = tuple(inv[:, mask_all])
|
||||
|
||||
if sparse:
|
||||
count = coo_matrix((np.ones(len(indices[0]), dtype=int),
|
||||
(indices[0], indices[1])))
|
||||
count.sum_duplicates()
|
||||
else:
|
||||
shape = [len(u) for u in actual_levels]
|
||||
count = np.zeros(shape, dtype=int)
|
||||
np.add.at(count, indices, 1)
|
||||
|
||||
return CrosstabResult(actual_levels, count)
|
||||
Reference in New Issue
Block a user