some new features
@@ -0,0 +1,197 @@
# Author: Lars Buitinck
# License: BSD 3 clause

from itertools import chain
from numbers import Integral

import numpy as np
import scipy.sparse as sp

from ..base import BaseEstimator, TransformerMixin, _fit_context
from ..utils._param_validation import Interval, StrOptions
from ._hashing_fast import transform as _hashing_transform


def _iteritems(d):
    """Like d.iteritems, but accepts any collections.Mapping."""
    return d.iteritems() if hasattr(d, "iteritems") else d.items()
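
# For example, _iteritems({"dog": 1, "cat": 2}) returns an iterable of the
# pairs ("dog", 1) and ("cat", 2) via dict.items(); mappings that still
# provide a Python 2-style iteritems() method are iterated via that instead.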


class FeatureHasher(TransformerMixin, BaseEstimator):
    """Implements feature hashing, aka the hashing trick.

    This class turns sequences of symbolic feature names (strings) into
    scipy.sparse matrices, using a hash function to compute the matrix column
    corresponding to a name. The hash function employed is the signed 32-bit
    version of Murmurhash3.

    Feature names of type byte string are used as-is. Unicode strings are
    converted to UTF-8 first, but no Unicode normalization is done.
    Feature values must be (finite) numbers.

    This class is a low-memory alternative to DictVectorizer and
    CountVectorizer, intended for large-scale (online) learning and situations
    where memory is tight, e.g. when running prediction code on embedded
    devices.

    For an efficiency comparison of the different feature extractors, see
    :ref:`sphx_glr_auto_examples_text_plot_hashing_vs_dict_vectorizer.py`.

    Read more in the :ref:`User Guide <feature_hashing>`.

    .. versionadded:: 0.13

    Parameters
    ----------
    n_features : int, default=2**20
        The number of features (columns) in the output matrices. Small numbers
        of features are likely to cause hash collisions, but large numbers
        will cause larger coefficient dimensions in linear learners.
    input_type : str, default='dict'
        Choose a string from {'dict', 'pair', 'string'}.
        Either "dict" (the default) to accept dictionaries over
        (feature_name, value); "pair" to accept pairs of (feature_name, value);
        or "string" to accept single strings.
        feature_name should be a string, while value should be a number.
        In the case of "string", a value of 1 is implied.
        The feature_name is hashed to find the appropriate column for the
        feature. The value's sign might be flipped in the output (but see
        alternate_sign, below).
    dtype : numpy dtype, default=np.float64
        The type of feature values. Passed to scipy.sparse matrix constructors
        as the dtype argument. Do not set this to bool, np.bool_ or any
        unsigned integer type.
    alternate_sign : bool, default=True
        When True, an alternating sign is added to the features so as to
        approximately conserve the inner product in the hashed space even for
        small n_features. This approach is similar to sparse random projection.

        .. versionchanged:: 0.19
            ``alternate_sign`` replaces the now deprecated ``non_negative``
            parameter.

    See Also
    --------
    DictVectorizer : Vectorizes string-valued features using a hash table.
    sklearn.preprocessing.OneHotEncoder : Handles nominal/categorical features.

    Notes
    -----
    This estimator is :term:`stateless` and does not need to be fitted.
    However, we recommend calling :meth:`fit_transform` instead of
    :meth:`transform`, as parameter validation is only performed in
    :meth:`fit`.

    Examples
    --------
    >>> from sklearn.feature_extraction import FeatureHasher
    >>> h = FeatureHasher(n_features=10)
    >>> D = [{'dog': 1, 'cat':2, 'elephant':4},{'dog': 2, 'run': 5}]
    >>> f = h.transform(D)
    >>> f.toarray()
    array([[ 0.,  0., -4., -1.,  0.,  0.,  0.,  0.,  0.,  2.],
           [ 0.,  0.,  0., -2., -5.,  0.,  0.,  0.,  0.,  0.]])

    With `input_type="string"`, the input must be an iterable over iterables of
    strings:

    >>> h = FeatureHasher(n_features=8, input_type="string")
    >>> raw_X = [["dog", "cat", "snake"], ["snake", "dog"], ["cat", "bird"]]
    >>> f = h.transform(raw_X)
    >>> f.toarray()
    array([[ 0.,  0.,  0., -1.,  0., -1.,  0.,  1.],
           [ 0.,  0.,  0., -1.,  0., -1.,  0.,  0.],
           [ 0., -1.,  0.,  0.,  0.,  0.,  0.,  1.]])
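
    With `input_type="pair"`, each sample must instead be an iterable of
    `(feature_name, value)` pairs; whatever the input type, the output matrix
    always has shape `(n_samples, n_features)`:

    >>> h = FeatureHasher(n_features=10, input_type="pair")
    >>> f = h.transform([[("dog", 1), ("cat", 2)], [("dog", 2), ("run", 5)]])
    >>> f.shape
    (2, 10)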
    """

    _parameter_constraints: dict = {
        "n_features": [Interval(Integral, 1, np.iinfo(np.int32).max, closed="both")],
        "input_type": [StrOptions({"dict", "pair", "string"})],
        "dtype": "no_validation",  # delegate to numpy
        "alternate_sign": ["boolean"],
    }
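    # The constraints above are enforced when `fit` runs, via the
    # `_fit_context` decorator below; construction itself performs no checks.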

    def __init__(
        self,
        n_features=(2**20),
        *,
        input_type="dict",
        dtype=np.float64,
        alternate_sign=True,
    ):
        self.dtype = dtype
        self.input_type = input_type
        self.n_features = n_features
        self.alternate_sign = alternate_sign

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X=None, y=None):
        """Only validates estimator's parameters.

        This method (i) validates the estimator's parameters and (ii) keeps
        the estimator consistent with the scikit-learn transformer API.

        Parameters
        ----------
        X : Ignored
            Not used, present here for API consistency by convention.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self : object
            FeatureHasher class instance.
        """
        return self

    def transform(self, raw_X):
        """Transform a sequence of instances to a scipy.sparse matrix.

        Parameters
        ----------
        raw_X : iterable over iterable over raw features, length = n_samples
            Samples. Each sample must be an iterable (e.g., a list or tuple)
            containing/generating feature names (and optionally values, see
            the input_type constructor argument) which will be hashed.
            raw_X need not support the len function, so it can be the result
            of a generator; n_samples is determined on the fly.

        Returns
        -------
        X : sparse matrix of shape (n_samples, n_features)
            Feature matrix, for use with estimators or further transformers.
        """
        raw_X = iter(raw_X)
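        # Normalize every input_type to a common shape: an iterator over
        # samples, each sample an iterable of (feature_name, value) pairs.
        # "pair" input already has this shape and passes through unchanged.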
        if self.input_type == "dict":
            raw_X = (_iteritems(d) for d in raw_X)
        elif self.input_type == "string":
            first_raw_X = next(raw_X)
            if isinstance(first_raw_X, str):
                raise ValueError(
                    "Samples can not be a single string. The input must be an iterable"
                    " over iterables of strings."
                )
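            # next() consumed the first sample above; chain it back on so the
            # hashing step below still sees every sample exactly once.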
            raw_X_ = chain([first_raw_X], raw_X)
            raw_X = (((f, 1) for f in x) for x in raw_X_)

        indices, indptr, values = _hashing_transform(
            raw_X, self.n_features, self.dtype, self.alternate_sign, seed=0
        )
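        # _hashing_transform returns raw CSR buffers; indptr holds one entry
        # per sample plus one, so the row count falls out of its length.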
        n_samples = indptr.shape[0] - 1

        if n_samples == 0:
            raise ValueError("Cannot vectorize empty sequence.")

        X = sp.csr_matrix(
            (values, indices, indptr),
            dtype=self.dtype,
            shape=(n_samples, self.n_features),
        )
        X.sum_duplicates()  # also sorts the indices

        return X

    def _more_tags(self):
        return {"X_types": [self.input_type]}