reconnect moved files to git repo
This commit is contained in:
136
venv/lib/python3.11/site-packages/sklearn/utils/murmurhash.pyx
Normal file
136
venv/lib/python3.11/site-packages/sklearn/utils/murmurhash.pyx
Normal file
@ -0,0 +1,136 @@
|
||||
"""Cython wrapper for MurmurHash3 non-cryptographic hash function.
|
||||
|
||||
MurmurHash is an extensively tested and very fast hash function that has
|
||||
good distribution properties suitable for machine learning use cases
|
||||
such as feature hashing and random projections.
|
||||
|
||||
The original C++ code by Austin Appleby is released into the public domain
|
||||
and can be found here:
|
||||
|
||||
https://code.google.com/p/smhasher/
|
||||
|
||||
"""
|
||||
# Author: Olivier Grisel <olivier.grisel@ensta.org>
|
||||
#
|
||||
# License: BSD 3 clause
|
||||
|
||||
from ..utils._typedefs cimport int32_t, uint32_t
|
||||
|
||||
import numpy as np
|
||||
|
||||
# C prototypes taken from src/MurmurHash3.h. All three routines share the
# same signature: a pointer to the key, a length, a 32-bit seed and a pointer
# that receives the result. Only MurmurHash3_x86_32 is called by the wrappers
# visible in this file.
cdef extern from "src/MurmurHash3.h":
    void MurmurHash3_x86_32(void *key, int len, uint32_t seed, void *out)
    void MurmurHash3_x86_128(void *key, int len, uint32_t seed, void *out)
    void MurmurHash3_x64_128(void *key, int len, uint32_t seed, void *out)
|
||||
|
||||
|
||||
cpdef uint32_t murmurhash3_int_u32(int key, unsigned int seed):
    """Return the unsigned 32bit murmurhash3 of the C int ``key`` at ``seed``.

    The ``sizeof(int)`` bytes that make up ``key`` are handed to
    MurmurHash3_x86_32 and the result is returned as a ``uint32_t``.
    """
    cdef uint32_t digest
    # The C routine writes its result through the pointer to ``digest``.
    MurmurHash3_x86_32(&key, sizeof(int), seed, &digest)
    return digest
|
||||
|
||||
|
||||
cpdef int32_t murmurhash3_int_s32(int key, unsigned int seed):
    """Return the signed 32bit murmurhash3 of the C int ``key`` at ``seed``.

    The ``sizeof(int)`` bytes that make up ``key`` are handed to
    MurmurHash3_x86_32 and the result is returned as an ``int32_t``.
    """
    cdef int32_t digest
    # The C routine writes its result through the pointer to ``digest``.
    MurmurHash3_x86_32(&key, sizeof(int), seed, &digest)
    return digest
|
||||
|
||||
|
||||
cpdef uint32_t murmurhash3_bytes_u32(bytes key, unsigned int seed):
    """Return the unsigned 32bit murmurhash3 of the bytes ``key`` at ``seed``.

    All ``len(key)`` bytes of ``key`` are handed to MurmurHash3_x86_32 and
    the result is returned as a ``uint32_t``.
    """
    cdef uint32_t digest
    # Borrow a pointer to the bytes object's buffer; ``key`` stays alive for
    # the whole call because it is an argument of this function.
    cdef char* raw = <char*> key
    MurmurHash3_x86_32(raw, len(key), seed, &digest)
    return digest
|
||||
|
||||
|
||||
cpdef int32_t murmurhash3_bytes_s32(bytes key, unsigned int seed):
    """Return the signed 32bit murmurhash3 of the bytes ``key`` at ``seed``.

    All ``len(key)`` bytes of ``key`` are handed to MurmurHash3_x86_32 and
    the result is returned as an ``int32_t``.
    """
    cdef int32_t digest
    # Borrow a pointer to the bytes object's buffer; ``key`` stays alive for
    # the whole call because it is an argument of this function.
    cdef char* raw = <char*> key
    MurmurHash3_x86_32(raw, len(key), seed, &digest)
    return digest
|
||||
|
||||
|
||||
def _murmurhash3_bytes_array_u32(
    const int32_t[:] key,
    unsigned int seed,
):
    """Hash each element of a 1-D int32 buffer with 32bit murmurhash3.

    Every entry of ``key`` is hashed independently with ``seed`` through
    ``murmurhash3_int_u32``. The hashes are returned as a newly allocated
    ndarray of dtype uint32 with one value per input element.
    """
    # TODO make it possible to pass preallocated output array
    cdef uint32_t[:] hashes = np.zeros(key.size, np.uint32)
    cdef Py_ssize_t n_keys = key.shape[0]
    cdef Py_ssize_t idx

    for idx in range(n_keys):
        hashes[idx] = murmurhash3_int_u32(key[idx], seed)

    # Expose the memoryview as a regular ndarray to the caller.
    return np.asarray(hashes)
|
||||
|
||||
|
||||
def _murmurhash3_bytes_array_s32(
    const int32_t[:] key,
    unsigned int seed,
):
    """Hash each element of a 1-D int32 buffer with 32bit murmurhash3.

    Every entry of ``key`` is hashed independently with ``seed`` through
    ``murmurhash3_int_s32``. The hashes are returned as a newly allocated
    ndarray of dtype int32 with one value per input element.
    """
    # TODO make it possible to pass preallocated output array
    cdef int32_t[:] hashes = np.zeros(key.size, np.int32)
    cdef Py_ssize_t n_keys = key.shape[0]
    cdef Py_ssize_t idx

    for idx in range(n_keys):
        hashes[idx] = murmurhash3_int_s32(key[idx], seed)

    # Expose the memoryview as a regular ndarray to the caller.
    return np.asarray(hashes)
|
||||
|
||||
|
||||
def murmurhash3_32(key, seed=0, positive=False):
    """Compute the 32bit murmurhash3 of key at seed.

    The underlying implementation is MurmurHash3_x86_32 generating low
    latency 32bits hash suitable for implementing lookup tables, Bloom
    filters, count min sketch or feature hashing.

    Parameters
    ----------
    key : np.int32, bytes, unicode or ndarray of dtype=np.int32
        The physical object to hash. Unicode keys are encoded as UTF-8
        before being hashed.

    seed : int, default=0
        Integer seed for the hashing algorithm.

    positive : bool, default=False
        True: the result is cast to an unsigned int
          from 0 to 2 ** 32 - 1
        False: the result is cast to a signed int
          from -(2 ** 31) to 2 ** 31 - 1

    Returns
    -------
    int or ndarray
        A single hash for a scalar, bytes or unicode key. For an ndarray
        key, an array with the same shape as ``key`` holding one hash per
        element.

    Raises
    ------
    TypeError
        If ``key`` is an ndarray whose dtype is not int32, or if ``key`` is
        of a type that is not supported.

    Examples
    --------
    >>> from sklearn.utils import murmurhash3_32
    >>> murmurhash3_32(b"Hello World!", seed=42)
    3565178
    """
    # NOTE: the signed and unsigned variants are deliberately kept in separate
    # return statements because they have different C return types.
    if isinstance(key, bytes):
        if positive:
            return murmurhash3_bytes_u32(key, seed)
        return murmurhash3_bytes_s32(key, seed)

    if isinstance(key, unicode):
        encoded = key.encode('utf-8')
        if positive:
            return murmurhash3_bytes_u32(encoded, seed)
        return murmurhash3_bytes_s32(encoded, seed)

    if isinstance(key, (int, np.int32)):
        if positive:
            return murmurhash3_int_u32(<int32_t>key, seed)
        return murmurhash3_int_s32(<int32_t>key, seed)

    if isinstance(key, np.ndarray):
        if key.dtype != np.int32:
            raise TypeError(
                "key.dtype should be int32, got %s" % key.dtype)
        # The array helpers work on 1-D buffers: flatten, hash, then restore
        # the caller's shape.
        flat_key = key.ravel()
        if positive:
            hashed = _murmurhash3_bytes_array_u32(flat_key, seed)
        else:
            hashed = _murmurhash3_bytes_array_s32(flat_key, seed)
        return hashed.reshape(key.shape)

    raise TypeError(
        "key %r with type %s is not supported. "
        "Explicit conversion to bytes is required" % (key, type(key)))
|
||||
Reference in New Issue
Block a user