reconnect moved files to git repo
@@ -0,0 +1,576 @@
"""
This module contains the kernels for kernel smoothing.

Hopefully in the future they may be reused/extended for other kernel based
methods.

References
----------

Pointwise Kernel Confidence Bounds
(smoothconf)
http://fedc.wiwi.hu-berlin.de/xplore/ebooks/html/anr/anrhtmlframe62.html
"""

# pylint: disable-msg=C0103
# pylint: disable-msg=W0142
# pylint: disable-msg=E1101
# pylint: disable-msg=E0611
from statsmodels.compat.python import lzip, lfilter
import numpy as np
import scipy.integrate
from scipy.special import factorial
from numpy import exp, multiply, square, divide, subtract, inf


class NdKernel:
    """Generic N-dimensional kernel

    Parameters
    ----------
    n : int
        The number of series for kernel estimates
    kernels : list
        kernels

    Can be constructed from either
    a) a list of n kernels which will be treated as
    independent marginals on a Gaussian copula (specified by H)
    or b) a single univariate kernel which will be applied radially to the
    Mahalanobis distance defined by H.

    In the case of the Gaussian these are both equivalent, and the second
    construction is preferred.
    """
    def __init__(self, n, kernels=None, H=None):
        if kernels is None:
            kernels = Gaussian()

        self._kernels = kernels
        self.weights = None

        if H is None:
            H = np.matrix(np.identity(n))

        self._H = H
        self._Hrootinv = np.linalg.cholesky(H.I)

    def getH(self):
        """Getter for kernel bandwidth, H"""
        return self._H

    def setH(self, value):
        """Setter for kernel bandwidth, H"""
        self._H = value

    H = property(getH, setH, doc="Kernel bandwidth matrix")

    def density(self, xs, x):
        """Returns the kernel density estimate for point x based on x-values xs"""
        n = len(xs)
        #xs = self.in_domain( xs, xs, x )[0]

        if len(xs) > 0:  ## Need to do product of marginal distributions
            #w = np.sum([self(self._Hrootinv * (xx-x).T ) for xx in xs])/n
            #vectorized does not work:
            if self.weights is not None:
                w = np.mean(self((xs-x) * self._Hrootinv).T * self.weights) / sum(self.weights)
            else:
                w = np.mean(self((xs-x) * self._Hrootinv))  #transposed
                #w = np.mean([self(xd) for xd in ((xs-x) * self._Hrootinv)] ) #transposed
            return w
        else:
            return np.nan

    def _kernweight(self, x):
        """returns the kernel weight for the independent multivariate kernel"""
        if isinstance(self._kernels, CustomKernel):
            ## Radial case
            #d = x.T * x
            #x is matrix, 2d, element wise sqrt looks wrong
            #d = np.sqrt( x.T * x )
            x = np.asarray(x)
            #d = np.sqrt( (x * x).sum(-1) )
            d = (x * x).sum(-1)
            return self._kernels(np.asarray(d))

    def __call__(self, x):
        """
        This simply returns the value of the kernel function at x

        Does the same as weight if the function is normalised
        """
        return self._kernweight(x)
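
# Illustrative sketch (an addition, not part of the original module): using
# NdKernel as a radial kernel on bivariate data. The data below are
# assumptions for demonstration; NdKernel.density expects matrix-like rows.
#
#   data = np.matrix(np.random.randn(100, 2))    # 100 bivariate observations
#   kern = NdKernel(2)                           # radial Gaussian, H = identity
#   kern.density(data, np.matrix([[0.0, 0.0]]))  # density estimate at the origin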


class CustomKernel:
    """
    Generic 1D Kernel object.
    Can be constructed by selecting a standard named Kernel,
    or providing a lambda expression and domain.
    The domain allows some algorithms to run faster for finite domain kernels.
    """
    # MC: Not sure how this will look in the end - or even still exist.
    # Main purpose of this is to allow custom kernels and to allow speed up
    # from finite support.

    def __init__(self, shape, h=1.0, domain=None, norm=None):
        """
        shape should be a function taking and returning numeric type.

        For sanity it should always return positive or zero but this is not
        enforced in case you want to do weird things. Bear in mind that the
        statistical tests etc. may not be valid for non-positive kernels.

        The bandwidth of the kernel is supplied as h.

        You may specify a domain as a list of 2 values [min, max], in which
        case the kernel will be treated as zero outside these values. This
        will speed up calculation.

        You may also specify the normalisation constant for the supplied
        kernel. If you do, this number will be stored and used as the
        normalisation without calculation. It is recommended you do this if
        you know the constant, to speed up calculation. In particular if the
        shape function provided is already normalised you should provide
        norm = 1.0.

        Warning: I think several calculations assume that the kernel is
        normalized. No tests for non-normalized kernel.
        """
        self._normconst = norm  # a value or None, if None, then calculate
        self.domain = domain
        self.weights = None
        if callable(shape):
            self._shape = shape
        else:
            raise TypeError("shape must be a callable object/function")
        self._h = h
        self._L2Norm = None
        self._kernel_var = None
        self._normal_reference_constant = None
        self._order = None

    def geth(self):
        """Getter for kernel bandwidth, h"""
        return self._h

    def seth(self, value):
        """Setter for kernel bandwidth, h"""
        self._h = value

    h = property(geth, seth, doc="Kernel Bandwidth")
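
    # Illustrative sketch (an addition, not part of the original module):
    # constructing a user-defined kernel directly. The quartic shape below is
    # the unnormalised Biweight; norm is omitted, so norm_const will be
    # computed by numerical integration on first access.
    #
    #   quartic = CustomKernel(lambda x: (1 - x*x)**2, h=0.5,
    #                          domain=[-1.0, 1.0])
    #   quartic.norm_const   # ~0.9375, via scipy.integrate.quad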

    def in_domain(self, xs, ys, x):
        """
        Returns the filtered (xs, ys) based on the Kernel domain centred on x
        """
        # Disable black-list functions: filter used for speed instead of
        # list-comprehension
        # pylint: disable-msg=W0141
        def isInDomain(xy):
            """Used for filter to check if point is in the domain"""
            u = (xy[0] - x) / self.h
            return np.all((u >= self.domain[0]) & (u <= self.domain[1]))

        if self.domain is None:
            return (xs, ys)
        else:
            filtered = lfilter(isInDomain, lzip(xs, ys))
            if len(filtered) > 0:
                xs, ys = lzip(*filtered)
                return (xs, ys)
            else:
                return ([], [])

    def density(self, xs, x):
        """Returns the kernel density estimate for point x based on x-values
        xs
        """
        xs = np.asarray(xs)
        n = len(xs)  # before in_domain?
        if self.weights is not None:
            xs, weights = self.in_domain(xs, self.weights, x)
        else:
            xs = self.in_domain(xs, xs, x)[0]
        xs = np.asarray(xs)
        #print 'len(xs)', len(xs), x
        if xs.ndim == 1:
            xs = xs[:, None]
        if len(xs) > 0:
            h = self.h
            if self.weights is not None:
                w = 1 / h * np.sum(self((xs-x)/h).T * weights, axis=1)
            else:
                w = 1. / (h * n) * np.sum(self((xs-x)/h), axis=0)
            return w
        else:
            return np.nan
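
    # Illustrative sketch (an addition, not part of the original module):
    # pointwise kernel density estimation; the sample below is an assumption.
    #
    #   xs = np.random.randn(200)    # sample
    #   kern = Gaussian(h=0.4)
    #   kern.density(xs, 0.0)        # estimated pdf at x = 0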

    def density_var(self, density, nobs):
        """approximate pointwise variance for kernel density

        not verified

        Parameters
        ----------
        density : array_like
            pdf of the kernel density
        nobs : int
            number of observations used in the KDE estimation

        Returns
        -------
        kde_var : ndarray
            estimated variance of the density estimate

        Notes
        -----
        This uses the asymptotic normal approximation to the distribution of
        the density estimate.
        """
        return np.asarray(density) * self.L2Norm / self.h / nobs

    def density_confint(self, density, nobs, alpha=0.05):
        """approximate pointwise confidence interval for kernel density

        The confidence interval is centered at the estimated density and
        ignores the bias of the density estimate.

        not verified

        Parameters
        ----------
        density : array_like
            pdf of the kernel density
        nobs : int
            number of observations used in the KDE estimation

        Returns
        -------
        conf_int : ndarray
            estimated confidence interval of the density estimate, lower bound
            in first column and upper bound in second column

        Notes
        -----
        This uses the asymptotic normal approximation to the distribution of
        the density estimate. The lower bound can be negative for density
        values close to zero.
        """
        from scipy import stats
        crit = stats.norm.isf(alpha / 2.)
        density = np.asarray(density)
        half_width = crit * np.sqrt(self.density_var(density, nobs))
        conf_int = np.column_stack((density - half_width, density + half_width))
        return conf_int
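
    # Illustrative sketch (an addition, not part of the original module):
    # asymptotic pointwise confidence interval around a density estimate,
    # continuing the density example above.
    #
    #   dens = kern.density(xs, 0.0)
    #   kern.density_confint(dens, nobs=len(xs), alpha=0.05)
    #   # -> lower bound in column 0, upper bound in column 1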

    def smooth(self, xs, ys, x):
        """Returns the kernel smoothing estimate for point x based on x-values
        xs and y-values ys.
        Not expected to be called by the user.
        """
        xs, ys = self.in_domain(xs, ys, x)

        if len(xs) > 0:
            w = np.sum(self((xs-x)/self.h))
            #TODO: change the below to broadcasting when shape is sorted
            v = np.sum([yy * self((xx-x)/self.h) for xx, yy in zip(xs, ys)])
            return v / w
        else:
            return np.nan

    def smoothvar(self, xs, ys, x):
        """Returns the kernel smoothing estimate of the variance at point x.
        """
        xs, ys = self.in_domain(xs, ys, x)

        if len(xs) > 0:
            fittedvals = np.array([self.smooth(xs, ys, xx) for xx in xs])
            sqresid = square(subtract(ys, fittedvals))
            w = np.sum(self((xs-x)/self.h))
            v = np.sum([rr * self((xx-x)/self.h) for xx, rr in zip(xs, sqresid)])
            return v / w
        else:
            return np.nan

    def smoothconf(self, xs, ys, x, alpha=0.05):
        """Returns the kernel smoothing estimate with pointwise 1-alpha
        confidence bounds
        """
        xs, ys = self.in_domain(xs, ys, x)

        if len(xs) > 0:
            fittedvals = np.array([self.smooth(xs, ys, xx) for xx in xs])
            #fittedvals = self.smooth(xs, ys, x)  # x or xs in Haerdle
            sqresid = square(subtract(ys, fittedvals))
            w = np.sum(self((xs-x)/self.h))
            #var = sqresid.sum() / (len(sqresid) - 0)  # nonlocal var ? JP just trying
            v = np.sum([rr * self((xx-x)/self.h) for xx, rr in zip(xs, sqresid)])
            var = v / w
            sd = np.sqrt(var)
            K = self.L2Norm
            yhat = self.smooth(xs, ys, x)
            from scipy import stats
            crit = stats.norm.isf(alpha / 2)
            err = crit * sd * np.sqrt(K) / np.sqrt(w * self.h * self.norm_const)
            return (yhat - err, yhat, yhat + err)
        else:
            return (np.nan, np.nan, np.nan)
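
    # Illustrative sketch (an addition, not part of the original module):
    # Nadaraya-Watson smoothing with pointwise confidence bounds on a grid;
    # the data below are assumptions for demonstration.
    #
    #   xs = np.random.randn(100)
    #   ys = np.sin(xs) + 0.3 * np.random.randn(100)
    #   kern = Gaussian(h=0.5)
    #   bands = [kern.smoothconf(xs, ys, xx) for xx in np.linspace(-2, 2, 41)]
    #   # each element is (lower, fitted, upper) at one grid point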

    @property
    def L2Norm(self):
        """Returns the integral of the square of the kernel from -inf to inf"""
        if self._L2Norm is None:
            L2Func = lambda x: (self.norm_const * self._shape(x))**2
            if self.domain is None:
                self._L2Norm = scipy.integrate.quad(L2Func, -inf, inf)[0]
            else:
                self._L2Norm = scipy.integrate.quad(L2Func, self.domain[0],
                                                    self.domain[1])[0]
        return self._L2Norm

    @property
    def norm_const(self):
        """
        Normalising constant for kernel (integral from -inf to inf)
        """
        if self._normconst is None:
            if self.domain is None:
                quadres = scipy.integrate.quad(self._shape, -inf, inf)
            else:
                quadres = scipy.integrate.quad(self._shape, self.domain[0],
                                               self.domain[1])
            self._normconst = 1.0 / (quadres[0])
        return self._normconst

    @property
    def kernel_var(self):
        """Returns the second moment of the kernel"""
        if self._kernel_var is None:
            func = lambda x: x**2 * self.norm_const * self._shape(x)
            if self.domain is None:
                self._kernel_var = scipy.integrate.quad(func, -inf, inf)[0]
            else:
                self._kernel_var = scipy.integrate.quad(func, self.domain[0],
                                                        self.domain[1])[0]
        return self._kernel_var

    def moments(self, n):
        """Returns the n'th moment of the kernel (implemented for n = 1, 2)"""
        if n > 2:
            msg = "Only first and second moment currently implemented"
            raise NotImplementedError(msg)

        if n == 1:
            return 0

        if n == 2:
            return self.kernel_var

    @property
    def normal_reference_constant(self):
        """
        Constant used for the Silverman normal reference asymptotic bandwidth
        calculation.

        C = 2 * ((pi^(1/2) * (nu!)^3 * R(k)) /
                 (2 * nu * (2*nu)! * kap_nu(k)^2))^(1/(2*nu+1))

        nu = kernel order
        kap_nu = nu'th moment of kernel
        R = kernel roughness (square of L^2 norm)

        Note: L2Norm property returns square of norm.
        """
        nu = self._order

        if not nu == 2:
            msg = "Only implemented for second order kernels"
            raise NotImplementedError(msg)

        if self._normal_reference_constant is None:
            C = np.pi**(.5) * factorial(nu)**3 * self.L2Norm
            C /= (2 * nu * factorial(2 * nu) * self.moments(nu)**2)
            C = 2 * C**(1.0 / (2 * nu + 1))
            self._normal_reference_constant = C

        return self._normal_reference_constant
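
    # Illustrative note (an addition, not part of the original module): this
    # constant is meant for a Silverman-style rule-of-thumb bandwidth,
    #
    #   h = C * sigma_hat * nobs**(-1. / (2*nu + 1))   # nu = 2 -> nobs**(-0.2)
    #
    # where sigma_hat is a scale estimate of the data; the usage shown is an
    # assumption based on the formula in the docstring above.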


    def weight(self, x):
        """This returns the normalised weight at distance x"""
        return self.norm_const * self._shape(x)

    def __call__(self, x):
        """
        This simply returns the value of the kernel function at x

        Does the same as weight if the function is normalised
        """
        return self._shape(x)


class Uniform(CustomKernel):
    def __init__(self, h=1.0):
        CustomKernel.__init__(self, shape=lambda x: 0.5 * np.ones(x.shape),
                              h=h, domain=[-1.0, 1.0], norm=1.0)
        self._L2Norm = 0.5
        self._kernel_var = 1. / 3
        self._order = 2


class Triangular(CustomKernel):
    def __init__(self, h=1.0):
        CustomKernel.__init__(self, shape=lambda x: 1 - abs(x), h=h,
                              domain=[-1.0, 1.0], norm=1.0)
        self._L2Norm = 2.0 / 3.0
        self._kernel_var = 1. / 6
        self._order = 2


class Epanechnikov(CustomKernel):
    def __init__(self, h=1.0):
        CustomKernel.__init__(self, shape=lambda x: 0.75 * (1 - x*x), h=h,
                              domain=[-1.0, 1.0], norm=1.0)
        self._L2Norm = 0.6
        self._kernel_var = 0.2
        self._order = 2


class Biweight(CustomKernel):
    def __init__(self, h=1.0):
        CustomKernel.__init__(self, shape=lambda x: 0.9375 * (1 - x*x)**2,
                              h=h, domain=[-1.0, 1.0], norm=1.0)
        self._L2Norm = 5.0 / 7.0
        self._kernel_var = 1. / 7
        self._order = 2

    def smooth(self, xs, ys, x):
        """Returns the kernel smoothing estimate for point x based on x-values
        xs and y-values ys.
        Not expected to be called by the user.

        Special implementation optimized for Biweight.
        """
        xs, ys = self.in_domain(xs, ys, x)

        if len(xs) > 0:
            w = np.sum(square(subtract(1, square(divide(subtract(xs, x),
                                                        self.h)))))
            v = np.sum(multiply(ys, square(subtract(1, square(divide(
                subtract(xs, x), self.h))))))
            return v / w
        else:
            return np.nan

    def smoothvar(self, xs, ys, x):
        """
        Returns the kernel smoothing estimate of the variance at point x.
        """
        xs, ys = self.in_domain(xs, ys, x)

        if len(xs) > 0:
            fittedvals = np.array([self.smooth(xs, ys, xx) for xx in xs])
            rs = square(subtract(ys, fittedvals))
            w = np.sum(square(subtract(1.0, square(divide(subtract(xs, x),
                                                          self.h)))))
            v = np.sum(multiply(rs, square(subtract(1, square(divide(
                subtract(xs, x), self.h))))))
            return v / w
        else:
            return np.nan

    def smoothconf_(self, xs, ys, x):
        """Returns the kernel smoothing estimate with 1-sigma confidence
        bounds
        """
        xs, ys = self.in_domain(xs, ys, x)

        if len(xs) > 0:
            fittedvals = np.array([self.smooth(xs, ys, xx) for xx in xs])
            rs = square(subtract(ys, fittedvals))
            w = np.sum(square(subtract(1.0, square(divide(subtract(xs, x),
                                                          self.h)))))
            v = np.sum(multiply(rs, square(subtract(1, square(divide(
                subtract(xs, x), self.h))))))
            var = v / w
            sd = np.sqrt(var)
            K = self.L2Norm
            yhat = self.smooth(xs, ys, x)
            err = sd * K / np.sqrt(0.9375 * w * self.h)
            return (yhat - err, yhat, yhat + err)
        else:
            return (np.nan, np.nan, np.nan)


class Triweight(CustomKernel):
    def __init__(self, h=1.0):
        CustomKernel.__init__(self, shape=lambda x: 1.09375 * (1 - x*x)**3,
                              h=h, domain=[-1.0, 1.0], norm=1.0)
        self._L2Norm = 350.0 / 429.0
        self._kernel_var = 1. / 9
        self._order = 2


class Gaussian(CustomKernel):
    """
    Gaussian (Normal) Kernel

    K(u) = 1 / (sqrt(2*pi)) exp(-0.5 u**2)
    """
    def __init__(self, h=1.0):
        CustomKernel.__init__(self, shape=lambda x: 0.3989422804014327 *
                              np.exp(-x**2/2.0), h=h, domain=None, norm=1.0)
        self._L2Norm = 1.0 / (2.0 * np.sqrt(np.pi))
        self._kernel_var = 1.0
        self._order = 2

    def smooth(self, xs, ys, x):
        """Returns the kernel smoothing estimate for point x based on x-values
        xs and y-values ys.
        Not expected to be called by the user.

        Special implementation optimized for Gaussian.
        """
        w = np.sum(exp(multiply(square(divide(subtract(xs, x),
                                              self.h)), -0.5)))
        v = np.sum(multiply(ys, exp(multiply(square(divide(subtract(xs, x),
                                                           self.h)), -0.5))))
        return v / w


class Cosine(CustomKernel):
    """
    Cosine Kernel

    K(u) = pi/4 cos(0.5 * pi * u) between -1.0 and 1.0
    """
    def __init__(self, h=1.0):
        CustomKernel.__init__(self, shape=lambda x: 0.78539816339744828 *
                              np.cos(np.pi/2.0 * x), h=h, domain=[-1.0, 1.0],
                              norm=1.0)
        self._L2Norm = np.pi**2 / 16.0
        self._kernel_var = 0.1894305308612978  # = 1 - 8 / np.pi**2
        self._order = 2


class Cosine2(CustomKernel):
    """
    Cosine2 Kernel

    K(u) = 1 + cos(2 * pi * u) between -0.5 and 0.5

    Note: this is the same Cosine kernel that Stata uses
    """
    def __init__(self, h=1.0):
        CustomKernel.__init__(self, shape=lambda x: 1 + np.cos(2.0 * np.pi * x),
                              h=h, domain=[-0.5, 0.5], norm=1.0)
        self._L2Norm = 1.5
        self._kernel_var = 0.03267274151216444  # = 1/12. - 0.5 / np.pi**2
        self._order = 2


class Tricube(CustomKernel):
    """
    Tricube Kernel

    K(u) = 0.864197530864 * (1 - abs(u)**3)**3 between -1.0 and 1.0
    """
    def __init__(self, h=1.0):
        CustomKernel.__init__(self, shape=lambda x: 0.864197530864 *
                              (1 - abs(x)**3)**3, h=h, domain=[-1.0, 1.0],
                              norm=1.0)
        self._L2Norm = 175.0 / 247.0
        self._kernel_var = 35.0 / 243.0
        self._order = 2
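

if __name__ == "__main__":
    # Minimal smoke test (an illustrative addition, not part of the original
    # module): check each kernel's stored second moment against direct
    # numerical integration of x**2 * K(x) over the kernel domain.
    for cls in [Uniform, Triangular, Epanechnikov, Biweight, Triweight,
                Gaussian, Cosine, Cosine2, Tricube]:
        kern = cls()
        dom = kern.domain if kern.domain is not None else [-inf, inf]
        num = scipy.integrate.quad(lambda u: u**2 * kern.weight(np.asarray(u)),
                                   dom[0], dom[1])[0]
        print("%-12s stored=%-10g integrated=%-10g"
              % (cls.__name__, kern._kernel_var, num))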