reconnect moved files to git repo

2025-08-01 04:33:03 -04:00
commit 5d3c35492d
23190 changed files with 4750716 additions and 0 deletions
--- a/venv/lib/python3.11/site-packages/statsmodels/sandbox/gam.py
+++ b/venv/lib/python3.11/site-packages/statsmodels/sandbox/gam.py
@ -0,0 +1,433 @@
+"""
+Generalized additive models
+
+
+
+Requirements for smoothers
+--------------------------
+
+smooth(y, weights=xxx) : ? no return ? alias for fit
+predict(x=None) : smoothed values, fittedvalues or for new exog
+df_fit() : degress of freedom of fit ?
+
+
+Notes
+-----
+- using PolySmoother works for AdditiveModel, and GAM with Poisson and Binomial
+- testfailure with Gamma, no other families tested
+- there is still an indeterminacy in the split up of the constant across
+  components (smoothers) and alpha, sum, i.e. constant, looks good.
+  - role of offset, that I have not tried to figure out yet
+
+Refactoring
+-----------
+currently result is attached to model instead of other way around
+split up Result in class for AdditiveModel and for GAM,
+subclass GLMResults, needs verification that result statistics are appropriate
+how much inheritance, double inheritance?
+renamings and cleanup
+interface to other smoothers, scipy splines
+
+basic unittests as support for refactoring exist, but we should have a test
+case for gamma and the others. Advantage of PolySmoother is that we can
+benchmark against the parametric GLM results.
+
+"""
+
+# JP:
+# changes: use PolySmoother instead of crashing bspline
+# TODO: check/catalogue required interface of a smoother
+# TODO: replace default smoother by corresponding function to initialize
+#       other smoothers
+# TODO: fix iteration, do not define class with iterator methods, use looping;
+#       add maximum iteration and other optional stop criteria
+# fixed some of the dimension problems in PolySmoother,
+#       now graph for example looks good
+# NOTE: example script is now in examples folder
+#update: I did some of the above, see module docstring
+
+import numpy as np
+
+from statsmodels.genmod import families
+from statsmodels.sandbox.nonparametric.smoothers import PolySmoother
+from statsmodels.genmod.generalized_linear_model import GLM
+from statsmodels.tools.sm_exceptions import IterationLimitWarning, iteration_limit_doc
+
+import warnings
+
+DEBUG = False
+
+def default_smoother(x, s_arg=None):
+    '''
+
+    '''
+#    _x = x.copy()
+#    _x.sort()
+    _x = np.sort(x)
+    n = x.shape[0]
+    # taken form smooth.spline in R
+
+    #if n < 50:
+    if n < 500:
+        nknots = n
+    else:
+        a1 = np.log(50) / np.log(2)
+        a2 = np.log(100) / np.log(2)
+        a3 = np.log(140) / np.log(2)
+        a4 = np.log(200) / np.log(2)
+        if n < 200:
+            nknots = 2**(a1 + (a2 - a1) * (n - 50)/150.)
+        elif n < 800:
+            nknots = 2**(a2 + (a3 - a2) * (n - 200)/600.)
+        elif n < 3200:
+            nknots = 2**(a3 + (a4 - a3) * (n - 800)/2400.)
+        else:
+            nknots = 200 + (n - 3200.)**0.2
+    knots = _x[np.linspace(0, n-1, nknots).astype(np.int32)]
+
+    #s = SmoothingSpline(knots, x=x.copy())
+    #when I set order=2, I get nans in the GAM prediction
+    if s_arg is None:
+        order = 3 #what about knots? need smoother *args or **kwds
+    else:
+        order = s_arg
+    s = PolySmoother(order, x=x.copy())  #TODO: change order, why copy?
+#    s.gram(d=2)
+#    s.target_df = 5
+    return s
+
+class Offset:
+
+    def __init__(self, fn, offset):
+        self.fn = fn
+        self.offset = offset
+
+    def __call__(self, *args, **kw):
+        return self.fn(*args, **kw) + self.offset
+
+class Results:
+
+    def __init__(self, Y, alpha, exog, smoothers, family, offset):
+        self.nobs, self.k_vars = exog.shape  #assumes exog is 2d
+        #weird: If I put the previous line after the definition of self.mu,
+        #    then the attributed do not get added
+        self.Y = Y
+        self.alpha = alpha
+        self.smoothers = smoothers
+        self.offset = offset
+        self.family = family
+        self.exog = exog
+        self.offset = offset
+        self.mu = self.linkinversepredict(exog)  #TODO: remove __call__
+
+
+
+    def __call__(self, exog):
+        '''expected value ? check new GLM, same as mu for given exog
+        maybe remove this
+        '''
+        return self.linkinversepredict(exog)
+
+    def linkinversepredict(self, exog):  #TODO what's the name in GLM
+        '''expected value ? check new GLM, same as mu for given exog
+        '''
+        return self.family.link.inverse(self.predict(exog))
+
+    def predict(self, exog):
+        '''predict response, sum of smoothed components
+        TODO: What's this in the case of GLM, corresponds to X*beta ?
+        '''
+        #note: sum is here over axis=0,
+        #TODO: transpose in smoothed and sum over axis=1
+
+        #BUG: there is some inconsistent orientation somewhere
+        #temporary hack, will not work for 1d
+        #print dir(self)
+        #print 'self.nobs, self.k_vars', self.nobs, self.k_vars
+        exog_smoothed = self.smoothed(exog)
+        #print 'exog_smoothed.shape', exog_smoothed.shape
+        if exog_smoothed.shape[0] == self.k_vars:
+            import warnings
+            warnings.warn("old orientation, colvars, will go away",
+                          FutureWarning)
+            return np.sum(self.smoothed(exog), axis=0) + self.alpha
+        if exog_smoothed.shape[1] == self.k_vars:
+            return np.sum(exog_smoothed, axis=1) + self.alpha
+        else:
+            raise ValueError('shape mismatch in predict')
+
+    def smoothed(self, exog):
+        '''get smoothed prediction for each component
+
+        '''
+        #bug: with exog in predict I get a shape error
+        #print 'smoothed', exog.shape, self.smoothers[0].predict(exog).shape
+        #there was a mistake exog did not have column index i
+        return np.array([self.smoothers[i].predict(exog[:,i]) + self.offset[i]
+        #should not be a mistake because exog[:,i] is attached to smoother, but
+        #it is for different exog
+        #return np.array([self.smoothers[i].predict() + self.offset[i]
+                         for i in range(exog.shape[1])]).T
+
+    def smoothed_demeaned(self, exog):
+        components = self.smoothed(exog)
+        means = components.mean(0)
+        constant = means.sum() + self.alpha
+        components_demeaned = components - means
+        return components_demeaned, constant
+
+class AdditiveModel:
+    '''additive model with non-parametric, smoothed components
+
+    Parameters
+    ----------
+    exog : ndarray
+    smoothers : None or list of smoother instances
+        smoother instances not yet checked
+    weights : None or ndarray
+    family : None or family instance
+        I think only used because of shared results with GAM and subclassing.
+        If None, then Gaussian is used.
+    '''
+
+    def __init__(self, exog, smoothers=None, weights=None, family=None):
+        self.exog = exog
+        if weights is not None:
+            self.weights = weights
+        else:
+            self.weights = np.ones(self.exog.shape[0])
+
+        self.smoothers = smoothers or [default_smoother(exog[:,i]) for i in range(exog.shape[1])]
+
+        #TODO: why do we set here df, refactoring temporary?
+        for i in range(exog.shape[1]):
+            self.smoothers[i].df = 10
+
+        if family is None:
+            self.family = families.Gaussian()
+        else:
+            self.family = family
+        #self.family = families.Gaussian()
+
+    def _iter__(self):
+        '''initialize iteration ?, should be removed
+
+        '''
+        self.iter = 0
+        self.dev = np.inf
+        return self
+
+    def next(self):
+        '''internal calculation for one fit iteration
+
+        BUG: I think this does not improve, what is supposed to improve
+            offset does not seem to be used, neither an old alpha
+            The smoothers keep coef/params from previous iteration
+        '''
+        _results = self.results
+        Y = self.results.Y
+        mu = _results.predict(self.exog)
+        #TODO offset is never used ?
+        offset = np.zeros(self.exog.shape[1], np.float64)
+        alpha = (Y * self.weights).sum() / self.weights.sum()
+        for i in range(self.exog.shape[1]):
+            tmp = self.smoothers[i].predict()
+            #TODO: check what smooth needs to do
+            #smooth (alias for fit, fit given x to new y and attach
+            #print 'next shape', (Y - alpha - mu + tmp).shape
+            bad = np.isnan(Y - alpha - mu + tmp).any()
+            if bad: #temporary assert while debugging
+                print(Y, alpha, mu, tmp)
+                raise ValueError("nan encountered")
+            #self.smoothers[i].smooth(Y - alpha - mu + tmp,
+            self.smoothers[i].smooth(Y - mu + tmp,
+                                     weights=self.weights)
+            tmp2 = self.smoothers[i].predict() #fittedvalues of previous smooth/fit
+            self.results.offset[i] = -(tmp2*self.weights).sum() / self.weights.sum()
+            #self.offset used in smoothed
+            if DEBUG:
+                print(self.smoothers[i].params)
+            mu += tmp2 - tmp
+        #change setting offset here: tests still pass, offset equal to constant
+        #in component ??? what's the effect of offset
+        offset = self.results.offset
+        #print self.iter
+        #self.iter += 1 #missing incrementing of iter counter NOT
+        return Results(Y, alpha, self.exog, self.smoothers, self.family, offset)
+
+    def cont(self):
+        '''condition to continue iteration loop
+
+        Parameters
+        ----------
+        tol
+
+        Returns
+        -------
+        cont : bool
+            If true, then iteration should be continued.
+
+        '''
+        self.iter += 1 #moved here to always count, not necessary
+        if DEBUG:
+            print(self.iter, self.results.Y.shape)
+            print(self.results.predict(self.exog).shape, self.weights.shape)
+        curdev = (((self.results.Y - self.results.predict(self.exog))**2) * self.weights).sum()
+
+        if self.iter > self.maxiter: #kill it, no max iterationoption
+            return False
+        if np.fabs((self.dev - curdev) / curdev) < self.rtol:
+            self.dev = curdev
+            return False
+
+        #self.iter += 1
+        self.dev = curdev
+        return True
+
+    def df_resid(self):
+        '''degrees of freedom of residuals, ddof is sum of all smoothers df
+        '''
+        return self.results.Y.shape[0] - np.array([self.smoothers[i].df_fit() for i in range(self.exog.shape[1])]).sum()
+
+    def estimate_scale(self):
+        '''estimate standard deviation of residuals
+        '''
+        #TODO: remove use of self.results.__call__
+        return ((self.results.Y - self.results(self.exog))**2).sum() / self.df_resid()
+
+    def fit(self, Y, rtol=1.0e-06, maxiter=30):
+        '''fit the model to a given endogenous variable Y
+
+        This needs to change for consistency with statsmodels
+
+        '''
+        self.rtol = rtol
+        self.maxiter = maxiter
+        #iter(self)  # what does this do? anything?
+        self._iter__()
+        mu = 0
+        alpha = (Y * self.weights).sum() / self.weights.sum()
+
+        offset = np.zeros(self.exog.shape[1], np.float64)
+
+        for i in range(self.exog.shape[1]):
+            self.smoothers[i].smooth(Y - alpha - mu,
+                                     weights=self.weights)
+            tmp = self.smoothers[i].predict()
+            offset[i] = (tmp * self.weights).sum() / self.weights.sum()
+            tmp -= tmp.sum()
+            mu += tmp
+
+        self.results = Results(Y, alpha, self.exog, self.smoothers, self.family, offset)
+
+        while self.cont():
+            self.results = self.next()
+
+        if self.iter >= self.maxiter:
+            warnings.warn(iteration_limit_doc, IterationLimitWarning)
+
+        return self.results
+
+class Model(GLM, AdditiveModel):
+#class Model(AdditiveModel):
+    #TODO: what does GLM do? Is it actually used ?
+    #only used in __init__, dropping it does not change results
+    #but where gets family attached now? - weird, it's Gaussian in this case now
+    #also where is the link defined?
+    #AdditiveModel overwrites family and sets it to Gaussian - corrected
+
+    #I think both GLM and AdditiveModel subclassing is only used in __init__
+
+    #niter = 2
+
+#    def __init__(self, exog, smoothers=None, family=family.Gaussian()):
+#        GLM.__init__(self, exog, family=family)
+#        AdditiveModel.__init__(self, exog, smoothers=smoothers)
+#        self.family = family
+    def __init__(self, endog, exog, smoothers=None, family=families.Gaussian()):
+        #self.family = family
+        #TODO: inconsistent super __init__
+        AdditiveModel.__init__(self, exog, smoothers=smoothers, family=family)
+        GLM.__init__(self, endog, exog, family=family)
+        assert self.family is family  #make sure we got the right family
+
+    def next(self):
+        _results = self.results
+        Y = _results.Y
+        if np.isnan(self.weights).all():
+            print("nanweights1")
+
+        _results.mu = self.family.link.inverse(_results.predict(self.exog))
+        #eta = _results.predict(self.exog)
+        #_results.mu = self.family.fitted(eta)
+        weights = self.family.weights(_results.mu)
+        if np.isnan(weights).all():
+            self.weights = weights
+            print("nanweights2")
+        self.weights = weights
+        if DEBUG:
+            print('deriv isnan', np.isnan(self.family.link.deriv(_results.mu)).any())
+
+        #Z = _results.predict(self.exog) + \
+        Z = _results.predict(self.exog) + \
+               self.family.link.deriv(_results.mu) * (Y - _results.mu) #- _results.alpha #?added alpha
+
+        m = AdditiveModel(self.exog, smoothers=self.smoothers,
+                          weights=self.weights, family=self.family)
+
+        #TODO: I do not know what the next two lines do, Z, Y ? which is endog?
+        #Y is original endog, Z is endog for the next step in the iterative solver
+
+        _results = m.fit(Z)
+        self.history.append([Z, _results.predict(self.exog)])
+        _results.Y = Y
+        _results.mu = self.family.link.inverse(_results.predict(self.exog))
+        self.iter += 1
+        self.results = _results
+
+        return _results
+
+    def estimate_scale(self, Y=None):
+        """
+        Return Pearson\'s X^2 estimate of scale.
+        """
+
+        if Y is None:
+            Y = self.Y
+        resid = Y - self.results.mu
+        return (np.power(resid, 2) / self.family.variance(self.results.mu)).sum() \
+                    / self.df_resid   #TODO check this
+                    #/ AdditiveModel.df_resid(self)  #what is the class doing here?
+
+
+    def fit(self, Y, rtol=1.0e-06, maxiter=30):
+
+        self.rtol = rtol
+        self.maxiter = maxiter
+
+        self.Y = np.asarray(Y, np.float64)
+
+        self.history = []
+
+        #iter(self)
+        self._iter__()
+
+        #TODO code duplication with next?
+        alpha = self.Y.mean()
+        mu0 = self.family.starting_mu(Y)
+        #Z = self.family.link(alpha) + self.family.link.deriv(alpha) * (Y - alpha)
+        Z = self.family.link(alpha) + self.family.link.deriv(alpha) * (Y - mu0)
+        m = AdditiveModel(self.exog, smoothers=self.smoothers, family=self.family)
+        self.results = m.fit(Z)
+        self.results.mu = self.family.link.inverse(self.results.predict(self.exog))
+        self.results.Y = Y
+
+        while self.cont():
+            self.results = self.next()
+            self.scale = self.results.scale = self.estimate_scale()
+
+        if self.iter >= self.maxiter:
+            import warnings
+            warnings.warn(iteration_limit_doc, IterationLimitWarning)
+
+        return self.results