reconnect moved files to git repo
@@ -0,0 +1,371 @@
'''conditional logit and nested conditional logit

nested conditional logit is supposed to be the random utility version
(RU2 and maybe RU1)

References
----------
currently based on:

Greene, Econometric Analysis, 5th edition and draft (?)
Heiss, Florian, 2002, Structural choice analysis with nested logit models,
    The Stata Journal 2(3), pp. 227-252

not yet used:

Silberhorn, Nadja, Yasemin Boztug, and Lutz Hildebrandt, 2008, Estimation
    with the nested logit model: specifications and software particularities,
    OR Spectrum
Koppelman, Frank S., and Chandra Bhat, with technical support from Vaneet
    Sethi, Sriram Subramanian, Vincent Bernardin and Jian Zhang, 2006,
    A Self Instructing Course in Mode Choice Modeling: Multinomial and
    Nested Logit Models

Author: josef-pktd
License: BSD (simplified)
'''
import numpy as np
import numpy.lib.recfunctions as recf
from scipy import optimize


class TryCLogit:
    '''
    Conditional Logit, data handling test

    Parameters
    ----------
    endog : array (nobs, nchoices)
        dummy encoding of realized choices
    exog_bychoices : list of arrays
        explanatory variables, one array of exog for each choice. Variables
        with common coefficients have to be first in each array.
    ncommon : int
        number of explanatory variables with common coefficients

    Notes
    -----
    Utility for choice j is given by

        $V_j = X_j * beta + Z * gamma_j$

    where X_j contains generic variables (terminology Heiss) that have the
    same coefficient across choices, and Z are variables, such as
    individual-specific variables, that have different coefficients across
    choices.

    If there are choice-specific constants, then they should be contained in
    Z. For identification, the constant of one choice should be dropped.
    '''
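
    # Illustrative layout (added note, mirroring the travel-mode example at
    # the bottom of this module): with ncommon=2 generic variables GC, Ttme,
    # choice-specific constants for all but one choice, and Hinc only in the
    # first equation, exog_bychoices would be
    #   [array with columns [GC, Ttme, Const, Hinc]   for Air,    (nobs, 4)
    #    array with columns [GC, Ttme, Const]         for Train,  (nobs, 3)
    #    array with columns [GC, Ttme, Const]         for Bus,    (nobs, 3)
    #    array with columns [GC, Ttme]                for Car]    (nobs, 2)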

    def __init__(self, endog, exog_bychoices, ncommon):
        self.endog = endog
        self.exog_bychoices = exog_bychoices
        self.ncommon = ncommon
        self.nobs, self.nchoices = endog.shape
        self.nchoices = len(exog_bychoices)

        #TODO: rename beta to params and include inclusive values for nested CL
        betaind = [exog_bychoices[ii].shape[1] - ncommon
                   for ii in range(self.nchoices)]
        zi = np.r_[[ncommon], ncommon + np.array(betaind).cumsum()]
        z = np.arange(zi[-1])  #running index over all parameters
        beta_indices = [np.r_[np.arange(ncommon), z[zi[ii]:zi[ii+1]]]
                        for ii in range(len(zi)-1)]
        self.beta_indices = beta_indices

        #for testing only
        beta = np.arange(zi[-1])
        betaidx_bychoices = [beta[idx] for idx in beta_indices]

    def xbetas(self, params):
        '''these are the V_j in the docstring formula
        '''
        res = np.empty((self.nobs, self.nchoices))
        for choiceind in range(self.nchoices):
            res[:, choiceind] = np.dot(self.exog_bychoices[choiceind],
                                       params[self.beta_indices[choiceind]])
        return res

    def loglike(self, params):
        #normalization ?
        xb = self.xbetas(params)
        expxb = np.exp(xb)
        probs = expxb / expxb.sum(1)[:, None]  #only the chosen probs are needed
        loglike = (self.endog * np.log(probs)).sum(1)
        #equivalent (verified): (self.endog * xb).sum(1) - np.log(expxb.sum(1))
        #if self.endog were an index array, then xb[self.endog]
        return -loglike.sum()  #return sum for now, not for each observation
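
    def loglike_stable(self, params):
        '''numerically stable variant of ``loglike``

        Added sketch, not part of the original script: uses the identity
        log P_ij = V_ij - log(sum_k exp(V_ik)) with scipy's logsumexp so
        that large values of V do not overflow in np.exp.
        '''
        from scipy.special import logsumexp  #scipy >= 0.19
        xb = self.xbetas(params)
        loglike = (self.endog * xb).sum(1) - logsumexp(xb, axis=1)
        return -loglike.sum()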

    def fit(self, start_params=None):
        if start_params is None:
            #TODO: infer the parameter count from beta_indices instead of
            #hardcoding it for the 6-parameter example below
            start_params = np.zeros(6)
        return optimize.fmin(self.loglike, start_params, maxfun=10000)
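

def _smoke_test_tryclogit():
    '''quick synthetic check of TryCLogit (added sketch, not in the original
    script); all names and sizes here are made up for illustration.

    Four choices, two generic variables with common coefficients, plus a
    choice-specific constant for all but the last choice, matching the
    identification note in the class docstring.
    '''
    rng = np.random.RandomState(0)
    nobs_, nchoices_, ncommon_ = 50, 4, 2
    exogs = []
    for j in range(nchoices_):
        xcommon = rng.randn(nobs_, ncommon_)
        if j < nchoices_ - 1:  #drop the constant of the last choice
            exogs.append(np.column_stack([xcommon, np.ones(nobs_)]))
        else:
            exogs.append(xcommon)
    #one-hot encoded choices, picked at random
    endog_ = np.eye(nchoices_)[rng.randint(nchoices_, size=nobs_)]
    mod = TryCLogit(endog_, exogs, ncommon_)
    #2 common slopes + 3 constants = 5 parameters; at zero all probabilities
    #are 1/4, so this returns -nobs_ * log(1/4)
    return mod.loglike(np.zeros(5))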


class TryNCLogit:
    '''
    Nested Conditional Logit (RUNMNL), data handling test

    unfinished, does not do anything yet
    '''

    def __init__(self, endog, exog_bychoices, ncommon):
        self.endog = endog
        self.exog_bychoices = exog_bychoices
        self.ncommon = ncommon
        self.nobs, self.nchoices = endog.shape
        self.nchoices = len(exog_bychoices)

        #TODO: rename beta to params and include inclusive values for nested CL
        betaind = [exog_bychoices[ii].shape[1] - ncommon
                   for ii in range(self.nchoices)]
        zi = np.r_[[ncommon], ncommon + np.array(betaind).cumsum()]
        z = np.arange(zi[-1])  #running index over all parameters
        beta_indices = [np.r_[np.arange(ncommon), z[zi[ii]:zi[ii+1]]]
                        for ii in range(len(zi)-1)]
        self.beta_indices = beta_indices

        #for testing only
        beta = np.arange(zi[-1])
        betaidx_bychoices = [beta[idx] for idx in beta_indices]

    def xbetas(self, params):
        '''these are the V_j
        '''
        res = np.empty((self.nobs, self.nchoices))
        for choiceind in range(self.nchoices):
            res[:, choiceind] = np.dot(self.exog_bychoices[choiceind],
                                       params[self.beta_indices[choiceind]])
        return res

    def loglike_leafbranch(self, params, tau):
        #normalization ?
        #check/change naming for tau
        #inclusive value of the branch: iv = log(sum_j exp(V_j / tau))
        xb = self.xbetas(params)
        expxb = np.exp(xb / tau)
        sumexpxb = expxb.sum(1)
        logsumexpxb = np.log(sumexpxb)
        probs = expxb / sumexpxb[:, None]
        return probs, logsumexpxb
        #if self.endog were an index array, then xb[self.endog]
        #loglike = (self.endog * xb).sum(1) - logsumexpxb
        #return -loglike.sum()  #return sum for now, not for each observation

    def loglike_branch(self, params, tau):
        #not yet sure how to keep track of branches during walking of tree
        #``branches`` is still a placeholder; this method is an unfinished stub
        ivs = []
        for b in branches:  # noqa:F821 See GH#5756
            probs, iv = self.loglike_leafbranch(params, tau)
            ivs.append(iv)

        ivs = np.column_stack(ivs)  #ivs is (nobs, nbranchchoices); this way ?
        exptiv = np.exp(tau * ivs)
        sumexptiv = exptiv.sum(1)
        logsumexptiv = np.log(sumexptiv)
        probs = exptiv / sumexptiv[:, None]
        return probs, logsumexptiv


####### obsolete version to try out attaching data,
####### new in treewalkerclass.py, copy new version to replace this
####### problem with bzr: copying will disconnect the file history
testxb = 0  #global to class
class RU2NMNL:
    '''Nested Multinomial Logit with Random Utility 2 parameterization

    '''
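
    # Tree format (see ``tree0`` near the bottom of this module):
    #   ('top', [('Fly', ['Air']), ('Ground', ['Train', 'Car', 'Bus'])])
    # branch nodes are (name, subtree) tuples, leaves are choice names;
    # ``exog`` is a dict mapping every node name to its data.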

    def __init__(self, endog, exog, tree, paramsind):
        self.endog = endog
        self.datadict = exog
        self.tree = tree
        self.paramsind = paramsind

        self.branchsum = ''
        self.probs = {}

    def calc_prob(self, tree, keys=None):
        '''walking a tree bottom-up based on dictionary
        '''
        endog = self.endog
        datadict = self.datadict
        paramsind = self.paramsind
        branchsum = self.branchsum

        if isinstance(tree, tuple):  #branch nodes are (name, subtree) tuples
            name, subtree = tree
            print(name, datadict[name])
            print('subtree', subtree)
            keys = []
            if testxb:
                branchsum = datadict[name]
            else:
                branchsum = name  #0
            for b in subtree:
                print(b)
                #branchsum += branch2(b)
                branchsum = branchsum + self.calc_prob(b, keys)
            print('branchsum', branchsum, keys)
            for k in keys:
                self.probs[k] = self.probs[k] + ['*' + name + '-prob']

        else:  #leaf of the tree, a single choice name
            keys.append(tree)
            self.probs[tree] = [tree + '-prob' +
                                '(%s)' % ', '.join(self.paramsind[tree])]
            if testxb:
                #only exercised when testxb is true; iterates over the
                #characters of the leaf name, so it expects them to be
                #datadict keys
                leavessum = sum(datadict[bi] for bi in tree)
                print('final branch with', tree, ''.join(tree), leavessum)
                return leavessum  #sum(xb[tree])
            else:
                return ''.join(tree)  #sum(tree)

        print('working on branch', tree, branchsum)
        return branchsum


#Trying out ways to handle data
#------------------------------

#travel data from Greene
dta = np.genfromtxt('TableF23-2.txt', skip_header=1,
                    names='Mode Ttme Invc Invt GC Hinc PSize'.split())

endog = dta['Mode'].reshape(-1, 4).copy()  #I do not want a view
nobs, nchoices = endog.shape
datafloat = dta.view(float).reshape(-1, 7)
exog = datafloat[:, 1:].reshape(-1, 6*nchoices).copy()  #I do not want a view

print(endog.sum(0))
varnames = dta.dtype.names
print(varnames[1:])
modes = ['Air', 'Train', 'Bus', 'Car']
print(exog.mean(0).reshape(nchoices, -1))  # Greene Table 23.23


#try dummy encoding for individual-specific variables
exog_choice_names = ['GC', 'Ttme']
exog_choice = np.column_stack([dta[name] for name in exog_choice_names])
exog_choice = exog_choice.reshape(-1, len(exog_choice_names)*nchoices)
exog_choice = np.c_[endog, exog_choice]  # add constant dummy

exog_individual = dta['Hinc'][:, None]

#exog2 = np.c_[exog_choice, exog_individual*endog]

# we can also overwrite and select in the original datafloat,
# e.g. Hinc*endog(choice)

choice_index = np.arange(dta.shape[0]) % nchoices
hinca = dta['Hinc'] * (choice_index == 0)
dta2 = recf.append_fields(dta, ['Hinca'], [hinca], usemask=False)
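
# The same interaction with plain arrays (added illustration; column 5 of
# datafloat is Hinc, following the names list in the genfromtxt call above):
hinca_plain = datafloat[:, 5] * (choice_index == 0)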

#another version
xi = []
for ii in range(4):
    xi.append(datafloat[choice_index == ii])

#one more
dta1 = recf.append_fields(dta, ['Const'], [np.ones(dta.shape[0])],
                          usemask=False)

xivar = [['GC', 'Ttme', 'Const', 'Hinc'],
         ['GC', 'Ttme', 'Const'],
         ['GC', 'Ttme', 'Const'],
         ['GC', 'Ttme']]  #need to drop one constant

xi = []
for ii in range(4):
    xi.append(dta1[xivar[ii]][choice_index == ii])
#this does not change the sequence of columns; bug report by Skipper, I think

ncommon = 2
betaind = [len(xi[ii].dtype.names) - ncommon for ii in range(4)]
zi = np.r_[[ncommon], ncommon + np.array(betaind).cumsum()]
z = np.arange(7)  #any length >= zi[-1] works; there are 6 parameters here
betaindices = [np.r_[np.array([0, 1]), z[zi[ii]:zi[ii+1]]]
               for ii in range(len(zi)-1)]

beta = np.arange(7)
betai = [beta[idx] for idx in betaindices]


#examples for TryCLogit
#----------------------

#get exogs as float
xifloat = [xx.view(float).reshape(nobs, -1) for xx in xi]
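
#note (added): the float view above assumes every selected field is float64,
#as produced by the genfromtxt call; mixed dtypes would misalign the columns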
clogit = TryCLogit(endog, xifloat, 2)

debug = 0
if debug:
    res = optimize.fmin(clogit.loglike, np.ones(6))
#estimated parameters from Greene:
tab2324 = [-0.15501, -0.09612, 0.01329, 5.2074, 3.8690, 3.1632]
if debug:
    res2 = optimize.fmin(clogit.loglike, tab2324)

res3 = optimize.fmin(clogit.loglike, np.zeros(6), maxfun=10000)
#this has the same numbers as Greene table 23.24, but in a different sequence
#the coefficient on GC comes out as exactly 10% of Greene's
#TODO: get better starting values
'''
Optimization terminated successfully.
         Current function value: 199.128369
         Iterations: 957
         Function evaluations: 1456
array([-0.0961246 , -0.0155019 ,  0.01328757,  5.20741244,  3.86905293,
        3.16319074])
'''
res3corr = res3[[1, 0, 2, 3, 4, 5]]  #reorder to match Greene's table
res3corr[0] *= 10  #rescale GC coefficient
print(res3corr - tab2324)  # diff 1e-5 to 1e-6
#199.128369 - 199.1284  #llf agrees up to the precision printed in Greene

print(clogit.fit())


tree0 = ('top',
         [('Fly', ['Air']),
          ('Ground', ['Train', 'Car', 'Bus'])
          ])

datadict = dict(zip(['Air', 'Train', 'Bus', 'Car'],
                    [xifloat[i] for i in range(4)]))

#for testing only (mock that returns its own name)
datadict = dict(zip(['Air', 'Train', 'Bus', 'Car'],
                    ['Airdata', 'Traindata', 'Busdata', 'Cardata']))

datadict.update({'top' : [],
                 'Fly' : [],
                 'Ground': []})

paramsind = {'top' : [],
             'Fly' : [],
             'Ground': [],
             'Air' : ['GC', 'Ttme', 'ConstA', 'Hinc'],
             'Train' : ['GC', 'Ttme', 'ConstT'],
             'Bus' : ['GC', 'Ttme', 'ConstB'],
             'Car' : ['GC', 'Ttme']
             }

modru = RU2NMNL(endog, datadict, tree0, paramsind)
print(modru.calc_prob(modru.tree))
print('\nmodru.probs')
print(modru.probs)