reconnect moved files to git repo

2025-08-01 04:33:03 -04:00
commit 5d3c35492d
23190 changed files with 4750716 additions and 0 deletions
--- a/venv/lib/python3.11/site-packages/statsmodels/sandbox/tools/cross_val.py
+++ b/venv/lib/python3.11/site-packages/statsmodels/sandbox/tools/cross_val.py
@ -0,0 +1,367 @@
+"""
+Utilities for cross validation.
+
+taken from scikits.learn
+
+# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>,
+#         Gael Varoquaux    <gael.varoquaux@normalesup.org>
+# License: BSD Style.
+# $Id$
+
+changes to code by josef-pktd:
+ - docstring formatting: underlines of headers
+
+"""
+
+from statsmodels.compat.python import lrange
+import numpy as np
+from itertools import combinations
+
+
+################################################################################
class LeaveOneOut:
    """
    Leave-One-Out cross validation iterator.

    Iterating yields one ``(train_index, test_index)`` pair of boolean masks
    per observation; each observation is the test set exactly once.
    """

    def __init__(self, n):
        """
        Leave-One-Out cross validation iterator.

        Parameters
        ----------
        n : int
            Total number of elements

        Examples
        --------
        >>> from statsmodels.sandbox.tools import cross_val
        >>> X = [[1, 2], [3, 4]]
        >>> y = [1, 2]
        >>> loo = cross_val.LeaveOneOut(2)
        >>> for train_index, test_index in loo:
        ...    print("TRAIN:", train_index, "TEST:", test_index)
        ...    X_train, X_test, y_train, y_test = cross_val.split(
        ...        train_index, test_index, X, y)
        ...    print(X_train, X_test, y_train, y_test)
        TRAIN: [False  True] TEST: [ True False]
        [[3 4]] [[1 2]] [2] [1]
        TRAIN: [ True False] TEST: [False  True]
        [[1 2]] [[3 4]] [1] [2]
        """
        self.n = n

    def __iter__(self):
        """Yield ``(train_index, test_index)`` boolean masks of length n."""
        size = self.n
        for held_out in range(size):
            # Fresh mask per split so callers may keep or mutate what they get.
            test_mask = np.zeros(size, dtype=bool)
            test_mask[held_out] = True
            yield ~test_mask, test_mask

    def __repr__(self):
        cls = self.__class__
        return '%s.%s(n=%i)' % (cls.__module__, cls.__name__, self.n)
+
+
+
+################################################################################
class LeavePOut:
    """
    Leave-P-Out cross validation iterator.

    Iterating yields one ``(train_index, test_index)`` pair of boolean masks
    for every combination of ``p`` elements out of ``n``.
    """

    def __init__(self, n, p):
        """
        Leave-P-Out cross validation iterator.

        Parameters
        ----------
        n : int
            Total number of elements
        p : int
            Size of each test set

        Examples
        --------
        >>> from statsmodels.sandbox.tools import cross_val
        >>> X = [[1, 2], [3, 4], [5, 6], [7, 8]]
        >>> y = [1, 2, 3, 4]
        >>> lpo = cross_val.LeavePOut(4, 2)
        >>> for train_index, test_index in lpo:
        ...    print("TRAIN:", train_index, "TEST:", test_index)
        ...    X_train, X_test, y_train, y_test = cross_val.split(
        ...        train_index, test_index, X, y)
        TRAIN: [False False  True  True] TEST: [ True  True False False]
        TRAIN: [False  True False  True] TEST: [ True False  True False]
        TRAIN: [False  True  True False] TEST: [ True False False  True]
        TRAIN: [ True False False  True] TEST: [False  True  True False]
        TRAIN: [ True False  True False] TEST: [False  True False  True]
        TRAIN: [ True  True False False] TEST: [False False  True  True]

        Notes
        -----
        ``p = 0`` yields a single split whose test mask is all False;
        ``p > n`` yields no splits.
        """
        self.n = n
        self.p = p

    def __iter__(self):
        """Yield ``(train_index, test_index)`` boolean masks of length n."""
        n = self.n
        # combinations() consumes any iterable, so a lazy range is enough.
        for held_out in combinations(range(n), self.p):
            test_index = np.zeros(n, dtype=bool)
            # Force an integer dtype: an empty tuple (p == 0) would otherwise
            # become a float array, which NumPy refuses to use as an index.
            test_index[np.asarray(held_out, dtype=np.intp)] = True
            yield np.logical_not(test_index), test_index

    def __repr__(self):
        return '%s.%s(n=%i, p=%i)' % (
            self.__class__.__module__,
            self.__class__.__name__,
            self.n,
            self.p,
        )
+
+
+################################################################################
class _KFoldArgumentError(ValueError, AssertionError):
    """
    Raised by KFold for an unusable number of folds.

    It is a ValueError because the argument value is wrong. It also derives
    from AssertionError because this check used to be an ``assert``
    statement, so existing ``except AssertionError`` handlers keep working.
    """


class KFold:
    """
    K-Folds cross validation iterator.

    Iterating yields ``k`` pairs ``(train_index, test_index)`` of boolean
    masks; the test masks are consecutive, non-overlapping blocks that
    together cover all ``n`` elements.
    """

    def __init__(self, n, k):
        """
        K-Folds cross validation iterator.

        Parameters
        ----------
        n : int
            Total number of elements
        k : int
            Number of folds, with ``1 <= k <= n``

        Raises
        ------
        ValueError
            If ``k`` is not positive or is larger than ``n``.

        Examples
        --------
        >>> from statsmodels.sandbox.tools import cross_val
        >>> X = [[1, 2], [3, 4], [1, 2], [3, 4]]
        >>> y = [1, 2, 3, 4]
        >>> kf = cross_val.KFold(4, k=2)
        >>> for train_index, test_index in kf:
        ...    print("TRAIN:", train_index, "TEST:", test_index)
        ...    X_train, X_test, y_train, y_test = cross_val.split(
        ...        train_index, test_index, X, y)
        TRAIN: [False False  True  True] TEST: [ True  True False False]
        TRAIN: [ True  True False False] TEST: [False False  True  True]

        Notes
        -----
        The first ``k - 1`` folds have size ``trunc(n/k)``; the last fold
        takes all remaining elements.
        """
        # Explicit raises instead of assert: asserts vanish under ``-O``.
        if k <= 0:
            raise _KFoldArgumentError('cannot have k below 1')
        if k > n:
            raise _KFoldArgumentError(
                'cannot have k=%d greater than %d' % (k, n))
        self.n = n
        self.k = k

    def __iter__(self):
        """Yield ``(train_index, test_index)`` boolean masks of length n."""
        n = self.n
        k = self.k
        # Floor division gives the documented trunc(n/k) fold size. Rounding
        # up instead can exhaust the data early and leave the trailing folds
        # empty (e.g. n=5, k=4).
        fold_size = int(n // k)

        for i in range(k):
            begin = i * fold_size
            # The last fold absorbs the remainder.
            end = begin + fold_size if i < k - 1 else n
            test_index = np.zeros(n, dtype=bool)
            test_index[begin:end] = True
            yield np.logical_not(test_index), test_index

    def __repr__(self):
        return '%s.%s(n=%i, k=%i)' % (
            self.__class__.__module__,
            self.__class__.__name__,
            self.n,
            self.k,
        )
+
+
+################################################################################
class LeaveOneLabelOut:
    """
    Leave-One-Label-Out cross validation iterator.

    Iterating yields one ``(train_index, test_index)`` pair of boolean masks
    per distinct label; the test mask selects every element with that label.
    """

    def __init__(self, labels):
        """
        Leave-One-Label-Out cross validation iterator.

        Parameters
        ----------
        labels : list
            List of labels, one per element

        Examples
        --------
        >>> from statsmodels.sandbox.tools import cross_val
        >>> X = [[1, 2], [3, 4], [5, 6], [7, 8]]
        >>> y = [1, 2, 1, 2]
        >>> labels = [1, 1, 2, 2]
        >>> lol = cross_val.LeaveOneLabelOut(labels)
        >>> for train_index, test_index in lol:
        ...    print("TRAIN:", train_index, "TEST:", test_index)
        ...    X_train, X_test, y_train, y_test = cross_val.split(
        ...        train_index, test_index, X, y)
        TRAIN: [False False  True  True] TEST: [ True  True False False]
        TRAIN: [ True  True False False] TEST: [False False  True  True]
        """
        self.labels = labels

    def __iter__(self):
        """Yield ``(train_index, test_index)`` masks, one pair per label."""
        # Work on a private copy so changes to self.labels made while the
        # iteration is in progress cannot affect the remaining splits.
        label_array = np.array(self.labels, copy=True)
        n_items = len(label_array)
        for label in np.unique(label_array):
            held_out = np.zeros(n_items, dtype=bool)
            held_out[label_array == label] = True
            yield ~held_out, held_out

    def __repr__(self):
        cls = self.__class__
        return '{}.{}(labels={})'.format(
            cls.__module__, cls.__name__, self.labels)
+
+
def split(train_indexes, test_indexes, *args):
    """
    Split every array in ``args`` into a train part and a test part.

    Each argument is converted with ``np.asanyarray`` and indexed once with
    ``train_indexes`` and once with ``test_indexes``.

    Returns
    -------
    list
        ``[arg0_train, arg0_test, arg1_train, arg1_test, ...]`` in the order
        the arguments were given; empty if no arrays are passed.
    """
    pieces = []
    for data in map(np.asanyarray, args):
        pieces.extend((data[train_indexes], data[test_indexes]))
    return pieces
+
+'''
+ >>> cv = cross_val.LeaveOneLabelOut(X, y) # y making y optional and
+possible to add other arrays of the same shape[0] too
+ >>> for X_train, y_train, X_test, y_test in cv:
+ ...      print np.sqrt((model.fit(X_train, y_train).predict(X_test)
+- y_test) ** 2).mean())
+'''
+
+
+################################################################################
+#below: Author: josef-pktd
+
class KStepAhead:
    """
    KStepAhead cross validation iterator.

    Provides fit/test indexes to split data in sequential sets: every split
    fits on the first ``i`` elements and tests on elements that follow them.
    """

    def __init__(self, n, k=1, start=None, kall=True, return_slice=True):
        """
        KStepAhead cross validation iterator.

        Parameters
        ----------
        n : int
            Total number of elements
        k : int
            number of steps ahead
        start : int
            initial size of data for fitting. If None, ``trunc(0.25 * n)``
            is used (an arbitrary choice).
        kall : bool
            If true, all values for up to k-step ahead are included in the
            test index. If false, then only the k-th step ahead value is
            returned.
        return_slice : bool
            If true, each split is a pair of ``slice`` objects. If false,
            each split is a pair of boolean masks of length ``n``, like the
            other iterators in this module.

        Notes
        -----
        For every ``i`` from ``start`` through ``n - k`` the fitting set is
        ``[:i]`` and the test set is ``[i:i+k]`` (``kall`` true) or
        ``[i+k-1:i+k]`` (``kall`` false), so the last test window ends at
        the final element.

        I do not think this is really useful, because it can be done with
        a very simple loop instead.

        Examples
        --------
        >>> from statsmodels.sandbox.tools import cross_val
        >>> for train, test in cross_val.KStepAhead(5, k=2, start=2):
        ...    print(train, test)
        slice(None, 2, None) slice(2, 4, None)
        slice(None, 3, None) slice(3, 5, None)
        """
        self.n = n
        self.k = k
        if start is None:
            start = int(np.trunc(n*0.25)) # pick something arbitrary
        self.start = start
        self.kall = kall
        self.return_slice = return_slice

    def __iter__(self):
        """Yield ``(train, test)`` pairs as slices or boolean masks."""
        n = self.n
        k = self.k
        # The upper bound is inclusive of n - k: the window [n-k, n) is the
        # last one that still fits, and stopping one earlier would leave the
        # final element out of every test set.
        for i in range(self.start, n - k + 1):
            test_stop = i + k
            test_start = i if self.kall else test_stop - 1
            if self.return_slice:
                yield slice(None, i, None), slice(test_start, test_stop)
            else:
                # Mask form, for compatibility with the other iterators.
                train_index = np.zeros(n, dtype=bool)
                train_index[:i] = True
                test_index = np.zeros(n, dtype=bool)
                test_index[test_start:test_stop] = True
                yield train_index, test_index

    def __repr__(self):
        return '%s.%s(n=%i)' % (self.__class__.__module__,
                                self.__class__.__name__,
                                self.n,
                                )