reconnect moved files to git repo

2025-08-01 04:33:03 -04:00
commit 5d3c35492d
23190 changed files with 4750716 additions and 0 deletions
--- a/venv/lib/python3.11/site-packages/sklearn/datasets/_species_distributions.py
+++ b/venv/lib/python3.11/site-packages/sklearn/datasets/_species_distributions.py
@ -0,0 +1,299 @@
+"""
+=============================
+Species distribution dataset
+=============================
+
+This dataset represents the geographic distribution of species.
+The dataset is provided by Phillips et al. (2006).
+
+The two species are:
+
+ - `"Bradypus variegatus"
+   <http://www.iucnredlist.org/details/3038/0>`_ ,
+   the Brown-throated Sloth.
+
+ - `"Microryzomys minutus"
+   <http://www.iucnredlist.org/details/13408/0>`_ ,
+   also known as the Forest Small Rice Rat, a rodent that lives in
+   Colombia, Ecuador, Peru, and Venezuela.
+
+References
+----------
+
+`"Maximum entropy modeling of species geographic distributions"
+<http://rob.schapire.net/papers/ecolmod.pdf>`_ S. J. Phillips,
+R. P. Anderson, R. E. Schapire - Ecological Modelling, 190:231-259, 2006.
+
+Notes
+-----
+
+For an example of using this dataset, see
+:ref:`examples/applications/plot_species_distribution_modeling.py
+<sphx_glr_auto_examples_applications_plot_species_distribution_modeling.py>`.
+"""
+
+# Authors: Peter Prettenhofer <peter.prettenhofer@gmail.com>
+#          Jake Vanderplas <vanderplas@astro.washington.edu>
+#
+# License: BSD 3 clause
+
+import logging
+from io import BytesIO
+from numbers import Integral, Real
+from os import PathLike, makedirs, remove
+from os.path import exists
+
+import joblib
+import numpy as np
+
+from ..utils import Bunch
+from ..utils._param_validation import Interval, validate_params
+from . import get_data_home
+from ._base import RemoteFileMetadata, _fetch_remote, _pkl_filepath
+
+# Remote archive holding the species occurrence records; its members whose
+# names contain "train" / "test" are parsed by `_load_csv` in
+# `fetch_species_distributions`.
+# The original data can be found at:
+# https://biodiversityinformatics.amnh.org/open_source/maxent/samples.zip
+# NOTE(review): the checksum is 64 hex characters, presumably a SHA-256
+# digest checked by `_fetch_remote` -- confirm in `_base`.
+SAMPLES = RemoteFileMetadata(
+    filename="samples.zip",
+    url="https://ndownloader.figshare.com/files/5976075",
+    checksum="abb07ad284ac50d9e6d20f1c4211e0fd3c098f7f85955e89d321ee8efe37ac28",
+)
+
+# Remote archive holding the coverage grids; every member is parsed by
+# `_load_coverage` in `fetch_species_distributions`.
+# The original data can be found at:
+# https://biodiversityinformatics.amnh.org/open_source/maxent/coverages.zip
+COVERAGES = RemoteFileMetadata(
+    filename="coverages.zip",
+    url="https://ndownloader.figshare.com/files/5976078",
+    checksum="4d862674d72e79d6cee77e63b98651ec7926043ba7d39dcb31329cf3f6073807",
+)
+
+# Name handed to `_pkl_filepath` to build the path of the local cache file
+# that `fetch_species_distributions` writes with `joblib.dump` and reads back
+# with `joblib.load`.
+DATA_ARCHIVE_NAME = "species_coverage.pkz"
+
+
+# Module-level logger used for the download / conversion progress messages.
+logger = logging.getLogger(__name__)
+
+
+def _load_coverage(F, header_length=6, dtype=np.int16):
+    """Load a coverage file from an open file object.
+
+    This will return a numpy array of the given dtype
+    """
+    header = [F.readline() for _ in range(header_length)]
+    make_tuple = lambda t: (t.split()[0], float(t.split()[1]))
+    header = dict([make_tuple(line) for line in header])
+
+    M = np.loadtxt(F, dtype=dtype)
+    nodata = int(header[b"NODATA_value"])
+    if nodata != -9999:
+        M[nodata] = -9999
+    return M
+
+
+def _load_csv(F):
+    """Load csv file.
+
+    Parameters
+    ----------
+    F : file object
+        CSV file open in byte mode.
+
+    Returns
+    -------
+    rec : np.ndarray
+        record array representing the data
+    """
+    names = F.readline().decode("ascii").strip().split(",")
+
+    rec = np.loadtxt(F, skiprows=0, delimiter=",", dtype="S22,f4,f4")
+    rec.dtype.names = names
+    return rec
+
+
+def construct_grids(batch):
+    """Construct the map grid from the batch object
+
+    Parameters
+    ----------
+    batch : Batch object
+        The object returned by :func:`fetch_species_distributions`
+
+    Returns
+    -------
+    (xgrid, ygrid) : 1-D arrays
+        The grid corresponding to the values in batch.coverages
+    """
+    # x,y coordinates for corner cells
+    xmin = batch.x_left_lower_corner + batch.grid_size
+    xmax = xmin + (batch.Nx * batch.grid_size)
+    ymin = batch.y_left_lower_corner + batch.grid_size
+    ymax = ymin + (batch.Ny * batch.grid_size)
+
+    # x coordinates of the grid cells
+    xgrid = np.arange(xmin, xmax, batch.grid_size)
+    # y coordinates of the grid cells
+    ygrid = np.arange(ymin, ymax, batch.grid_size)
+
+    return (xgrid, ygrid)
+
+
+@validate_params(
+    {
+        "data_home": [str, PathLike, None],
+        "download_if_missing": ["boolean"],
+        "n_retries": [Interval(Integral, 1, None, closed="left")],
+        "delay": [Interval(Real, 0.0, None, closed="neither")],
+    },
+    prefer_skip_nested_validation=True,
+)
+def fetch_species_distributions(
+    *,
+    data_home=None,
+    download_if_missing=True,
+    n_retries=3,
+    delay=1.0,
+):
+    """Loader for species distribution dataset from Phillips et. al. (2006).
+
+    Read more in the :ref:`User Guide <species_distribution_dataset>`.
+
+    Parameters
+    ----------
+    data_home : str or path-like, default=None
+        Specify another download and cache folder for the datasets. By default
+        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.
+
+    download_if_missing : bool, default=True
+        If False, raise an OSError if the data is not locally available
+        instead of trying to download the data from the source site.
+
+    n_retries : int, default=3
+        Number of retries when HTTP errors are encountered.
+
+        .. versionadded:: 1.5
+
+    delay : float, default=1.0
+        Number of seconds between retries.
+
+        .. versionadded:: 1.5
+
+    Returns
+    -------
+    data : :class:`~sklearn.utils.Bunch`
+        Dictionary-like object, with the following attributes.
+
+        coverages : array, shape = [14, 1592, 1212]
+            These represent the 14 features measured
+            at each point of the map grid.
+            The latitude/longitude values for the grid are discussed below.
+            Missing data is represented by the value -9999.
+        train : record array, shape = (1624,)
+            The training points for the data.  Each point has three fields:
+
+            - train['species'] is the species name
+            - train['dd long'] is the longitude, in degrees
+            - train['dd lat'] is the latitude, in degrees
+        test : record array, shape = (620,)
+            The test points for the data.  Same format as the training data.
+        Nx, Ny : integers
+            The number of longitudes (x) and latitudes (y) in the grid
+        x_left_lower_corner, y_left_lower_corner : floats
+            The (x,y) position of the lower-left corner, in degrees
+        grid_size : float
+            The spacing between points of the grid, in degrees
+
+    Notes
+    -----
+
+    This dataset represents the geographic distribution of species.
+    The dataset is provided by Phillips et. al. (2006).
+
+    The two species are:
+
+    - `"Bradypus variegatus"
+      <http://www.iucnredlist.org/details/3038/0>`_ ,
+      the Brown-throated Sloth.
+
+    - `"Microryzomys minutus"
+      <http://www.iucnredlist.org/details/13408/0>`_ ,
+      also known as the Forest Small Rice Rat, a rodent that lives in Peru,
+      Colombia, Ecuador, Peru, and Venezuela.
+
+    - For an example of using this dataset with scikit-learn, see
+      :ref:`examples/applications/plot_species_distribution_modeling.py
+      <sphx_glr_auto_examples_applications_plot_species_distribution_modeling.py>`.
+
+    References
+    ----------
+
+    * `"Maximum entropy modeling of species geographic distributions"
+      <http://rob.schapire.net/papers/ecolmod.pdf>`_
+      S. J. Phillips, R. P. Anderson, R. E. Schapire - Ecological Modelling,
+      190:231-259, 2006.
+
+    Examples
+    --------
+    >>> from sklearn.datasets import fetch_species_distributions
+    >>> species = fetch_species_distributions()
+    >>> species.train[:5]
+    array([(b'microryzomys_minutus', -64.7   , -17.85  ),
+           (b'microryzomys_minutus', -67.8333, -16.3333),
+           (b'microryzomys_minutus', -67.8833, -16.3   ),
+           (b'microryzomys_minutus', -67.8   , -16.2667),
+           (b'microryzomys_minutus', -67.9833, -15.9   )],
+          dtype=[('species', 'S22'), ('dd long', '<f4'), ('dd lat', '<f4')])
+    """
+    data_home = get_data_home(data_home)
+    if not exists(data_home):
+        makedirs(data_home)
+
+    # Define parameters for the data files.  These should not be changed
+    # unless the data model changes.  They will be saved in the npz file
+    # with the downloaded data.
+    extra_params = dict(
+        x_left_lower_corner=-94.8,
+        Nx=1212,
+        y_left_lower_corner=-56.05,
+        Ny=1592,
+        grid_size=0.05,
+    )
+    dtype = np.int16
+
+    archive_path = _pkl_filepath(data_home, DATA_ARCHIVE_NAME)
+
+    if not exists(archive_path):
+        if not download_if_missing:
+            raise OSError("Data not found and `download_if_missing` is False")
+        logger.info("Downloading species data from %s to %s" % (SAMPLES.url, data_home))
+        samples_path = _fetch_remote(
+            SAMPLES, dirname=data_home, n_retries=n_retries, delay=delay
+        )
+        with np.load(samples_path) as X:  # samples.zip is a valid npz
+            for f in X.files:
+                fhandle = BytesIO(X[f])
+                if "train" in f:
+                    train = _load_csv(fhandle)
+                if "test" in f:
+                    test = _load_csv(fhandle)
+        remove(samples_path)
+
+        logger.info(
+            "Downloading coverage data from %s to %s" % (COVERAGES.url, data_home)
+        )
+        coverages_path = _fetch_remote(
+            COVERAGES, dirname=data_home, n_retries=n_retries, delay=delay
+        )
+        with np.load(coverages_path) as X:  # coverages.zip is a valid npz
+            coverages = []
+            for f in X.files:
+                fhandle = BytesIO(X[f])
+                logger.debug(" - converting {}".format(f))
+                coverages.append(_load_coverage(fhandle))
+            coverages = np.asarray(coverages, dtype=dtype)
+        remove(coverages_path)
+
+        bunch = Bunch(coverages=coverages, test=test, train=train, **extra_params)
+        joblib.dump(bunch, archive_path, compress=9)
+    else:
+        bunch = joblib.load(archive_path)
+
+    return bunch