reconnect moved files to git repo
This commit is contained in:
@ -0,0 +1,299 @@
|
||||
"""
|
||||
=============================
|
||||
Species distribution dataset
|
||||
=============================
|
||||
|
||||
This dataset represents the geographic distribution of species.
|
||||
The dataset is provided by Phillips et al. (2006).
|
||||
|
||||
The two species are:
|
||||
|
||||
- `"Bradypus variegatus"
|
||||
<http://www.iucnredlist.org/details/3038/0>`_ ,
|
||||
the Brown-throated Sloth.
|
||||
|
||||
- `"Microryzomys minutus"
|
||||
<http://www.iucnredlist.org/details/13408/0>`_ ,
|
||||
also known as the Forest Small Rice Rat, a rodent that lives in
|
||||
Colombia, Ecuador, Peru, and Venezuela.
|
||||
|
||||
References
|
||||
----------
|
||||
|
||||
`"Maximum entropy modeling of species geographic distributions"
|
||||
<http://rob.schapire.net/papers/ecolmod.pdf>`_ S. J. Phillips,
|
||||
R. P. Anderson, R. E. Schapire - Ecological Modelling, 190:231-259, 2006.
|
||||
|
||||
Notes
|
||||
-----
|
||||
|
||||
For an example of using this dataset, see
|
||||
:ref:`examples/applications/plot_species_distribution_modeling.py
|
||||
<sphx_glr_auto_examples_applications_plot_species_distribution_modeling.py>`.
|
||||
"""
|
||||
|
||||
# Authors: Peter Prettenhofer <peter.prettenhofer@gmail.com>
|
||||
# Jake Vanderplas <vanderplas@astro.washington.edu>
|
||||
#
|
||||
# License: BSD 3 clause
|
||||
|
||||
import logging
|
||||
from io import BytesIO
|
||||
from numbers import Integral, Real
|
||||
from os import PathLike, makedirs, remove
|
||||
from os.path import exists
|
||||
|
||||
import joblib
|
||||
import numpy as np
|
||||
|
||||
from ..utils import Bunch
|
||||
from ..utils._param_validation import Interval, validate_params
|
||||
from . import get_data_home
|
||||
from ._base import RemoteFileMetadata, _fetch_remote, _pkl_filepath
|
||||
|
||||
# Remote archive holding the species occurrence samples. It is downloaded by
# `fetch_species_distributions`, which parses every member whose name contains
# "train" or "test" as a CSV file.
# The original data can be found at:
# https://biodiversityinformatics.amnh.org/open_source/maxent/samples.zip
# NOTE(review): the checksum is 64 hex digits, presumably SHA-256 -- confirm
# against `_fetch_remote`.
SAMPLES = RemoteFileMetadata(
    filename="samples.zip",
    url="https://ndownloader.figshare.com/files/5976075",
    checksum="abb07ad284ac50d9e6d20f1c4211e0fd3c098f7f85955e89d321ee8efe37ac28",
)

# Remote archive holding the coverage grids. Every member is parsed with
# `_load_coverage` by `fetch_species_distributions`.
# The original data can be found at:
# https://biodiversityinformatics.amnh.org/open_source/maxent/coverages.zip
COVERAGES = RemoteFileMetadata(
    filename="coverages.zip",
    url="https://ndownloader.figshare.com/files/5976078",
    checksum="4d862674d72e79d6cee77e63b98651ec7926043ba7d39dcb31329cf3f6073807",
)

# Name of the local cache file; its full path is derived from the data home
# directory with `_pkl_filepath`.
DATA_ARCHIVE_NAME = "species_coverage.pkz"


# Module-level logger used to report download and conversion progress.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _load_coverage(F, header_length=6, dtype=np.int16):
    """Load a coverage file from an open file object.

    The file must start with ``header_length`` lines of whitespace-separated
    ``<key> <value>`` pairs, one of which is ``NODATA_value``, followed by a
    whitespace-separated grid of numbers.

    Parameters
    ----------
    F : file object
        Coverage file open in byte mode, positioned at the start of the
        header.

    header_length : int, default=6
        Number of header lines that precede the grid.

    dtype : dtype, default=np.int16
        Data type of the returned array.

    Returns
    -------
    M : ndarray
        The grid values as an array of the given dtype. Every cell equal to
        the file's ``NODATA_value`` is replaced by -9999.
    """
    header = {}
    for _ in range(header_length):
        fields = F.readline().split()
        header[fields[0]] = float(fields[1])

    M = np.loadtxt(F, dtype=dtype)
    nodata = int(header[b"NODATA_value"])
    if nodata != -9999:
        # Select the cells by value. Indexing with the scalar (``M[nodata]``)
        # would overwrite a whole row, or raise IndexError, instead of
        # normalising the missing-data marker.
        M[M == nodata] = -9999
    return M
|
||||
|
||||
|
||||
def _load_csv(F):
    """Read a comma-separated sample file into a structured array.

    The first line supplies the field names; each remaining line is parsed
    as one byte string of up to 22 characters followed by two 32-bit floats.

    Parameters
    ----------
    F : file object
        CSV file open in byte mode.

    Returns
    -------
    rec : np.ndarray
        Structured array whose field names are taken from the header line.
    """
    header_line = F.readline()
    field_names = header_line.decode("ascii").strip().split(",")

    # loadtxt continues from the current file position, i.e. after the header.
    records = np.loadtxt(F, delimiter=",", dtype="S22,f4,f4")
    records.dtype.names = field_names
    return records
|
||||
|
||||
|
||||
def construct_grids(batch):
    """Build the x and y coordinate axes of the map grid.

    Parameters
    ----------
    batch : Bunch object
        The object returned by :func:`fetch_species_distributions`. Its
        ``x_left_lower_corner``, ``y_left_lower_corner``, ``Nx``, ``Ny`` and
        ``grid_size`` attributes are read.

    Returns
    -------
    (xgrid, ygrid) : 1-D arrays
        The grid corresponding to the values in batch.coverages
    """
    step = batch.grid_size

    def _axis(lower_corner, n_cells):
        # The first coordinate sits one grid step past the lower-left corner;
        # the axis then spans n_cells steps.
        start = lower_corner + step
        stop = start + (n_cells * step)
        return np.arange(start, stop, step)

    xgrid = _axis(batch.x_left_lower_corner, batch.Nx)
    ygrid = _axis(batch.y_left_lower_corner, batch.Ny)
    return (xgrid, ygrid)
|
||||
|
||||
|
||||
@validate_params(
    {
        "data_home": [str, PathLike, None],
        "download_if_missing": ["boolean"],
        "n_retries": [Interval(Integral, 1, None, closed="left")],
        "delay": [Interval(Real, 0.0, None, closed="neither")],
    },
    prefer_skip_nested_validation=True,
)
def fetch_species_distributions(
    *,
    data_home=None,
    download_if_missing=True,
    n_retries=3,
    delay=1.0,
):
    """Loader for species distribution dataset from Phillips et al. (2006).

    Read more in the :ref:`User Guide <species_distribution_dataset>`.

    Parameters
    ----------
    data_home : str or path-like, default=None
        Specify another download and cache folder for the datasets. By default
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.

    download_if_missing : bool, default=True
        If False, raise an OSError if the data is not locally available
        instead of trying to download the data from the source site.

    n_retries : int, default=3
        Number of retries when HTTP errors are encountered.

        .. versionadded:: 1.5

    delay : float, default=1.0
        Number of seconds between retries.

        .. versionadded:: 1.5

    Returns
    -------
    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        coverages : array, shape = [14, 1592, 1212]
            These represent the 14 features measured
            at each point of the map grid.
            The latitude/longitude values for the grid are discussed below.
            Missing data is represented by the value -9999.
        train : record array, shape = (1624,)
            The training points for the data.  Each point has three fields:

            - train['species'] is the species name
            - train['dd long'] is the longitude, in degrees
            - train['dd lat'] is the latitude, in degrees
        test : record array, shape = (620,)
            The test points for the data.  Same format as the training data.
        Nx, Ny : integers
            The number of longitudes (x) and latitudes (y) in the grid
        x_left_lower_corner, y_left_lower_corner : floats
            The (x,y) position of the lower-left corner, in degrees
        grid_size : float
            The spacing between points of the grid, in degrees

    Raises
    ------
    OSError
        If the cached archive does not exist and `download_if_missing` is
        False.

    Notes
    -----

    This dataset represents the geographic distribution of species.
    The dataset is provided by Phillips et al. (2006).

    The two species are:

    - `"Bradypus variegatus"
      <http://www.iucnredlist.org/details/3038/0>`_ ,
      the Brown-throated Sloth.

    - `"Microryzomys minutus"
      <http://www.iucnredlist.org/details/13408/0>`_ ,
      also known as the Forest Small Rice Rat, a rodent that lives in
      Colombia, Ecuador, Peru, and Venezuela.

    - For an example of using this dataset with scikit-learn, see
      :ref:`examples/applications/plot_species_distribution_modeling.py
      <sphx_glr_auto_examples_applications_plot_species_distribution_modeling.py>`.

    References
    ----------

    * `"Maximum entropy modeling of species geographic distributions"
      <http://rob.schapire.net/papers/ecolmod.pdf>`_
      S. J. Phillips, R. P. Anderson, R. E. Schapire - Ecological Modelling,
      190:231-259, 2006.

    Examples
    --------
    >>> from sklearn.datasets import fetch_species_distributions
    >>> species = fetch_species_distributions()
    >>> species.train[:5]
    array([(b'microryzomys_minutus', -64.7   , -17.85  ),
           (b'microryzomys_minutus', -67.8333, -16.3333),
           (b'microryzomys_minutus', -67.8833, -16.3   ),
           (b'microryzomys_minutus', -67.8   , -16.2667),
           (b'microryzomys_minutus', -67.9833, -15.9   )],
          dtype=[('species', 'S22'), ('dd long', '<f4'), ('dd lat', '<f4')])
    """
    data_home = get_data_home(data_home)
    # exist_ok avoids the check-then-create race of `exists` + `makedirs`
    # when several processes fetch the dataset at the same time.
    makedirs(data_home, exist_ok=True)

    # Define parameters for the data files.  These should not be changed
    # unless the data model changes.  They will be saved in the cached
    # archive together with the downloaded data.
    extra_params = dict(
        x_left_lower_corner=-94.8,
        Nx=1212,
        y_left_lower_corner=-56.05,
        Ny=1592,
        grid_size=0.05,
    )
    dtype = np.int16

    archive_path = _pkl_filepath(data_home, DATA_ARCHIVE_NAME)

    if exists(archive_path):
        # NOTE: the cache is a pickle-based joblib archive; it must only be
        # loaded from a trusted `data_home`.
        return joblib.load(archive_path)

    if not download_if_missing:
        raise OSError("Data not found and `download_if_missing` is False")

    logger.info("Downloading species data from %s to %s", SAMPLES.url, data_home)
    samples_path = _fetch_remote(
        SAMPLES, dirname=data_home, n_retries=n_retries, delay=delay
    )
    try:
        with np.load(samples_path) as X:  # samples.zip is a valid npz
            for f in X.files:
                fhandle = BytesIO(X[f])
                if "train" in f:
                    train = _load_csv(fhandle)
                if "test" in f:
                    test = _load_csv(fhandle)
    finally:
        # Do not leave the downloaded archive behind, even if parsing fails.
        remove(samples_path)

    logger.info("Downloading coverage data from %s to %s", COVERAGES.url, data_home)
    coverages_path = _fetch_remote(
        COVERAGES, dirname=data_home, n_retries=n_retries, delay=delay
    )
    try:
        with np.load(coverages_path) as X:  # coverages.zip is a valid npz
            coverages = []
            for f in X.files:
                fhandle = BytesIO(X[f])
                logger.debug(" - converting %s", f)
                coverages.append(_load_coverage(fhandle))
            coverages = np.asarray(coverages, dtype=dtype)
    finally:
        remove(coverages_path)

    bunch = Bunch(coverages=coverages, test=test, train=train, **extra_params)
    # Cache the assembled dataset so later calls skip download and parsing.
    joblib.dump(bunch, archive_path, compress=9)

    return bunch
|
||||
Reference in New Issue
Block a user