""" ============================= Species distribution dataset ============================= This dataset represents the geographic distribution of species. The dataset is provided by Phillips et. al. (2006). The two species are: - `"Bradypus variegatus" `_ , the Brown-throated Sloth. - `"Microryzomys minutus" `_ , also known as the Forest Small Rice Rat, a rodent that lives in Peru, Colombia, Ecuador, Peru, and Venezuela. References ---------- `"Maximum entropy modeling of species geographic distributions" `_ S. J. Phillips, R. P. Anderson, R. E. Schapire - Ecological Modelling, 190:231-259, 2006. Notes ----- For an example of using this dataset, see :ref:`examples/applications/plot_species_distribution_modeling.py `. """ # Authors: Peter Prettenhofer # Jake Vanderplas # # License: BSD 3 clause import logging from io import BytesIO from os import PathLike, makedirs, remove from os.path import exists import joblib import numpy as np from ..utils import Bunch from ..utils._param_validation import validate_params from . import get_data_home from ._base import RemoteFileMetadata, _fetch_remote, _pkl_filepath # The original data can be found at: # https://biodiversityinformatics.amnh.org/open_source/maxent/samples.zip SAMPLES = RemoteFileMetadata( filename="samples.zip", url="https://ndownloader.figshare.com/files/5976075", checksum="abb07ad284ac50d9e6d20f1c4211e0fd3c098f7f85955e89d321ee8efe37ac28", ) # The original data can be found at: # https://biodiversityinformatics.amnh.org/open_source/maxent/coverages.zip COVERAGES = RemoteFileMetadata( filename="coverages.zip", url="https://ndownloader.figshare.com/files/5976078", checksum="4d862674d72e79d6cee77e63b98651ec7926043ba7d39dcb31329cf3f6073807", ) DATA_ARCHIVE_NAME = "species_coverage.pkz" logger = logging.getLogger(__name__) def _load_coverage(F, header_length=6, dtype=np.int16): """Load a coverage file from an open file object. This will return a numpy array of the given dtype """ header = [F.readline() for _ in range(header_length)] make_tuple = lambda t: (t.split()[0], float(t.split()[1])) header = dict([make_tuple(line) for line in header]) M = np.loadtxt(F, dtype=dtype) nodata = int(header[b"NODATA_value"]) if nodata != -9999: M[nodata] = -9999 return M def _load_csv(F): """Load csv file. Parameters ---------- F : file object CSV file open in byte mode. Returns ------- rec : np.ndarray record array representing the data """ names = F.readline().decode("ascii").strip().split(",") rec = np.loadtxt(F, skiprows=0, delimiter=",", dtype="S22,f4,f4") rec.dtype.names = names return rec def construct_grids(batch): """Construct the map grid from the batch object Parameters ---------- batch : Batch object The object returned by :func:`fetch_species_distributions` Returns ------- (xgrid, ygrid) : 1-D arrays The grid corresponding to the values in batch.coverages """ # x,y coordinates for corner cells xmin = batch.x_left_lower_corner + batch.grid_size xmax = xmin + (batch.Nx * batch.grid_size) ymin = batch.y_left_lower_corner + batch.grid_size ymax = ymin + (batch.Ny * batch.grid_size) # x coordinates of the grid cells xgrid = np.arange(xmin, xmax, batch.grid_size) # y coordinates of the grid cells ygrid = np.arange(ymin, ymax, batch.grid_size) return (xgrid, ygrid) @validate_params( {"data_home": [str, PathLike, None], "download_if_missing": ["boolean"]}, prefer_skip_nested_validation=True, ) def fetch_species_distributions(*, data_home=None, download_if_missing=True): """Loader for species distribution dataset from Phillips et. al. (2006). Read more in the :ref:`User Guide `. Parameters ---------- data_home : str or path-like, default=None Specify another download and cache folder for the datasets. By default all scikit-learn data is stored in '~/scikit_learn_data' subfolders. download_if_missing : bool, default=True If False, raise an OSError if the data is not locally available instead of trying to download the data from the source site. Returns ------- data : :class:`~sklearn.utils.Bunch` Dictionary-like object, with the following attributes. coverages : array, shape = [14, 1592, 1212] These represent the 14 features measured at each point of the map grid. The latitude/longitude values for the grid are discussed below. Missing data is represented by the value -9999. train : record array, shape = (1624,) The training points for the data. Each point has three fields: - train['species'] is the species name - train['dd long'] is the longitude, in degrees - train['dd lat'] is the latitude, in degrees test : record array, shape = (620,) The test points for the data. Same format as the training data. Nx, Ny : integers The number of longitudes (x) and latitudes (y) in the grid x_left_lower_corner, y_left_lower_corner : floats The (x,y) position of the lower-left corner, in degrees grid_size : float The spacing between points of the grid, in degrees Notes ----- This dataset represents the geographic distribution of species. The dataset is provided by Phillips et. al. (2006). The two species are: - `"Bradypus variegatus" `_ , the Brown-throated Sloth. - `"Microryzomys minutus" `_ , also known as the Forest Small Rice Rat, a rodent that lives in Peru, Colombia, Ecuador, Peru, and Venezuela. - For an example of using this dataset with scikit-learn, see :ref:`examples/applications/plot_species_distribution_modeling.py `. References ---------- * `"Maximum entropy modeling of species geographic distributions" `_ S. J. Phillips, R. P. Anderson, R. E. Schapire - Ecological Modelling, 190:231-259, 2006. Examples -------- >>> from sklearn.datasets import fetch_species_distributions >>> species = fetch_species_distributions() >>> species.train[:5] array([(b'microryzomys_minutus', -64.7 , -17.85 ), (b'microryzomys_minutus', -67.8333, -16.3333), (b'microryzomys_minutus', -67.8833, -16.3 ), (b'microryzomys_minutus', -67.8 , -16.2667), (b'microryzomys_minutus', -67.9833, -15.9 )], dtype=[('species', 'S22'), ('dd long', '