
    h$,f#                     :   d Z ddlZddlmZ ddlmZmZmZ ddlm	Z	 ddl
Z
ddlZddlmZ ddlmZ d	d
lmZ d	dlmZmZmZ  eddd      Z eddd      ZdZ ej4                  e      Zdej:                  fdZd Zd Z  ee!edgdgdd      dddd       Z"y)a  
=============================
Species distribution dataset
=============================

This dataset represents the geographic distribution of species.
The dataset is provided by Phillips et. al. (2006).

The two species are:

 - `"Bradypus variegatus"
   <http://www.iucnredlist.org/details/3038/0>`_ ,
   the Brown-throated Sloth.

 - `"Microryzomys minutus"
   <http://www.iucnredlist.org/details/13408/0>`_ ,
   also known as the Forest Small Rice Rat, a rodent that lives in Peru,
   Colombia, Ecuador, Peru, and Venezuela.

References
----------

`"Maximum entropy modeling of species geographic distributions"
<http://rob.schapire.net/papers/ecolmod.pdf>`_ S. J. Phillips,
R. P. Anderson, R. E. Schapire - Ecological Modelling, 190:231-259, 2006.

Notes
-----

For an example of using this dataset, see
:ref:`examples/applications/plot_species_distribution_modeling.py
<sphx_glr_auto_examples_applications_plot_species_distribution_modeling.py>`.
    N)BytesIO)PathLikemakedirsremove)exists   )Bunch)validate_params   )get_data_home)RemoteFileMetadata_fetch_remote_pkl_filepathzsamples.zipz.https://ndownloader.figshare.com/files/5976075@abb07ad284ac50d9e6d20f1c4211e0fd3c098f7f85955e89d321ee8efe37ac28)filenameurlchecksumzcoverages.zipz.https://ndownloader.figshare.com/files/5976078@4d862674d72e79d6cee77e63b98651ec7926043ba7d39dcb31329cf3f6073807zspecies_coverage.pkz   c                    t        |      D cg c]  }| j                          }}d }t        |D cg c]
  } ||       c}      }t        j                  | |      }t        |d         }|dk7  rd||<   |S c c}w c c}w )zjLoad a coverage file from an open file object.

    This will return a numpy array of the given dtype
    c                 `    | j                         d   t        | j                         d         fS )Nr   r   )splitfloat)ts    Glib/python3.12/site-packages/sklearn/datasets/_species_distributions.py<lambda>z _load_coverage.<locals>.<lambda>Q   s$    AGGIaL%	!*=>     dtypes   NODATA_valuei)rangereadlinedictnploadtxtint)	Fheader_lengthr   _header
make_tuplelineMnodatas	            r   _load_coverager.   K   s}    
 %*-$89qajjl9F9>J7:d#78F


1E"A()F&	H :7s
   A:A?c                     | j                         j                  d      j                         j                  d      }t	        j
                  | ddd      }||j                  _        |S )zLoad csv file.

    Parameters
    ----------
    F : file object
        CSV file open in byte mode.

    Returns
    -------
    rec : np.ndarray
        record array representing the data
    ascii,r   z	S22,f4,f4)skiprows	delimiterr   )r!   decodestripr   r#   r$   r   names)r&   r6   recs      r   	_load_csvr8   [   sR     JJL(..066s;E
**Qc
ECCIIOJr   c                 b   | j                   | j                  z   }|| j                  | j                  z  z   }| j                  | j                  z   }|| j                  | j                  z  z   }t        j                  ||| j                        }t        j                  ||| j                        }||fS )a%  Construct the map grid from the batch object

    Parameters
    ----------
    batch : Batch object
        The object returned by :func:`fetch_species_distributions`

    Returns
    -------
    (xgrid, ygrid) : 1-D arrays
        The grid corresponding to the values in batch.coverages
    )x_left_lower_corner	grid_sizeNxy_left_lower_cornerNyr#   arange)batchxminxmaxyminymaxxgridygrids          r   construct_gridsrG   o   s     $$u6D588eoo-.D$$u6D588eoo-.D IIdD%//2EIIdD%//2E5>r   boolean)	data_homedownload_if_missingT)prefer_skip_nested_validationc                 ~   t        |       } t        |       st        |        t        ddddd      }t        j
                  }t        | t              }t        |      s|st        d      t        j                  dt        j                  d	|        t        t        | 
      }t	        j                  |      5 }|j                  D ]/  }t!        ||         }d|v rt#        |      }	d|v s%t#        |      }
1 	 ddd       t%        |       t        j                  dt&        j                  d	|        t        t&        | 
      }t	        j                  |      5 }g }|j                  D ]N  }t!        ||         }t        j)                  dj+                  |             |j-                  t/        |             P t	        j0                  ||      }ddd       t%        |       t3        d
	d|}t5        j6                  ||d       |S t5        j                  |      }|S # 1 sw Y   .xY w# 1 sw Y   axY w)a  Loader for species distribution dataset from Phillips et. al. (2006).

    Read more in the :ref:`User Guide <species_distribution_dataset>`.

    Parameters
    ----------
    data_home : str or path-like, default=None
        Specify another download and cache folder for the datasets. By default
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.

    download_if_missing : bool, default=True
        If False, raise an OSError if the data is not locally available
        instead of trying to download the data from the source site.

    Returns
    -------
    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        coverages : array, shape = [14, 1592, 1212]
            These represent the 14 features measured
            at each point of the map grid.
            The latitude/longitude values for the grid are discussed below.
            Missing data is represented by the value -9999.
        train : record array, shape = (1624,)
            The training points for the data.  Each point has three fields:

            - train['species'] is the species name
            - train['dd long'] is the longitude, in degrees
            - train['dd lat'] is the latitude, in degrees
        test : record array, shape = (620,)
            The test points for the data.  Same format as the training data.
        Nx, Ny : integers
            The number of longitudes (x) and latitudes (y) in the grid
        x_left_lower_corner, y_left_lower_corner : floats
            The (x,y) position of the lower-left corner, in degrees
        grid_size : float
            The spacing between points of the grid, in degrees

    Notes
    -----

    This dataset represents the geographic distribution of species.
    The dataset is provided by Phillips et. al. (2006).

    The two species are:

    - `"Bradypus variegatus"
      <http://www.iucnredlist.org/details/3038/0>`_ ,
      the Brown-throated Sloth.

    - `"Microryzomys minutus"
      <http://www.iucnredlist.org/details/13408/0>`_ ,
      also known as the Forest Small Rice Rat, a rodent that lives in Peru,
      Colombia, Ecuador, Peru, and Venezuela.

    - For an example of using this dataset with scikit-learn, see
      :ref:`examples/applications/plot_species_distribution_modeling.py
      <sphx_glr_auto_examples_applications_plot_species_distribution_modeling.py>`.

    References
    ----------

    * `"Maximum entropy modeling of species geographic distributions"
      <http://rob.schapire.net/papers/ecolmod.pdf>`_
      S. J. Phillips, R. P. Anderson, R. E. Schapire - Ecological Modelling,
      190:231-259, 2006.

    Examples
    --------
    >>> from sklearn.datasets import fetch_species_distributions
    >>> species = fetch_species_distributions()
    >>> species.train[:5]
    array([(b'microryzomys_minutus', -64.7   , -17.85  ),
           (b'microryzomys_minutus', -67.8333, -16.3333),
           (b'microryzomys_minutus', -67.8833, -16.3   ),
           (b'microryzomys_minutus', -67.8   , -16.2667),
           (b'microryzomys_minutus', -67.9833, -15.9   )],
          dtype=[('species', 'S22'), ('dd long', '<f4'), ('dd lat', '<f4')])
    g33333Wi  gfffffLi8  g?)r:   r<   r=   r>   r;   z1Data not found and `download_if_missing` is FalsezDownloading species data from z to )dirnametraintestNzDownloading coverage data from z - converting {}r   )	coveragesrO   rN   	   )compress )r   r   r   r"   r#   int16r   DATA_ARCHIVE_NAMEOSErrorloggerinfoSAMPLESr   r   loadfilesr   r8   r   	COVERAGESdebugformatappendr.   asarrayr	   joblibdump)rI   rJ   extra_paramsr   archive_pathsamples_pathXffhandlerN   rO   coverages_pathrP   bunchs                 r   fetch_species_distributionsrk      s   j i(I)
 !"L HHE ,=>L,"MNNYWX$Wi@WW\" 	.aWW .!!A$-a<%g.EQ;$W-D.	. 	|9B	R	
 'y)DWW^$ 	;IWW :!!A$-/66q9:  !89: 

9E:I	; 	~R	ER\RE<!4 L L)L7	. 	.	; 	;s   :1H&,H&A7H3&H03H<)#__doc__loggingior   osr   r   r   os.pathr   ra   numpyr#   utilsr	   utils._param_validationr
    r   _baser   r   r   rY   r\   rU   	getLogger__name__rW   rT   r.   r8   rG   strrk   rS   r   r   <module>ry      s    N   ) )     5  C C 8O 8O	 +  
		8	$ %&RXX  (6 $'L"& .2t C	Cr   