
    h$,fO                        d Z ddlZddlmZmZ ddlmZmZmZm	Z	 ddl
mZmZmZ ddlZddlmZ ddlmZ dd	lmZmZmZmZ d
dlmZmZmZmZ  ej<                  e      Z  eddd      Z! eddd      Z" eddd       eddd       eddd      fZ#d2dZ$d Z%	 d3d Z& ee'edgd!g eeddd"#      dg eeddd$#      dgd!ge( ed      gd!gd!gd%d&      ddd'dd e)d(d)       e)d*d+      fddd%d,       Z*	 d4d-Z+ e eh d.      ge'edgd!g eeddd"#      dgd!ge( ed      gd!gd/d&      d0ddd'd e)d(d)       e)d*d+      fdd/d1       Z,y)5zLabeled Faces in the Wild (LFW) dataset

This dataset is a collection of JPEG pictures of famous people collected
over the internet, all details are available on the official website:

    http://vis-www.cs.umass.edu/lfw/
    N)IntegralReal)PathLikelistdirmakedirsremove)existsisdirjoin)Memory   )Bunch)HiddenInterval
StrOptionsvalidate_params   )RemoteFileMetadata_fetch_remoteget_data_home
load_descrzlfw.tgzz.https://ndownloader.figshare.com/files/5976018@055f7d9c632d7370e6fb4afc7468d40f970c34a80d4c6f50ffec63f5a8d536c0)filenameurlchecksumzlfw-funneled.tgzz.https://ndownloader.figshare.com/files/5976015@b47c8422c8cded889dc5a13418c4bc2abbda121092b3533a83306f90d900100apairsDevTrain.txtz.https://ndownloader.figshare.com/files/5976012@1d454dada7dfeca0e7eab6f65dc4e97a6312d44cf142207be28d688be92aabfapairsDevTest.txtz.https://ndownloader.figshare.com/files/5976009@7cb06600ea8b2814ac26e946201cdb304296262aad67d046a16a7ec85d0ff87c	pairs.txtz.https://ndownloader.figshare.com/files/5976006@ea42330c62c92989f9d7c03237ed5d591365e89b3e649747777b70e692dc1592Tc                    t        |       } t        | d      }t        |      st        |       t        D ]a  }t        ||j
                        }t        |      r%|r.t        j                  d|j                         t        ||       Ut        d|z         |rt        |d      }t        }nt        |d      }t        }t        |      st        ||j
                        }t        |      s>|r.t        j                  d|j                         t        ||       nt        d|z        d	d
l}	t        j                  d|       |	j                  |d      j!                  |       t#        |       ||fS )z0Helper function to download any missing LFW data)	data_homelfw_homezDownloading LFW metadata: %s)dirnamez%s is missinglfw_funneledlfwz!Downloading LFW data (~200MB): %sr   Nz$Decompressing the data archive to %szr:gz)path)r   r   r	   r   TARGETSr   loggerinfor   r   OSErrorFUNNELED_ARCHIVEARCHIVEtarfiledebugopen
extractallr   )
r$   funneleddownload_if_missingr%   targettarget_filepathdata_folder_patharchivearchive_pathr0   s
             5lib/python3.12/site-packages/sklearn/datasets/_lfw.py_check_fetch_lfwr<   K   sB    	2IIz*H( Ax9o&":FJJGfh7o?@@A .9"%0"#Hg&6&67l#"?Mgx8o<==;=MN\6*5585D|%%%    c                 l   	 ddl m} t        dd      t        dd      f}||}nt	        d t        ||      D              }|\  }}|j                  |j                  z
  |j                  xs dz  }|j                  |j                  z
  |j                  xs dz  }	|'t        |      }t        ||z        }t        ||	z        }	t        |       }
|s)t        j                  |
||	ft        j                        }n)t        j                  |
||	dft        j                        }t        |       D ]  \  }}|d	z  dk(  rt         j#                  d
|dz   |
       |j%                  |      }|j'                  |j                  |j                  |j                  |j                  f      }||j)                  |	|f      }t        j*                  |t        j                        }|j,                  dk(  rt/        d|z        |dz  }|s|j1                  d      }|||df<    |S # t        $ r t        d      w xY w)zInternally used to load imagesr   )ImagezThe Python Imaging Library (PIL) is required to load data from jpeg files. Please refer to https://pillow.readthedocs.io/en/stable/installation.html for installing PIL.   c              3   .   K   | ]  \  }}|xs |  y w)N ).0sdss      r;   	<genexpr>z_load_imgs.<locals>.<genexpr>   s     G51bqwBwGs   r   dtype   i  zLoading face #%05d / %05dzLFailed to read the image file %s, Please make sure that libjpeg is installedg     o@r   )axis.)PILr?   ImportErrorslicetuplezipstopstartstepfloatintlennpzerosfloat32	enumerater+   r1   r2   cropresizeasarrayndimRuntimeErrormean)
file_pathsslice_colorr[   r?   default_sliceh_slicew_slicehwn_facesfacesi	file_pathpil_imgfaces                   r;   
_load_imgsrn   v   s   
 1c]E!SM2M~GC,FGGGW		%7<<+<1=A		%7<<+<1=Av
O
O *oG'1a

;'1a+2::> "*- 9t8q=LL4a!eWE **Y',,]]GMM7<<F
 nnaV,Gzz'499>=?HI 
 	 99!9$Daf58 L}  
"
 	

s   H H3Fc                    g g }}t        t        |             D ]  }t        | |      }t        |      st        t        |            D 	cg c]  }	t        ||	       }
}	t	        |
      }||k\  sW|j                  dd      }|j                  |g|z         |j                  |
        t	        |      }|dk(  rt        d|z        t        j                  |      }t        j                  ||      }t        ||||      }t        j                  |      }t        j                  j                  d      j                  |       ||   ||   }}|||fS c c}	w )z~Perform the actual data loading for the lfw people dataset

    This operation is meant to be cached by a joblib wrapper.
    _ r   z*min_faces_per_person=%d is too restrictive*   )sortedr   r   r
   rU   replaceextend
ValueErrorrV   uniquesearchsortedrn   arangerandomRandomStateshuffle)r8   ra   rb   r[   min_faces_per_personperson_namesr`   person_namefolder_pathfpaths
n_picturesrh   target_namesr6   ri   indicess                    r;   _fetch_lfw_peopler      sS     "2*Lg&678 	%+[9[!/5gk6J/KL!k1%LLZ
--%--c37K
 :;e$	% *oG!|8;OO
 	
 99\*L__\<8Fz65&9E ii GII"%%g.'NF7O6E&,&&5 Ms   
Ebooleanneither)closedleft)r$   r4   r[   r}   rb   ra   r5   
return_X_y)prefer_skip_nested_validationg      ?F      N      c                 0   t        | ||      \  }}	t        j                  d|       t        |dd      }
|
j	                  t
              } ||	||||      \  }}}|j                  t        |      d      }t        d      }|r||fS t        |||||	      S )
a  Load the Labeled Faces in the Wild (LFW) people dataset (classification).

    Download it if necessary.

    =================   =======================
    Classes                                5749
    Samples total                         13233
    Dimensionality                         5828
    Features            real, between 0 and 255
    =================   =======================

    Read more in the :ref:`User Guide <labeled_faces_in_the_wild_dataset>`.

    Parameters
    ----------
    data_home : str or path-like, default=None
        Specify another download and cache folder for the datasets. By default
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.

    funneled : bool, default=True
        Download and use the funneled variant of the dataset.

    resize : float or None, default=0.5
        Ratio used to resize the each face picture. If `None`, no resizing is
        performed.

    min_faces_per_person : int, default=None
        The extracted dataset will only retain pictures of people that have at
        least `min_faces_per_person` different pictures.

    color : bool, default=False
        Keep the 3 RGB channels instead of averaging them to a single
        gray level channel. If color is True the shape of the data has
        one more dimension than the shape with color = False.

    slice_ : tuple of slice, default=(slice(70, 195), slice(78, 172))
        Provide a custom 2D slice (height, width) to extract the
        'interesting' part of the jpeg files and avoid use statistical
        correlation from the background.

    download_if_missing : bool, default=True
        If False, raise an OSError if the data is not locally available
        instead of trying to download the data from the source site.

    return_X_y : bool, default=False
        If True, returns ``(dataset.data, dataset.target)`` instead of a Bunch
        object. See below for more information about the `dataset.data` and
        `dataset.target` object.

        .. versionadded:: 0.20

    Returns
    -------
    dataset : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        data : numpy array of shape (13233, 2914)
            Each row corresponds to a ravelled face image
            of original size 62 x 47 pixels.
            Changing the ``slice_`` or resize parameters will change the
            shape of the output.
        images : numpy array of shape (13233, 62, 47)
            Each row is a face image corresponding to one of the 5749 people in
            the dataset. Changing the ``slice_``
            or resize parameters will change the shape of the output.
        target : numpy array of shape (13233,)
            Labels associated to each face image.
            Those labels range from 0-5748 and correspond to the person IDs.
        target_names : numpy array of shape (5749,)
            Names of all persons in the dataset.
            Position in array corresponds to the person ID in the target array.
        DESCR : str
            Description of the Labeled Faces in the Wild (LFW) dataset.

    (data, target) : tuple if ``return_X_y`` is True
        A tuple of two ndarray. The first containing a 2D array of
        shape (n_samples, n_features) with each row representing one
        sample and each column representing the features. The second
        ndarray of shape (n_samples,) containing the target samples.

        .. versionadded:: 0.20
    r$   r4   r5   z Loading LFW people faces from %s   r   locationcompressverbose)r[   r}   rb   ra   lfw.rst)dataimagesr6   r   DESCR)
r<   r+   r1   r   cacher   reshaperU   r   r   )r$   r4   r[   r}   rb   ra   r5   r   r%   r8   m	load_funcri   r6   r   Xfdescrs                    r;   fetch_lfw_peopler      s    V "2hDW"H LL3X> 	1a8A)*I #,1#E6< 	c%j"%A	"F&y uV,f r=   c           
         t        | d      5 }|D cg c]/  }|j                         j                         j                  d      1 }}ddd       D cg c]  }t	        |      dkD  s| }	}t	        |	      }
t        j                  |
t              }t               }t        |	      D ]  \  }}t	        |      dk(  r2d||<   |d   t        |d         dz
  f|d   t        |d         dz
  ff}nSt	        |      d	k(  r2d||<   |d   t        |d         dz
  f|d   t        |d         dz
  ff}nt        d
|dz   |fz        t        |      D ]R  \  }\  }}	 t        ||      }t        t        t        |                  }t        |||         }|j!                  |       T  t#        ||||      }t        |j$                        }|j'                  d      }|j)                  dd       |j)                  d|dz         ||_        ||t        j*                  ddg      fS c c}w # 1 sw Y   xY wc c}w # t        $ r t        |t        |d            }Y w xY w)z}Perform the actual data loading for the LFW pairs dataset

    This operation is meant to be cached by a joblib wrapper.
    rb	Nr   rG   rI   r   r      zinvalid line %d: %rzUTF-8zDifferent personszSame person)r2   decodestripsplitrU   rV   rW   rT   listrY   rv   r   	TypeErrorstrrs   r   appendrn   shapepopinsertarray)index_file_pathr8   ra   rb   r[   
index_filelnsplit_linessl
pair_specsn_pairsr6   r`   rj   
componentspairjnameidxperson_folder	filenamesrk   pairsr   rh   s                            r;   _fetch_lfw_pairsr   {  sm    
ot	$ M
AKL2ryy{((*006LLM*:c"gk":J:*oG XXgS)FJ":. ):z?aF1IAJqM 2Q 67AJqM 2Q 67D _!F1IAJqM 2Q 67AJqM 2Q 67D
 2a!eZ5HHII'o 	)NA{cK $%5t < VGM$:;<I]IcN;Ii(	))0 z65&9EEiilG	LLA	LLGqL!EK&"(($7#GHHHO MM M:2  K $%5s47I JKs:   H%4H H%H2(H2H7 H%%H/7II>   testtrain10_folds)subsetr$   r4   r[   rb   ra   r5   r   c                    t        |||      \  }}t        j                  d| |       t        |dd      }	|	j	                  t
              }
dddd	}| |vr1t        d
| dt        t        |j                                           t        |||          } |
|||||      \  }}}t        d      }t        |j                  t        |      d      ||||      S )a  Load the Labeled Faces in the Wild (LFW) pairs dataset (classification).

    Download it if necessary.

    =================   =======================
    Classes                                   2
    Samples total                         13233
    Dimensionality                         5828
    Features            real, between 0 and 255
    =================   =======================

    In the official `README.txt`_ this task is described as the
    "Restricted" task.  As I am not sure as to implement the
    "Unrestricted" variant correctly, I left it as unsupported for now.

      .. _`README.txt`: http://vis-www.cs.umass.edu/lfw/README.txt

    The original images are 250 x 250 pixels, but the default slice and resize
    arguments reduce them to 62 x 47.

    Read more in the :ref:`User Guide <labeled_faces_in_the_wild_dataset>`.

    Parameters
    ----------
    subset : {'train', 'test', '10_folds'}, default='train'
        Select the dataset to load: 'train' for the development training
        set, 'test' for the development test set, and '10_folds' for the
        official evaluation set that is meant to be used with a 10-folds
        cross validation.

    data_home : str or path-like, default=None
        Specify another download and cache folder for the datasets. By
        default all scikit-learn data is stored in '~/scikit_learn_data'
        subfolders.

    funneled : bool, default=True
        Download and use the funneled variant of the dataset.

    resize : float, default=0.5
        Ratio used to resize the each face picture.

    color : bool, default=False
        Keep the 3 RGB channels instead of averaging them to a single
        gray level channel. If color is True the shape of the data has
        one more dimension than the shape with color = False.

    slice_ : tuple of slice, default=(slice(70, 195), slice(78, 172))
        Provide a custom 2D slice (height, width) to extract the
        'interesting' part of the jpeg files and avoid use statistical
        correlation from the background.

    download_if_missing : bool, default=True
        If False, raise an OSError if the data is not locally available
        instead of trying to download the data from the source site.

    Returns
    -------
    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        data : ndarray of shape (2200, 5828). Shape depends on ``subset``.
            Each row corresponds to 2 ravel'd face images
            of original size 62 x 47 pixels.
            Changing the ``slice_``, ``resize`` or ``subset`` parameters
            will change the shape of the output.
        pairs : ndarray of shape (2200, 2, 62, 47). Shape depends on ``subset``
            Each row has 2 face images corresponding
            to same or different person from the dataset
            containing 5749 people. Changing the ``slice_``,
            ``resize`` or ``subset`` parameters will change the shape of the
            output.
        target : numpy array of shape (2200,). Shape depends on ``subset``.
            Labels associated to each pair of images.
            The two label values being different persons or the same person.
        target_names : numpy array of shape (2,)
            Explains the target values of the target array.
            0 corresponds to "Different person", 1 corresponds to "same person".
        DESCR : str
            Description of the Labeled Faces in the Wild (LFW) dataset.
    r   zLoading %s LFW pairs from %sr   r   r   r   r   r!   )r   r   r   zsubset='z' is invalid: should be one of )r[   rb   ra   r   r   )r   r   r6   r   r   )r<   r+   r1   r   r   r   rv   r   rs   keysr   r   r   r   rU   )r   r$   r4   r[   rb   ra   r5   r%   r8   r   r   label_filenamesr   r   r6   r   r   s                    r;   fetch_lfw_pairsr     s    L "2hDW"H LL/B 	1a8A()I %"O
 _$tF?#7#7#9:;=
 	
 8_V%<=O #,)&f#E6< 	"F ]]3u:r*! r=   )NTT)NFNr   )NFN)-__doc__loggingnumbersr   r   osr   r   r   r   os.pathr	   r
   r   numpyrV   joblibr   utilsr   utils._param_validationr   r   r   r   _baser   r   r   r   	getLogger__name__r+   r/   r.   r*   r<   rn   r   r   rN   rM   r   r   r   rB   r=   r;   <module>r      s    " 2 2 ' '    S S  
		8	$ 8O &8O  $<S
 #<S
 <S2(&VBV ST('V 8T*KD!T)<dC!)(AtF!KT R&,' ){ k	 #' 
"cNE"cN+{{H IM1Ih ;<=8T*KD!T)<dC&,' ){ #' 
"cNE"cN+r=   