"""California housing dataset. The original database is available from StatLib http://lib.stat.cmu.edu/datasets/ The data contains 20,640 observations on 9 variables. This dataset contains the average house value as target variable and the following input variables (features): average income, housing average age, average rooms, average bedrooms, population, average occupation, latitude, and longitude in that order. References ---------- Pace, R. Kelley and Ronald Barry, Sparse Spatial Autoregressions, Statistics and Probability Letters, 33 (1997) 291-297. """ # Authors: Peter Prettenhofer # License: BSD 3 clause import logging import tarfile from os import PathLike, makedirs, remove from os.path import exists import joblib import numpy as np from ..utils import Bunch from ..utils._param_validation import validate_params from . import get_data_home from ._base import ( RemoteFileMetadata, _convert_data_dataframe, _fetch_remote, _pkl_filepath, load_descr, ) # The original data can be found at: # https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.tgz ARCHIVE = RemoteFileMetadata( filename="cal_housing.tgz", url="https://ndownloader.figshare.com/files/5976036", checksum="aaa5c9a6afe2225cc2aed2723682ae403280c4a3695a2ddda4ffb5d8215ea681", ) logger = logging.getLogger(__name__) @validate_params( { "data_home": [str, PathLike, None], "download_if_missing": ["boolean"], "return_X_y": ["boolean"], "as_frame": ["boolean"], }, prefer_skip_nested_validation=True, ) def fetch_california_housing( *, data_home=None, download_if_missing=True, return_X_y=False, as_frame=False ): """Load the California housing dataset (regression). ============== ============== Samples total 20640 Dimensionality 8 Features real Target real 0.15 - 5. ============== ============== Read more in the :ref:`User Guide `. Parameters ---------- data_home : str or path-like, default=None Specify another download and cache folder for the datasets. By default all scikit-learn data is stored in '~/scikit_learn_data' subfolders. download_if_missing : bool, default=True If False, raise an OSError if the data is not locally available instead of trying to download the data from the source site. return_X_y : bool, default=False If True, returns ``(data.data, data.target)`` instead of a Bunch object. .. versionadded:: 0.20 as_frame : bool, default=False If True, the data is a pandas DataFrame including columns with appropriate dtypes (numeric, string or categorical). The target is a pandas DataFrame or Series depending on the number of target_columns. .. versionadded:: 0.23 Returns ------- dataset : :class:`~sklearn.utils.Bunch` Dictionary-like object, with the following attributes. data : ndarray, shape (20640, 8) Each row corresponding to the 8 feature values in order. If ``as_frame`` is True, ``data`` is a pandas object. target : numpy array of shape (20640,) Each value corresponds to the average house value in units of 100,000. If ``as_frame`` is True, ``target`` is a pandas object. feature_names : list of length 8 Array of ordered feature names used in the dataset. DESCR : str Description of the California housing dataset. frame : pandas DataFrame Only present when `as_frame=True`. DataFrame with ``data`` and ``target``. .. versionadded:: 0.23 (data, target) : tuple if ``return_X_y`` is True A tuple of two ndarray. The first containing a 2D array of shape (n_samples, n_features) with each row representing one sample and each column representing the features. The second ndarray of shape (n_samples,) containing the target samples. .. versionadded:: 0.20 Notes ----- This dataset consists of 20,640 samples and 9 features. Examples -------- >>> from sklearn.datasets import fetch_california_housing >>> housing = fetch_california_housing() >>> print(housing.data.shape, housing.target.shape) (20640, 8) (20640,) >>> print(housing.feature_names[0:6]) ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup'] """ data_home = get_data_home(data_home=data_home) if not exists(data_home): makedirs(data_home) filepath = _pkl_filepath(data_home, "cal_housing.pkz") if not exists(filepath): if not download_if_missing: raise OSError("Data not found and `download_if_missing` is False") logger.info( "Downloading Cal. housing from {} to {}".format(ARCHIVE.url, data_home) ) archive_path = _fetch_remote(ARCHIVE, dirname=data_home) with tarfile.open(mode="r:gz", name=archive_path) as f: cal_housing = np.loadtxt( f.extractfile("CaliforniaHousing/cal_housing.data"), delimiter="," ) # Columns are not in the same order compared to the previous # URL resource on lib.stat.cmu.edu columns_index = [8, 7, 2, 3, 4, 5, 6, 1, 0] cal_housing = cal_housing[:, columns_index] joblib.dump(cal_housing, filepath, compress=6) remove(archive_path) else: cal_housing = joblib.load(filepath) feature_names = [ "MedInc", "HouseAge", "AveRooms", "AveBedrms", "Population", "AveOccup", "Latitude", "Longitude", ] target, data = cal_housing[:, 0], cal_housing[:, 1:] # avg rooms = total rooms / households data[:, 2] /= data[:, 5] # avg bed rooms = total bed rooms / households data[:, 3] /= data[:, 5] # avg occupancy = population / households data[:, 5] = data[:, 4] / data[:, 5] # target in units of 100,000 target = target / 100000.0 descr = load_descr("california_housing.rst") X = data y = target frame = None target_names = [ "MedHouseVal", ] if as_frame: frame, X, y = _convert_data_dataframe( "fetch_california_housing", data, target, feature_names, target_names ) if return_X_y: return X, y return Bunch( data=X, target=y, frame=frame, target_names=target_names, feature_names=feature_names, DESCR=descr, )