# Source code for skdatasets.repositories.uci

"""
UCI datasets (https://archive.ics.uci.edu/ml/datasets.html).

@author: David Diaz Vico
@license: MIT
"""
from __future__ import annotations

import sys
from pathlib import Path
from typing import Any, Literal, Optional, Tuple, Union, overload

import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.utils import Bunch

from .base import fetch_file

BASE_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases'


def _load_csv(
    fname: Path,
    **kwargs: Any,
) -> Tuple[
    np.typing.NDArray[float],
    np.typing.NDArray[Union[float, int, str]],
]:
    """
    Read a comma-separated file and split it into features and targets.

    Every cell is first read as a string. The last column is returned
    untouched as the target vector; the remaining columns form the feature
    matrix, which is converted to ``float`` when all of its cells allow it
    and left as strings otherwise.

    Parameters
    ----------
    fname : Path
        File to read.
    **kwargs
        Extra keyword arguments forwarded to ``numpy.genfromtxt``.

    Returns
    -------
    X : array
        Feature columns (all but the last one).
    y : array of str
        Target column (the last one).

    """
    table = np.genfromtxt(
        fname,
        dtype=str,
        delimiter=',',
        encoding=None,
        **kwargs,
    )
    features, targets = table[:, :-1], table[:, -1]

    try:
        return features.astype(float), targets
    except ValueError:
        # At least one feature cell is not numeric: keep the string matrix.
        return features, targets


def _fetch(
    name: str,
    data_home: Optional[str] = None,
) -> Tuple[
    np.typing.NDArray[float],
    np.typing.NDArray[Union[float, int]],
    Optional[np.typing.NDArray[float]],
    Optional[np.typing.NDArray[Union[float, int]]],
    str,
    Optional[np.typing.NDArray[str]],
]:
    """
    Fetch the files of a UCI dataset and load them.

    The ``<name>.data`` file is mandatory. ``<name>.test`` is optional: if it
    cannot be fetched, loaded or encoded, the test partition is reported as
    ``None``. The description is read from ``<name>.names``, falling back to
    ``<name>.info`` when the former cannot be fetched.

    Parameters
    ----------
    name : str
        Dataset name, used both as the remote folder and as the file stem.
    data_home : str or None, default None
        Forwarded to ``fetch_file`` as the download/cache location.

    Returns
    -------
    X : array
        Training features.
    y : array
        Training targets, ordinal-encoded as ``int64`` when read as strings.
    X_test : array or None
        Test features, or None if no usable test file exists.
    y_test : array or None
        Test targets encoded with the encoder fitted on ``y``, or None.
    fdescr : str
        Contents of the description file.
    target_names : array or None
        Categories found by the encoder, or None if ``y`` was not encoded.

    Raises
    ------
    Exception
        Whatever ``fetch_file`` / ``_load_csv`` raise for the ``.data`` file,
        or ``fetch_file`` raises for ``.info`` after ``.names`` failed.

    """
    subfolder = 'uci'

    def _download(extension):
        """Fetch ``<name><extension>`` from the dataset's remote folder."""
        return fetch_file(
            dataname=name,
            urlname=BASE_URL + '/' + name + '/' + name + extension,
            subfolder=subfolder,
            data_home=data_home,
        )

    X, y = _load_csv(_download('.data'))

    target_names = None
    ordinal_encoder = OrdinalEncoder(dtype=np.int64)
    if y.dtype.type is np.str_:
        y = ordinal_encoder.fit_transform(y.reshape(-1, 1))[:, 0]
        target_names = ordinal_encoder.categories_[0]

    X_test: Optional[np.typing.NDArray[float]]
    y_test: Optional[np.typing.NDArray[Union[float, int, str]]]
    try:
        X_test, y_test = _load_csv(_download('.test'))

        # ``y`` already holds integer codes here, so its dtype can no longer
        # tell whether the encoder was fitted; ``target_names`` can. Without
        # this, string test labels would be mixed with integer train labels.
        if target_names is not None:
            y_test = ordinal_encoder.transform(y_test.reshape(-1, 1))[:, 0]
    except Exception:
        # Best effort: many datasets ship no (usable) test file.
        X_test = None
        y_test = None

    try:
        descr_filename = _download('.names')
    except Exception:
        # Some datasets publish their description as ``.info`` instead.
        descr_filename = _download('.info')
    with open(descr_filename) as rst_file:
        fdescr = rst_file.read()

    return X, y, X_test, y_test, fdescr, target_names


@overload
def fetch(
    name: str,
    data_home: Optional[str] = None,
    *,
    return_X_y: Literal[False] = False,
) -> Bunch:
    # Typing-only signature: with ``return_X_y`` left at False the result is
    # a Bunch. The runtime implementation is the undecorated ``fetch``.
    ...


@overload
def fetch(
    name: str,
    data_home: Optional[str] = None,
    *,
    return_X_y: Literal[True],
) -> Tuple[np.typing.NDArray[float], np.typing.NDArray[float]]:
    # Typing-only signature: with ``return_X_y=True`` the result is the
    # ``(data, target)`` pair. The runtime implementation is the undecorated
    # ``fetch``.
    ...


def fetch(
    name: str,
    data_home: Optional[str] = None,
    *,
    return_X_y: bool = False,
) -> Union[
    Bunch,
    Tuple[np.typing.NDArray[float], np.typing.NDArray[float]],
]:
    """
    Fetch UCI dataset.

    Fetch a UCI dataset by name. More info at
    https://archive.ics.uci.edu/ml/datasets.html.

    Parameters
    ----------
    name : string
        Dataset name.
    data_home : string or None, default None
        Specify another download and cache folder for the data sets. By
        default all scikit-learn data is stored in '~/scikit_learn_data'
        subfolders.
    return_X_y : bool, default=False
        If True, returns ``(data, target)`` instead of a Bunch object.

    Returns
    -------
    data : Bunch
        Dictionary-like object with all the data and metadata: ``data``,
        ``target``, ``train_indices``, ``validation_indices`` (always an
        empty list), ``test_indices``, ``inner_cv`` and ``outer_cv`` (always
        None), ``DESCR`` and ``target_names``. ``train_indices`` and
        ``test_indices`` are None when the dataset has no test partition.

    (data, target) : tuple if ``return_X_y`` is True

    """
    X_train, y_train, X_test, y_test, DESCR, target_names = _fetch(
        name,
        data_home=data_home,
    )

    if X_test is None or y_test is None:
        # No test partition: expose the training data as the whole dataset.
        X = X_train
        y = y_train
        train_indices = None
        test_indices = None
    else:
        # Stack train and test, remembering which rows came from each.
        X = np.concatenate((X_train, X_test))
        y = np.concatenate((y_train, y_test))
        train_indices = list(range(len(X_train)))
        test_indices = list(range(len(X_train), len(X)))

    if return_X_y:
        return X, y

    return Bunch(
        data=X,
        target=y,
        train_indices=train_indices,
        validation_indices=[],
        test_indices=test_indices,
        inner_cv=None,
        outer_cv=None,
        DESCR=DESCR,
        target_names=target_names,
    )