1188 lines
40 KiB
Python
1188 lines
40 KiB
Python
"""
|
|
Base IO code for all datasets
|
|
"""
|
|
|
|
# Copyright (c) 2007 David Cournapeau <cournape@gmail.com>
|
|
# 2010 Fabian Pedregosa <fabian.pedregosa@inria.fr>
|
|
# 2010 Olivier Grisel <olivier.grisel@ensta.org>
|
|
# License: BSD 3 clause
|
|
import os
|
|
import csv
|
|
import shutil
|
|
from collections import namedtuple
|
|
from os import environ, listdir, makedirs
|
|
from os.path import dirname, exists, expanduser, isdir, join, splitext
|
|
import hashlib
|
|
|
|
from ..utils import Bunch
|
|
from ..utils import check_random_state
|
|
from ..utils import check_pandas_support
|
|
from ..utils.validation import _deprecate_positional_args
|
|
|
|
import numpy as np
|
|
|
|
from urllib.request import urlretrieve
|
|
|
|
# Immutable record for a remote file, exposing the fields ``filename``,
# ``url`` and ``checksum`` (in that positional order).
RemoteFileMetadata = namedtuple(
    'RemoteFileMetadata',
    ('filename', 'url', 'checksum'),
)
|
|
|
|
|
|
def get_data_home(data_home=None):
    """Return the path of the scikit-learn data dir.

    This folder is used by some large dataset loaders to avoid downloading the
    data several times.

    By default the data dir is set to a folder named 'scikit_learn_data' in the
    user home folder.

    Alternatively, it can be set by the 'SCIKIT_LEARN_DATA' environment
    variable or programmatically by giving an explicit folder path. The '~'
    symbol is expanded to the user home folder.

    If the folder does not already exist, it is automatically created.

    Parameters
    ----------
    data_home : str, default=None
        The path to scikit-learn data dir. If None, the value of the
        'SCIKIT_LEARN_DATA' environment variable is used, falling back to
        ``~/scikit_learn_data`` when the variable is not set.

    Returns
    -------
    data_home : str
        The path to scikit-learn data dir, with a leading '~' expanded.
    """
    if data_home is None:
        data_home = environ.get('SCIKIT_LEARN_DATA',
                                join('~', 'scikit_learn_data'))
    data_home = expanduser(data_home)
    # Create the folder in a single call instead of "check, then create":
    # with exist_ok=True a concurrent caller creating the same folder between
    # the check and the creation no longer makes this call fail.
    makedirs(data_home, exist_ok=True)
    return data_home
|
|
|
|
|
|
def clear_data_home(data_home=None):
    """Delete all the content of the data home cache.

    Parameters
    ----------
    data_home : str | None
        The path to scikit-learn data dir. The value is resolved through
        :func:`get_data_home` before the directory tree is removed.
    """
    # Resolve the cache location first, then remove the whole tree at once.
    shutil.rmtree(get_data_home(data_home))
|
|
|
|
|
|
def _convert_data_dataframe(caller_name, data, target,
                            feature_names, target_names):
    """Build pandas containers from a dataset's data and target.

    Parameters
    ----------
    caller_name : str
        Name of the calling loader; it is formatted into the message handed
        to ``check_pandas_support``.

    data : array-like
        Feature values, wrapped in a DataFrame whose columns are
        `feature_names`.

    target : array-like
        Target values, wrapped in a DataFrame whose columns are
        `target_names`.

    feature_names : list-like of str
        Column labels for `data`.

    target_names : list-like of str
        Column labels for `target`.

    Returns
    -------
    combined_df : DataFrame
        Column-wise concatenation of the data and target frames.

    X : DataFrame
        The `feature_names` columns of `combined_df`.

    y : DataFrame or Series
        The `target_names` columns of `combined_df`; a Series when there is
        exactly one target column.
    """
    # NOTE(review): check_pandas_support is used here as if it returns the
    # pandas module -- confirm against its definition in ..utils.
    pd = check_pandas_support('{} with as_frame=True'.format(caller_name))
    frames = [
        pd.DataFrame(data, columns=feature_names),
        pd.DataFrame(target, columns=target_names),
    ]
    combined_df = pd.concat(frames, axis=1)
    X, y = combined_df[feature_names], combined_df[target_names]
    # A single target column is handed back as a Series, not a frame.
    y = y.iloc[:, 0] if y.shape[1] == 1 else y
    return combined_df, X, y
|
|
|
|
|
|
@_deprecate_positional_args
def load_files(container_path, *, description=None, categories=None,
               load_content=True, shuffle=True, encoding=None,
               decode_error='strict', random_state=0):
    """Load text files with categories as subfolder names.

    Individual samples are assumed to be files stored a two levels folder
    structure such as the following:

        container_folder/
            category_1_folder/
                file_1.txt
                file_2.txt
                ...
                file_42.txt
            category_2_folder/
                file_43.txt
                file_44.txt
                ...

    The folder names are used as supervised signal label names. The individual
    file names are not important.

    This function does not try to extract features into a numpy array or scipy
    sparse matrix. In addition, if load_content is false it does not try to
    load the files in memory.

    To use text files in a scikit-learn classification or clustering algorithm,
    you will need to use the :mod:`~sklearn.feature_extraction.text` module to
    build a feature extraction transformer that suits your problem.

    If you set load_content=True, you should also specify the encoding of the
    text using the 'encoding' parameter. For many modern text files, 'utf-8'
    will be the correct encoding. If you leave encoding equal to None, then the
    content will be made of bytes instead of Unicode, and you will not be able
    to use most functions in :mod:`~sklearn.feature_extraction.text`.

    Similar feature extractors should be built for other kind of unstructured
    data input such as images, audio, video, ...

    Read more in the :ref:`User Guide <datasets>`.

    Parameters
    ----------
    container_path : string or unicode
        Path to the main folder holding one subfolder per category

    description : string or unicode, optional (default=None)
        A paragraph describing the characteristic of the dataset: its source,
        reference, etc.

    categories : A collection of strings or None, optional (default=None)
        If None (default), load all the categories. If not None, list of
        category names to load (other categories ignored).

    load_content : bool, optional (default=True)
        Whether to load or not the content of the different files. If true a
        'data' attribute containing the text information is present in the data
        structure returned. If not, a filenames attribute gives the path to the
        files.

    shuffle : bool, optional (default=True)
        Whether or not to shuffle the data: might be important for models that
        make the assumption that the samples are independent and identically
        distributed (i.i.d.), such as stochastic gradient descent.

    encoding : string or None (default is None)
        If None, do not try to decode the content of the files (e.g. for images
        or other non-text content). If not None, encoding to use to decode text
        files to Unicode if load_content is True.

    decode_error : {'strict', 'ignore', 'replace'}, optional
        Instruction on what to do if a byte sequence is given to analyze that
        contains characters not of the given `encoding`. Passed as keyword
        argument 'errors' to bytes.decode.

    random_state : int, RandomState instance or None, default=0
        Determines random number generation for dataset shuffling. Pass an int
        for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        data : list of str
            Only present when `load_content=True`.
            The raw text data to learn.
        target : ndarray
            The target labels (integer index).
        target_names : list
            The names of target classes.
        DESCR : str
            The full description of the dataset.
        filenames: ndarray
            The filenames holding the dataset.
    """
    # Keep only sub-directories, in sorted order, optionally restricted to
    # the requested categories.
    folders = []
    for entry in sorted(listdir(container_path)):
        if not isdir(join(container_path, entry)):
            continue
        if categories is not None and entry not in categories:
            continue
        folders.append(entry)

    labels = []
    target_names = []
    paths = []
    for label, folder in enumerate(folders):
        folder_path = join(container_path, folder)
        documents = [join(folder_path, name)
                     for name in sorted(listdir(folder_path))]
        target_names.append(folder)
        paths.extend(documents)
        # Every document of this folder gets the folder's integer label.
        labels.extend([label] * len(documents))

    # Arrays are required for the fancy indexing used when shuffling.
    filenames = np.array(paths)
    target = np.array(labels)

    if shuffle:
        rng = check_random_state(random_state)
        order = np.arange(filenames.shape[0])
        rng.shuffle(order)
        filenames, target = filenames[order], target[order]

    if not load_content:
        return Bunch(filenames=filenames,
                     target_names=target_names,
                     target=target,
                     DESCR=description)

    data = []
    for path in filenames:
        with open(path, 'rb') as stream:
            data.append(stream.read())
    if encoding is not None:
        data = [raw.decode(encoding, decode_error) for raw in data]
    return Bunch(data=data,
                 filenames=filenames,
                 target_names=target_names,
                 target=target,
                 DESCR=description)
|
|
|
|
|
|
def load_data(module_path, data_file_name):
    """Loads data from module_path/data/data_file_name.

    The first row of the csv file is a header holding the number of samples,
    the number of features and then the target names. Every following row
    holds the feature values of one sample followed by its integer target.

    Parameters
    ----------
    module_path : string
        The module path.

    data_file_name : string
        Name of csv file to be loaded from
        module_path/data/data_file_name. For example 'wine_data.csv'.

    Returns
    -------
    data : Numpy array
        A 2D array with each row representing one sample and each column
        representing the features of a given sample.

    target : Numpy array
        A 1D array holding target variables for all the samples in `data`.
        For example target[0] is the target variable for data[0].

    target_names : Numpy array
        A 1D array containing the names of the classifications. For example
        target_names[0] is the name of the target[0] class.
    """
    with open(join(module_path, 'data', data_file_name)) as csv_file:
        data_file = csv.reader(csv_file)
        # Header row: n_samples, n_features, then one name per target class.
        temp = next(data_file)
        n_samples = int(temp[0])
        n_features = int(temp[1])
        target_names = np.array(temp[2:])
        data = np.empty((n_samples, n_features))
        # The builtin ``int`` replaces the ``np.int`` alias, which was
        # deprecated in NumPy 1.20 and removed in NumPy 1.24; both denote
        # the same dtype.
        target = np.empty((n_samples,), dtype=int)

        for i, ir in enumerate(data_file):
            # The last column is the target, everything before it a feature.
            data[i] = np.asarray(ir[:-1], dtype=np.float64)
            target[i] = np.asarray(ir[-1], dtype=int)

    return data, target, target_names
|
|
|
|
|
|
@_deprecate_positional_args
def load_wine(*, return_X_y=False, as_frame=False):
    """Load and return the wine dataset (classification).

    .. versionadded:: 0.18

    The wine dataset is a classic and very easy multi-class classification
    dataset.

    =================   ==============
    Classes                          3
    Samples per class        [59,71,48]
    Samples total                  178
    Dimensionality                  13
    Features            real, positive
    =================   ==============

    Read more in the :ref:`User Guide <wine_dataset>`.

    Parameters
    ----------
    return_X_y : bool, default=False
        If True, returns ``(data, target)`` instead of a Bunch object.
        See below for more information about the `data` and `target` object.

    as_frame : bool, default=False
        If True, the data is a pandas DataFrame including columns with
        appropriate dtypes (numeric). The target is
        a pandas DataFrame or Series depending on the number of target columns.
        If `return_X_y` is True, then (`data`, `target`) will be pandas
        DataFrames or Series as described below.

        .. versionadded:: 0.23

    Returns
    -------
    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        data : {ndarray, dataframe} of shape (178, 13)
            The data matrix. If `as_frame=True`, `data` will be a pandas
            DataFrame.
        target: {ndarray, Series} of shape (178,)
            The classification target. If `as_frame=True`, `target` will be
            a pandas Series.
        feature_names: list
            The names of the dataset columns.
        target_names: list
            The names of target classes.
        frame: DataFrame of shape (178, 14)
            Only present when `as_frame=True`. DataFrame with `data` and
            `target`.

            .. versionadded:: 0.23
        DESCR: str
            The full description of the dataset.

    (data, target) : tuple if ``return_X_y`` is True

    The copy of UCI ML Wine Data Set dataset is downloaded and modified to fit
    standard format from:
    https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data

    Examples
    --------
    Let's say you are interested in the samples 10, 80, and 140, and want to
    know their class name.

    >>> from sklearn.datasets import load_wine
    >>> data = load_wine()
    >>> data.target[[10, 80, 140]]
    array([0, 1, 2])
    >>> list(data.target_names)
    ['class_0', 'class_1', 'class_2']
    """
    module_path = dirname(__file__)
    data, target, target_names = load_data(module_path, 'wine_data.csv')

    with open(join(module_path, 'descr', 'wine_data.rst')) as descr_file:
        description = descr_file.read()

    feature_names = [
        'alcohol',
        'malic_acid',
        'ash',
        'alcalinity_of_ash',
        'magnesium',
        'total_phenols',
        'flavanoids',
        'nonflavanoid_phenols',
        'proanthocyanins',
        'color_intensity',
        'hue',
        'od280/od315_of_diluted_wines',
        'proline',
    ]

    # The frame stays None unless pandas output was requested.
    frame = None
    if as_frame:
        frame, data, target = _convert_data_dataframe(
            "load_wine", data, target, feature_names, ['target'])

    if not return_X_y:
        return Bunch(data=data,
                     target=target,
                     frame=frame,
                     target_names=target_names,
                     DESCR=description,
                     feature_names=feature_names)

    return data, target
|
|
|
|
|
|
@_deprecate_positional_args
def load_iris(*, return_X_y=False, as_frame=False):
    """Load and return the iris dataset (classification).

    The iris dataset is a classic and very easy multi-class classification
    dataset.

    =================   ==============
    Classes                          3
    Samples per class               50
    Samples total                  150
    Dimensionality                   4
    Features            real, positive
    =================   ==============

    Read more in the :ref:`User Guide <iris_dataset>`.

    Parameters
    ----------
    return_X_y : bool, default=False
        If True, returns ``(data, target)`` instead of a Bunch object. See
        below for more information about the `data` and `target` object.

        .. versionadded:: 0.18

    as_frame : bool, default=False
        If True, the data is a pandas DataFrame including columns with
        appropriate dtypes (numeric). The target is
        a pandas DataFrame or Series depending on the number of target columns.
        If `return_X_y` is True, then (`data`, `target`) will be pandas
        DataFrames or Series as described below.

        .. versionadded:: 0.23

    Returns
    -------
    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        data : {ndarray, dataframe} of shape (150, 4)
            The data matrix. If `as_frame=True`, `data` will be a pandas
            DataFrame.
        target: {ndarray, Series} of shape (150,)
            The classification target. If `as_frame=True`, `target` will be
            a pandas Series.
        feature_names: list
            The names of the dataset columns.
        target_names: list
            The names of target classes.
        frame: DataFrame of shape (150, 5)
            Only present when `as_frame=True`. DataFrame with `data` and
            `target`.

            .. versionadded:: 0.23
        DESCR: str
            The full description of the dataset.
        filename: str
            The path to the location of the data.

            .. versionadded:: 0.20

    (data, target) : tuple if ``return_X_y`` is True

        .. versionadded:: 0.18

    Notes
    -----
        .. versionchanged:: 0.20
            Fixed two wrong data points according to Fisher's paper.
            The new version is the same as in R, but not as in the UCI
            Machine Learning Repository.

    Examples
    --------
    Let's say you are interested in the samples 10, 25, and 50, and want to
    know their class name.

    >>> from sklearn.datasets import load_iris
    >>> data = load_iris()
    >>> data.target[[10, 25, 50]]
    array([0, 0, 1])
    >>> list(data.target_names)
    ['setosa', 'versicolor', 'virginica']
    """
    module_path = dirname(__file__)
    # Path reported back to the caller through the ``filename`` attribute.
    iris_csv_filename = join(module_path, 'data', 'iris.csv')
    data, target, target_names = load_data(module_path, 'iris.csv')

    with open(join(module_path, 'descr', 'iris.rst')) as descr_file:
        description = descr_file.read()

    feature_names = [
        'sepal length (cm)',
        'sepal width (cm)',
        'petal length (cm)',
        'petal width (cm)',
    ]

    # The frame stays None unless pandas output was requested.
    frame = None
    if as_frame:
        frame, data, target = _convert_data_dataframe(
            "load_iris", data, target, feature_names, ['target'])

    if not return_X_y:
        return Bunch(data=data,
                     target=target,
                     frame=frame,
                     target_names=target_names,
                     DESCR=description,
                     feature_names=feature_names,
                     filename=iris_csv_filename)

    return data, target
|
|
|
|
|
|
@_deprecate_positional_args
def load_breast_cancer(*, return_X_y=False, as_frame=False):
    """Load and return the breast cancer wisconsin dataset (classification).

    The breast cancer dataset is a classic and very easy binary classification
    dataset.

    =================   ==============
    Classes                          2
    Samples per class    212(M),357(B)
    Samples total                  569
    Dimensionality                  30
    Features            real, positive
    =================   ==============

    Read more in the :ref:`User Guide <breast_cancer_dataset>`.

    Parameters
    ----------
    return_X_y : bool, default=False
        If True, returns ``(data, target)`` instead of a Bunch object.
        See below for more information about the `data` and `target` object.

        .. versionadded:: 0.18

    as_frame : bool, default=False
        If True, the data is a pandas DataFrame including columns with
        appropriate dtypes (numeric). The target is
        a pandas DataFrame or Series depending on the number of target columns.
        If `return_X_y` is True, then (`data`, `target`) will be pandas
        DataFrames or Series as described below.

        .. versionadded:: 0.23

    Returns
    -------
    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        data : {ndarray, dataframe} of shape (569, 30)
            The data matrix. If `as_frame=True`, `data` will be a pandas
            DataFrame.
        target: {ndarray, Series} of shape (569,)
            The classification target. If `as_frame=True`, `target` will be
            a pandas Series.
        feature_names: list
            The names of the dataset columns.
        target_names: list
            The names of target classes.
        frame: DataFrame of shape (569, 31)
            Only present when `as_frame=True`. DataFrame with `data` and
            `target`.

            .. versionadded:: 0.23
        DESCR: str
            The full description of the dataset.
        filename: str
            The path to the location of the data.

            .. versionadded:: 0.20

    (data, target) : tuple if ``return_X_y`` is True

        .. versionadded:: 0.18

    The copy of UCI ML Breast Cancer Wisconsin (Diagnostic) dataset is
    downloaded from:
    https://goo.gl/U2Uwz2

    Examples
    --------
    Let's say you are interested in the samples 10, 50, and 85, and want to
    know their class name.

    >>> from sklearn.datasets import load_breast_cancer
    >>> data = load_breast_cancer()
    >>> data.target[[10, 50, 85]]
    array([0, 1, 0])
    >>> list(data.target_names)
    ['malignant', 'benign']
    """
    module_path = dirname(__file__)
    # Path reported back to the caller through the ``filename`` attribute.
    csv_filename = join(module_path, 'data', 'breast_cancer.csv')
    data, target, target_names = load_data(module_path, 'breast_cancer.csv')

    with open(join(module_path, 'descr', 'breast_cancer.rst')) as descr_file:
        description = descr_file.read()

    # Ten measurements, each given as mean, standard error and worst value.
    feature_names = np.array([
        'mean radius', 'mean texture',
        'mean perimeter', 'mean area',
        'mean smoothness', 'mean compactness',
        'mean concavity', 'mean concave points',
        'mean symmetry', 'mean fractal dimension',
        'radius error', 'texture error',
        'perimeter error', 'area error',
        'smoothness error', 'compactness error',
        'concavity error', 'concave points error',
        'symmetry error', 'fractal dimension error',
        'worst radius', 'worst texture',
        'worst perimeter', 'worst area',
        'worst smoothness', 'worst compactness',
        'worst concavity', 'worst concave points',
        'worst symmetry', 'worst fractal dimension',
    ])

    # The frame stays None unless pandas output was requested.
    frame = None
    if as_frame:
        frame, data, target = _convert_data_dataframe(
            "load_breast_cancer", data, target, feature_names, ['target'])

    if not return_X_y:
        return Bunch(data=data,
                     target=target,
                     frame=frame,
                     target_names=target_names,
                     DESCR=description,
                     feature_names=feature_names,
                     filename=csv_filename)

    return data, target
|
|
|
|
|
|
@_deprecate_positional_args
def load_digits(*, n_class=10, return_X_y=False, as_frame=False):
    """Load and return the digits dataset (classification).

    Each datapoint is a 8x8 image of a digit.

    =================   ==============
    Classes                         10
    Samples per class             ~180
    Samples total                 1797
    Dimensionality                  64
    Features             integers 0-16
    =================   ==============

    Read more in the :ref:`User Guide <digits_dataset>`.

    Parameters
    ----------
    n_class : integer, between 0 and 10, optional (default=10)
        The number of classes to return. Only the samples whose target is
        strictly smaller than `n_class` are kept.

    return_X_y : bool, default=False
        If True, returns ``(data, target)`` instead of a Bunch object.
        See below for more information about the `data` and `target` object.

        .. versionadded:: 0.18

    as_frame : bool, default=False
        If True, the data is a pandas DataFrame including columns with
        appropriate dtypes (numeric). The target is
        a pandas DataFrame or Series depending on the number of target columns.
        If `return_X_y` is True, then (`data`, `target`) will be pandas
        DataFrames or Series as described below.

        .. versionadded:: 0.23

    Returns
    -------
    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        data : {ndarray, dataframe} of shape (1797, 64)
            The flattened data matrix. If `as_frame=True`, `data` will be
            a pandas DataFrame.
        target: {ndarray, Series} of shape (1797,)
            The classification target. If `as_frame=True`, `target` will be
            a pandas Series.
        feature_names: list
            The names of the dataset columns.
        target_names: list
            The names of target classes.

            .. versionadded:: 0.20

        frame: DataFrame of shape (1797, 65)
            Only present when `as_frame=True`. DataFrame with `data` and
            `target`.

            .. versionadded:: 0.23
        images: {ndarray} of shape (1797, 8, 8)
            The raw image data.
        DESCR: str
            The full description of the dataset.

    (data, target) : tuple if ``return_X_y`` is True

        .. versionadded:: 0.18

    This is a copy of the test set of the UCI ML hand-written digits datasets
    https://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits

    Examples
    --------
    To load the data and visualize the images::

        >>> from sklearn.datasets import load_digits
        >>> digits = load_digits()
        >>> print(digits.data.shape)
        (1797, 64)
        >>> import matplotlib.pyplot as plt #doctest: +SKIP
        >>> plt.gray() #doctest: +SKIP
        >>> plt.matshow(digits.images[0]) #doctest: +SKIP
        >>> plt.show() #doctest: +SKIP
    """
    module_path = dirname(__file__)
    data = np.loadtxt(join(module_path, 'data', 'digits.csv.gz'),
                      delimiter=',')
    with open(join(module_path, 'descr', 'digits.rst')) as f:
        descr = f.read()
    # The last column holds the label. The builtin ``int`` replaces the
    # ``np.int`` alias, which was deprecated in NumPy 1.20 and removed in
    # NumPy 1.24; both denote the same dtype.
    target = data[:, -1].astype(int, copy=False)
    flat_data = data[:, :-1]
    # Expose the same pixels as 8x8 images through a reshaped view.
    images = flat_data.view()
    images.shape = (-1, 8, 8)

    if n_class < 10:
        # Keep only the samples belonging to the first `n_class` digits.
        idx = target < n_class
        flat_data, target = flat_data[idx], target[idx]
        images = images[idx]

    feature_names = ['pixel_{}_{}'.format(row_idx, col_idx)
                     for row_idx in range(8)
                     for col_idx in range(8)]

    frame = None
    target_columns = ['target', ]
    if as_frame:
        frame, flat_data, target = _convert_data_dataframe("load_digits",
                                                           flat_data,
                                                           target,
                                                           feature_names,
                                                           target_columns)

    if return_X_y:
        return flat_data, target

    return Bunch(data=flat_data,
                 target=target,
                 frame=frame,
                 feature_names=feature_names,
                 target_names=np.arange(10),
                 images=images,
                 DESCR=descr)
|
|
|
|
|
|
@_deprecate_positional_args
def load_diabetes(*, return_X_y=False, as_frame=False):
    """Load and return the diabetes dataset (regression).

    ==============   ==================
    Samples total    442
    Dimensionality   10
    Features         real, -.2 < x < .2
    Targets          integer 25 - 346
    ==============   ==================

    Read more in the :ref:`User Guide <diabetes_dataset>`.

    Parameters
    ----------
    return_X_y : bool, default=False
        If True, returns ``(data, target)`` instead of a Bunch object.
        See below for more information about the `data` and `target` object.

        .. versionadded:: 0.18

    as_frame : bool, default=False
        If True, the data is a pandas DataFrame including columns with
        appropriate dtypes (numeric). The target is
        a pandas DataFrame or Series depending on the number of target columns.
        If `return_X_y` is True, then (`data`, `target`) will be pandas
        DataFrames or Series as described below.

        .. versionadded:: 0.23

    Returns
    -------
    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        data : {ndarray, dataframe} of shape (442, 10)
            The data matrix. If `as_frame=True`, `data` will be a pandas
            DataFrame.
        target: {ndarray, Series} of shape (442,)
            The regression target. If `as_frame=True`, `target` will be
            a pandas Series.
        feature_names: list
            The names of the dataset columns.
        frame: DataFrame of shape (442, 11)
            Only present when `as_frame=True`. DataFrame with `data` and
            `target`.

            .. versionadded:: 0.23
        DESCR: str
            The full description of the dataset.
        data_filename: str
            The path to the location of the data.
        target_filename: str
            The path to the location of the target.

    (data, target) : tuple if ``return_X_y`` is True

        .. versionadded:: 0.18
    """
    module_path = dirname(__file__)
    data_dir = join(module_path, 'data')

    # Features and targets are stored in two separate compressed files.
    data_filename = join(data_dir, 'diabetes_data.csv.gz')
    target_filename = join(data_dir, 'diabetes_target.csv.gz')
    data = np.loadtxt(data_filename)
    target = np.loadtxt(target_filename)

    with open(join(module_path, 'descr', 'diabetes.rst')) as descr_file:
        description = descr_file.read()

    feature_names = [
        'age', 'sex', 'bmi', 'bp',
        's1', 's2', 's3', 's4', 's5', 's6',
    ]

    # The frame stays None unless pandas output was requested.
    frame = None
    if as_frame:
        frame, data, target = _convert_data_dataframe(
            "load_diabetes", data, target, feature_names, ['target'])

    if not return_X_y:
        return Bunch(data=data,
                     target=target,
                     frame=frame,
                     DESCR=description,
                     feature_names=feature_names,
                     data_filename=data_filename,
                     target_filename=target_filename)

    return data, target
|
|
|
|
|
|
@_deprecate_positional_args
def load_linnerud(*, return_X_y=False, as_frame=False):
    """Load and return the physical exercise linnerud dataset.

    This dataset is suitable for multi-output regression tasks.

    ==============   ============================
    Samples total    20
    Dimensionality   3 (for both data and target)
    Features         integer
    Targets          integer
    ==============   ============================

    Read more in the :ref:`User Guide <linnerrud_dataset>`.

    Parameters
    ----------
    return_X_y : bool, default=False
        If True, returns ``(data, target)`` instead of a Bunch object.
        See below for more information about the `data` and `target` object.

        .. versionadded:: 0.18

    as_frame : bool, default=False
        If True, the data is a pandas DataFrame including columns with
        appropriate dtypes (numeric, string or categorical). The target is
        a pandas DataFrame or Series depending on the number of target columns.
        If `return_X_y` is True, then (`data`, `target`) will be pandas
        DataFrames or Series as described below.

        .. versionadded:: 0.23

    Returns
    -------
    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        data : {ndarray, dataframe} of shape (20, 3)
            The data matrix. If `as_frame=True`, `data` will be a pandas
            DataFrame.
        target: {ndarray, dataframe} of shape (20, 3)
            The regression targets. If `as_frame=True`, `target` will be
            a pandas DataFrame.
        feature_names: list
            The names of the dataset columns.
        target_names: list
            The names of the target columns.
        frame: DataFrame of shape (20, 6)
            Only present when `as_frame=True`. DataFrame with `data` and
            `target`.

            .. versionadded:: 0.23
        DESCR: str
            The full description of the dataset.
        data_filename: str
            The path to the location of the data.
        target_filename: str
            The path to the location of the target.

            .. versionadded:: 0.20

    (data, target) : tuple if ``return_X_y`` is True

        .. versionadded:: 0.18
    """
    # Build every path from components with ``join`` (as the other loaders
    # do) rather than embedding '/' separators in string literals.
    module_path = dirname(__file__)
    base_dir = join(module_path, 'data')
    data_filename = join(base_dir, 'linnerud_exercise.csv')
    target_filename = join(base_dir, 'linnerud_physiological.csv')

    # Read data; the first row of each file is a header and is skipped here.
    data_exercise = np.loadtxt(data_filename, skiprows=1)
    data_physiological = np.loadtxt(target_filename, skiprows=1)

    # Read header: the whitespace-separated column names of each file.
    with open(data_filename) as f:
        header_exercise = f.readline().split()
    with open(target_filename) as f:
        header_physiological = f.readline().split()

    with open(join(module_path, 'descr', 'linnerud.rst')) as f:
        descr = f.read()

    frame = None
    if as_frame:
        (frame,
         data_exercise,
         data_physiological) = _convert_data_dataframe("load_linnerud",
                                                       data_exercise,
                                                       data_physiological,
                                                       header_exercise,
                                                       header_physiological)
    if return_X_y:
        return data_exercise, data_physiological

    return Bunch(data=data_exercise,
                 feature_names=header_exercise,
                 target=data_physiological,
                 target_names=header_physiological,
                 frame=frame,
                 DESCR=descr,
                 data_filename=data_filename,
                 target_filename=target_filename)
|
|
|
|
|
|
@_deprecate_positional_args
def load_boston(*, return_X_y=False):
    """Load and return the boston house-prices dataset (regression).

    ==============   ==============
    Samples total               506
    Dimensionality               13
    Features         real, positive
    Targets           real 5. - 50.
    ==============   ==============

    Read more in the :ref:`User Guide <boston_dataset>`.

    Parameters
    ----------
    return_X_y : bool, default=False
        If True, returns ``(data, target)`` instead of a Bunch object.
        See below for more information about the `data` and `target` object.

        .. versionadded:: 0.18

    Returns
    -------
    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        data : ndarray of shape (506, 13)
            The data matrix.
        target : ndarray of shape (506, )
            The regression target.
        filename : str
            The physical location of boston csv dataset.

            .. versionadded:: 0.20

        DESCR : str
            The full description of the dataset.
        feature_names : ndarray
            The names of features

    (data, target) : tuple if ``return_X_y`` is True

        .. versionadded:: 0.18

    Notes
    -----
        .. versionchanged:: 0.20
            Fixed a wrong data point at [445, 0].

    Examples
    --------
    >>> from sklearn.datasets import load_boston
    >>> X, y = load_boston(return_X_y=True)
    >>> print(X.shape)
    (506, 13)
    """
    module_path = dirname(__file__)

    descr_path = join(module_path, 'descr', 'boston_house_prices.rst')
    with open(descr_path) as descr_file:
        descr_text = descr_file.read()

    data_file_name = join(module_path, 'data', 'boston_house_prices.csv')
    with open(data_file_name) as csv_file:
        rows = csv.reader(csv_file)
        # First row: number of samples and number of features.
        counts = next(rows)
        n_samples, n_features = int(counts[0]), int(counts[1])
        data = np.empty((n_samples, n_features))
        target = np.empty((n_samples,))
        # Second row: the column names, the target column included.
        feature_names = np.array(next(rows))

        for row_idx, row in enumerate(rows):
            # The last column is the target, everything before it a feature.
            data[row_idx] = np.asarray(row[:-1], dtype=np.float64)
            target[row_idx] = np.asarray(row[-1], dtype=np.float64)

    if not return_X_y:
        return Bunch(data=data,
                     target=target,
                     # last column is target value
                     feature_names=feature_names[:-1],
                     DESCR=descr_text,
                     filename=data_file_name)

    return data, target
|
|
|
|
|
|
def load_sample_images():
    """Load the bundled sample images for image manipulation.

    Both bundled pictures, ``china`` and ``flower``, are loaded.

    Read more in the :ref:`User Guide <sample_images>`.

    Returns
    -------
    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        images : list of ndarray of shape (427, 640, 3)
            The two sample images.
        filenames : list
            The filenames for the images.
        DESCR : str
            The full description of the dataset.

    Examples
    --------
    To load the data and visualize the images:

    >>> from sklearn.datasets import load_sample_images
    >>> dataset = load_sample_images()     #doctest: +SKIP
    >>> len(dataset.images)                #doctest: +SKIP
    2
    >>> first_img_data = dataset.images[0] #doctest: +SKIP
    >>> first_img_data.shape               #doctest: +SKIP
    (427, 640, 3)
    >>> first_img_data.dtype               #doctest: +SKIP
    dtype('uint8')
    """
    # Deferred import: the image reader (PIL-backed) is only needed here.
    from ..externals._pilutil import imread

    images_dir = join(dirname(__file__), "images")

    # The dataset description ships as a README next to the images.
    with open(join(images_dir, 'README.txt')) as readme:
        description = readme.read()

    # Sort the directory listing so the returned order is deterministic.
    jpg_paths = []
    for entry in sorted(os.listdir(images_dir)):
        if entry.endswith(".jpg"):
            jpg_paths.append(join(images_dir, entry))

    # Decode every selected file into an array, in the same order.
    arrays = list(map(imread, jpg_paths))

    return Bunch(images=arrays, filenames=jpg_paths, DESCR=description)
|
|
|
|
|
|
def load_sample_image(image_name):
    """Load the numpy array of a single sample image.

    Read more in the :ref:`User Guide <sample_images>`.

    Parameters
    ----------
    image_name : {`china.jpg`, `flower.jpg`}
        The name of the sample image loaded.

    Returns
    -------
    img : 3D array
        The image as a numpy array: height x width x color.

    Raises
    ------
    AttributeError
        If no sample image filename ends with ``image_name``.

    Examples
    --------

    >>> from sklearn.datasets import load_sample_image
    >>> china = load_sample_image('china.jpg')   # doctest: +SKIP
    >>> china.dtype                              # doctest: +SKIP
    dtype('uint8')
    >>> china.shape                              # doctest: +SKIP
    (427, 640, 3)
    >>> flower = load_sample_image('flower.jpg') # doctest: +SKIP
    >>> flower.dtype                             # doctest: +SKIP
    dtype('uint8')
    >>> flower.shape                             # doctest: +SKIP
    (427, 640, 3)
    """
    samples = load_sample_images()

    # Return the image for the first path whose tail matches the given name.
    for position, path in enumerate(samples.filenames):
        if path.endswith(image_name):
            return samples.images[position]

    raise AttributeError("Cannot find sample image: %s" % image_name)
|
|
|
|
|
|
def _pkl_filepath(*args, **kwargs):
    """Build the path of a Python 3 specific pickle file.

    The last positional argument is expected to be the ".pkl" filename; any
    preceding arguments are leading path components. For compatibility with
    older scikit-learn versions, a suffix (``py3_suffix`` keyword, default
    ``"_py3"``) is inserted just before the file extension.

    _pkl_filepath('/path/to/folder', 'filename.pkl') returns
    '/path/to/folder/filename_py3.pkl'

    """
    suffix = kwargs.get("py3_suffix", "_py3")
    # Split off the extension so the suffix lands between stem and extension.
    stem, extension = splitext(args[-1])
    return join(*args[:-1], stem + suffix + extension)
|
|
|
|
|
|
def _sha256(path):
    """Return the hexadecimal SHA256 digest of the file at path."""
    digest = hashlib.sha256()
    block_size = 8192
    with open(path, "rb") as stream:
        # Feed the file in fixed-size pieces; an empty read marks end of file.
        for piece in iter(lambda: stream.read(block_size), b""):
            digest.update(piece)
    return digest.hexdigest()
|
|
|
|
|
|
def _fetch_remote(remote, dirname=None):
    """Download a remote dataset file and verify its checksum.

    The file at ``remote.url`` is retrieved and stored under
    ``remote.filename`` (inside ``dirname`` when one is given). The SHA256
    digest of the downloaded file is then compared with ``remote.checksum``.

    Parameters
    ----------
    remote : RemoteFileMetadata
        Named tuple containing remote dataset meta information: url, filename
        and checksum

    dirname : string
        Directory to save the file to. When None, ``remote.filename`` is
        used as the destination path as-is.

    Returns
    -------
    file_path: string
        Full path of the created file.

    Raises
    ------
    IOError
        If the SHA256 digest of the downloaded file differs from
        ``remote.checksum``.
    """
    if dirname is None:
        file_path = remote.filename
    else:
        file_path = join(dirname, remote.filename)

    urlretrieve(remote.url, file_path)

    # Integrity check: compare the on-disk digest with the expected one.
    actual_checksum = _sha256(file_path)
    if remote.checksum != actual_checksum:
        message = ("{} has an SHA256 checksum ({}) "
                   "differing from expected ({}), "
                   "file may be corrupted.")
        raise IOError(message.format(file_path, actual_checksum,
                                     remote.checksum))
    return file_path
|