Uploaded Test files
This commit is contained in:
parent
f584ad9d97
commit
2e81cb7d99
16627 changed files with 2065359 additions and 102444 deletions
832
venv/Lib/site-packages/sklearn/datasets/_openml.py
Normal file
@@ -0,0 +1,832 @@
import gzip
import json
import os
import shutil
import itertools
from os.path import join
from warnings import warn
from contextlib import closing
from functools import wraps, partial
from collections.abc import Generator
from collections import OrderedDict

from urllib.request import urlopen, Request
from urllib.error import HTTPError

import numpy as np
import scipy.sparse

from ..externals import _arff
from . import get_data_home
from ..utils import Bunch
from ..utils import get_chunk_n_rows
from ..utils import _chunk_generator
from ..utils import check_pandas_support  # noqa
from ..utils.validation import _deprecate_positional_args

__all__ = ['fetch_openml']

_OPENML_PREFIX = "https://openml.org/"
_SEARCH_NAME = "api/v1/json/data/list/data_name/{}/limit/2"
_DATA_INFO = "api/v1/json/data/{}"
_DATA_FEATURES = "api/v1/json/data/features/{}"
_DATA_QUALITIES = "api/v1/json/data/qualities/{}"
_DATA_FILE = "data/v1/download/{}"

def _get_local_path(openml_path, data_home):
    return os.path.join(data_home, 'openml.org', openml_path + ".gz")
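
# Illustrative sketch (hypothetical paths, POSIX separators assumed):
#
#   >>> _get_local_path("api/v1/json/data/61",
#   ...                 "/home/user/scikit_learn_data/openml")
#   '/home/user/scikit_learn_data/openml/openml.org/api/v1/json/data/61.gz'
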
def _retry_with_clean_cache(openml_path, data_home):
    """If the first call to the decorated function fails, the local cached
    file is removed, and the function is called again. If ``data_home`` is
    ``None``, then the function is called once.
    """
    def decorator(f):
        @wraps(f)
        def wrapper(*args, **kw):
            if data_home is None:
                return f(*args, **kw)
            try:
                return f(*args, **kw)
            except HTTPError:
                raise
            except Exception:
                warn("Invalid cache, redownloading file", RuntimeWarning)
                local_path = _get_local_path(openml_path, data_home)
                if os.path.exists(local_path):
                    os.unlink(local_path)
                return f(*args, **kw)
        return wrapper
    return decorator
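
# Usage sketch (hypothetical wrapped function, with data_home bound to a
# cache directory; this mirrors how _load_json is wrapped further down):
# a stale or truncated cache file triggers one deletion and one retry,
# while genuine HTTP errors propagate immediately.
#
#   >>> @_retry_with_clean_cache("api/v1/json/data/61", data_home)
#   ... def _read_cached():
#   ...     with closing(_open_openml_url("api/v1/json/data/61",
#   ...                                   data_home)) as fh:
#   ...         return fh.read()
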
def _open_openml_url(openml_path, data_home):
    """
    Returns a resource from OpenML.org. Caches it to data_home if required.

    Parameters
    ----------
    openml_path : str
        OpenML URL that will be accessed. This will be prefixed with
        _OPENML_PREFIX.

    data_home : str
        Directory to which the files will be cached. If None, no caching will
        be applied.

    Returns
    -------
    result : stream
        A stream to the OpenML resource.
    """
    def is_gzip(_fsrc):
        return _fsrc.info().get('Content-Encoding', '') == 'gzip'

    req = Request(_OPENML_PREFIX + openml_path)
    req.add_header('Accept-encoding', 'gzip')

    if data_home is None:
        fsrc = urlopen(req)
        if is_gzip(fsrc):
            return gzip.GzipFile(fileobj=fsrc, mode='rb')
        return fsrc

    local_path = _get_local_path(openml_path, data_home)
    if not os.path.exists(local_path):
        try:
            os.makedirs(os.path.dirname(local_path))
        except OSError:
            # potentially, the directory has been created already
            pass

        try:
            with closing(urlopen(req)) as fsrc:
                if is_gzip(fsrc):
                    with open(local_path, 'wb') as fdst:
                        shutil.copyfileobj(fsrc, fdst)
                else:
                    with gzip.GzipFile(local_path, 'wb') as fdst:
                        shutil.copyfileobj(fsrc, fdst)
        except Exception:
            if os.path.exists(local_path):
                os.unlink(local_path)
            raise

    # XXX: First time, decompression will not be necessary (by using fsrc),
    # but it will happen nonetheless
    return gzip.GzipFile(local_path, 'rb')
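
# Illustrative sketch (assumed call; the path is one this module really
# builds from _SEARCH_NAME): streams the dataset listing endpoint,
# transparently gunzipping, and with a non-None data_home it would also
# populate the on-disk cache.
#
#   >>> with closing(_open_openml_url("api/v1/json/data/list/"
#   ...                               "data_name/iris/limit/2", None)) as fh:
#   ...     payload = fh.read()
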
def _get_json_content_from_openml_api(url, error_message, raise_if_error,
                                      data_home):
    """
    Loads json data from the openml api.

    Parameters
    ----------
    url : str
        The URL to load from. Should be an official OpenML endpoint.

    error_message : str or None
        The error message to raise if an acceptable OpenML error is thrown
        (an acceptable error is, e.g., a data id that was not found; other
        errors, such as a 404, raise their native error message).

    raise_if_error : bool
        Whether to raise an error if OpenML returns an acceptable error
        (e.g., data not found). If this argument is set to False, None is
        returned in case of acceptable errors. Note that all other errors
        (e.g., 404) will still be raised as normal.

    data_home : str or None
        Location to cache the response. None if no cache is required.

    Returns
    -------
    json_data : json or None
        The json result from the OpenML server if the call was successful;
        None otherwise, iff raise_if_error was set to False and the error
        was ``acceptable``.
    """

    @_retry_with_clean_cache(url, data_home)
    def _load_json():
        with closing(_open_openml_url(url, data_home)) as response:
            return json.loads(response.read().decode("utf-8"))

    try:
        return _load_json()
    except HTTPError as error:
        # 412 is an OpenML specific error code, indicating a generic error
        # (e.g., data not found)
        if error.code != 412:
            raise error

    # 412 error, not in except for nicer traceback
    if raise_if_error:
        raise ValueError(error_message)
    return None
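
# Illustrative sketch (assumed dataset name): fetch the listing for 'iris'
# and tolerate OpenML's 412 "not found" answer instead of raising, in which
# case json_data is None.
#
#   >>> json_data = _get_json_content_from_openml_api(
#   ...     _SEARCH_NAME.format('iris'), None, raise_if_error=False,
#   ...     data_home=None)
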
def _split_sparse_columns(arff_data, include_columns):
    """
    Obtains several columns from sparse arff representation. Additionally,
    the column indices are re-labelled, given the columns that are not
    included (e.g., when including [1, 2, 3], the columns will be
    relabelled to [0, 1, 2]).

    Parameters
    ----------
    arff_data : tuple
        A tuple of three lists of equal size; the first list indicating the
        value, the second the row index and the third the column index.

    include_columns : list
        A list of columns to include.

    Returns
    -------
    arff_data_new : tuple
        Subset of arff data with only the columns indicated by the
        include_columns argument.
    """
    arff_data_new = (list(), list(), list())
    reindexed_columns = {column_idx: array_idx for array_idx, column_idx
                         in enumerate(include_columns)}
    for val, row_idx, col_idx in zip(arff_data[0], arff_data[1], arff_data[2]):
        if col_idx in include_columns:
            arff_data_new[0].append(val)
            arff_data_new[1].append(row_idx)
            arff_data_new[2].append(reindexed_columns[col_idx])
    return arff_data_new
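
# Worked example (toy data): keeping columns 1 and 3 of a COO-style triple
# (values, row indices, column indices) relabels them to 0 and 1.
#
#   >>> data = ([10, 20, 30, 40], [0, 0, 1, 1], [0, 1, 3, 0])
#   >>> _split_sparse_columns(data, [1, 3])
#   ([20, 30], [0, 1], [0, 1])
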
def _sparse_data_to_array(arff_data, include_columns):
    # turns the sparse data back into an array (we can't use the toarray()
    # function, as it only works on numeric data)
    num_obs = max(arff_data[1]) + 1
    y_shape = (num_obs, len(include_columns))
    reindexed_columns = {column_idx: array_idx for array_idx, column_idx
                         in enumerate(include_columns)}
    # TODO: improve for efficiency
    y = np.empty(y_shape, dtype=np.float64)
    for val, row_idx, col_idx in zip(arff_data[0], arff_data[1], arff_data[2]):
        if col_idx in include_columns:
            y[row_idx, reindexed_columns[col_idx]] = val
    return y
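
# Worked example (same toy triple as above): densifying column 0 yields one
# float column with one entry per row. Note that positions absent from the
# triple stay uninitialized, since the buffer comes from np.empty.
#
#   >>> data = ([10, 20, 30, 40], [0, 0, 1, 1], [0, 1, 3, 0])
#   >>> _sparse_data_to_array(data, [0])
#   array([[10.],
#          [40.]])
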
def _convert_arff_data(arff, col_slice_x, col_slice_y, shape=None):
    """
    Converts the arff object into the appropriate matrix type (np.array or
    scipy.sparse.csr_matrix) based on the 'data part' (i.e., in the
    liac-arff dict, the object from the 'data' key).

    Parameters
    ----------
    arff : dict
        As obtained from liac-arff object.

    col_slice_x : list
        The column indices that are sliced from the original array to return
        as X data.

    col_slice_y : list
        The column indices that are sliced from the original array to return
        as y data.

    Returns
    -------
    X : np.array or scipy.sparse.csr_matrix
    y : np.array
    """
    arff_data = arff['data']
    if isinstance(arff_data, Generator):
        if shape[0] == -1:
            count = -1
        else:
            count = shape[0] * shape[1]
        data = np.fromiter(itertools.chain.from_iterable(arff_data),
                           dtype='float64', count=count)
        data = data.reshape(*shape)
        X = data[:, col_slice_x]
        y = data[:, col_slice_y]
        return X, y
    elif isinstance(arff_data, tuple):
        arff_data_X = _split_sparse_columns(arff_data, col_slice_x)
        num_obs = max(arff_data[1]) + 1
        X_shape = (num_obs, len(col_slice_x))
        X = scipy.sparse.coo_matrix(
            (arff_data_X[0], (arff_data_X[1], arff_data_X[2])),
            shape=X_shape, dtype=np.float64)
        X = X.tocsr()
        y = _sparse_data_to_array(arff_data, col_slice_y)
        return X, y
    else:
        # This should never happen
        raise ValueError('Unexpected Data Type obtained from arff.')
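
# Illustrative sketch (toy dense generator; real input comes from
# liac-arff): with shape=(2, 3), columns [0, 1] become X and column [2]
# becomes y.
#
#   >>> arff = {'data': (row for row in [[1., 2., 0.], [3., 4., 1.]])}
#   >>> X, y = _convert_arff_data(arff, [0, 1], [2], shape=(2, 3))
#   >>> X.shape, y.shape
#   ((2, 2), (2, 1))
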
def _feature_to_dtype(feature):
    """Map feature to dtype for pandas DataFrame."""
    if feature['data_type'] == 'string':
        return object
    elif feature['data_type'] == 'nominal':
        return 'category'
    # only numeric, integer, real are left
    elif (feature['number_of_missing_values'] != '0' or
          feature['data_type'] in ['numeric', 'real']):
        # cast to floats when there are any missing values
        return np.float64
    elif feature['data_type'] == 'integer':
        return np.int64
    raise ValueError('Unsupported feature: {}'.format(feature))
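
# Illustrative sketch (minimal feature dict; real OpenML feature entries
# carry more keys): an integer column with missing values is promoted to
# float64 so the gaps can be encoded as NaN.
#
#   >>> _feature_to_dtype({'data_type': 'integer',
#   ...                    'number_of_missing_values': '3'})
#   <class 'numpy.float64'>
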
def _convert_arff_data_dataframe(arff, columns, features_dict):
    """Convert the ARFF object into a pandas DataFrame.

    Parameters
    ----------
    arff : dict
        As obtained from liac-arff object.

    columns : list
        Columns from dataframe to return.

    features_dict : dict
        Maps feature name to feature info from openml.

    Returns
    -------
    result : tuple
        tuple with the resulting dataframe
    """
    pd = check_pandas_support('fetch_openml with as_frame=True')

    attributes = OrderedDict(arff['attributes'])
    arff_columns = list(attributes)

    # calculate chunksize
    first_row = next(arff['data'])
    first_df = pd.DataFrame([first_row], columns=arff_columns)

    row_bytes = first_df.memory_usage(deep=True).sum()
    chunksize = get_chunk_n_rows(row_bytes)

    # read arff data with chunks
    columns_to_keep = [col for col in arff_columns if col in columns]
    dfs = []
    dfs.append(first_df[columns_to_keep])
    for data in _chunk_generator(arff['data'], chunksize):
        dfs.append(pd.DataFrame(data, columns=arff_columns)[columns_to_keep])
    df = pd.concat(dfs, ignore_index=True)

    for column in columns_to_keep:
        dtype = _feature_to_dtype(features_dict[column])
        if dtype == 'category':
            dtype = pd.api.types.CategoricalDtype(attributes[column])
        df[column] = df[column].astype(dtype, copy=False)
    return (df, )
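
# Sizing sketch (assumed numbers, not measured): with sklearn's default
# working_memory of 1024 MiB and a first row of ~1024 bytes,
# get_chunk_n_rows allows roughly a million rows per chunk, so large
# datasets are parsed as a handful of intermediate DataFrames before
# concatenation.
#
#   >>> int(get_chunk_n_rows(row_bytes=1024))
#   1048576
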
def _get_data_info_by_name(name, version, data_home):
    """
    Utilizes the openml dataset listing api to find a dataset by
    name/version.
    OpenML api function:
    https://www.openml.org/api_docs#!/data/get_data_list_data_name_data_name

    Parameters
    ----------
    name : str
        name of the dataset

    version : int or str
        If version is an integer, the exact name/version will be obtained
        from OpenML. If version is a string (value: "active") it will take
        the first version from OpenML that is annotated as active. Any other
        string values except "active" are treated as integer.

    data_home : str or None
        Location to cache the response. None if no cache is required.

    Returns
    -------
    first_dataset : json
        json representation of the first dataset object that adhered to the
        search criteria
    """
    if version == "active":
        # situation in which we return the oldest active version
        url = _SEARCH_NAME.format(name) + "/status/active/"
        error_msg = "No active dataset {} found.".format(name)
        json_data = _get_json_content_from_openml_api(url, error_msg, True,
                                                      data_home)
        res = json_data['data']['dataset']
        if len(res) > 1:
            warn("Multiple active versions of the dataset matching the name"
                 " {name} exist. Versions may be fundamentally different, "
                 "returning version"
                 " {version}.".format(name=name, version=res[0]['version']))
        return res[0]

    # an integer version has been provided
    url = (_SEARCH_NAME + "/data_version/{}").format(name, version)
    json_data = _get_json_content_from_openml_api(url, None, False,
                                                  data_home)
    if json_data is None:
        # we can do this in 1 function call if OpenML does not require the
        # specification of the dataset status (i.e., return datasets with a
        # given name / version regardless of active, deactivated, etc. )
        # TODO: feature request OpenML.
        url += "/status/deactivated"
        error_msg = "Dataset {} with version {} not found.".format(name,
                                                                   version)
        json_data = _get_json_content_from_openml_api(url, error_msg, True,
                                                      data_home)

    return json_data['data']['dataset'][0]
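
# Illustrative sketch (assumed dataset name): the two URL shapes this
# function builds for name-based lookup.
#
#   >>> _SEARCH_NAME.format('iris') + "/status/active/"
#   'api/v1/json/data/list/data_name/iris/limit/2/status/active/'
#   >>> (_SEARCH_NAME + "/data_version/{}").format('iris', 1)
#   'api/v1/json/data/list/data_name/iris/limit/2/data_version/1'
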
def _get_data_description_by_id(data_id, data_home):
    # OpenML API function: https://www.openml.org/api_docs#!/data/get_data_id
    url = _DATA_INFO.format(data_id)
    error_message = "Dataset with data_id {} not found.".format(data_id)
    json_data = _get_json_content_from_openml_api(url, error_message, True,
                                                  data_home)
    return json_data['data_set_description']
def _get_data_features(data_id, data_home):
    # OpenML function:
    # https://www.openml.org/api_docs#!/data/get_data_features_id
    url = _DATA_FEATURES.format(data_id)
    error_message = "Dataset with data_id {} not found.".format(data_id)
    json_data = _get_json_content_from_openml_api(url, error_message, True,
                                                  data_home)
    return json_data['data_features']['feature']
def _get_data_qualities(data_id, data_home):
    # OpenML API function:
    # https://www.openml.org/api_docs#!/data/get_data_qualities_id
    url = _DATA_QUALITIES.format(data_id)
    error_message = "Dataset with data_id {} not found.".format(data_id)
    json_data = _get_json_content_from_openml_api(url, error_message, True,
                                                  data_home)
    try:
        return json_data['data_qualities']['quality']
    except KeyError:
        # the qualities might not be available, but we still try to process
        # the data
        return None
def _get_num_samples(data_qualities):
    """Get the number of samples from data qualities.

    Parameters
    ----------
    data_qualities : list of dict
        Used to retrieve the number of instances (samples) in the dataset.

    Returns
    -------
    n_samples : int
        The number of samples in the dataset or -1 if data qualities are
        unavailable.
    """
    # If the data qualities are unavailable, we return -1
    default_n_samples = -1

    if data_qualities is None:
        return default_n_samples

    qualities = {d['name']: d['value'] for d in data_qualities}
    return int(float(qualities.get('NumberOfInstances', default_n_samples)))
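
# Worked example (toy qualities list mimicking the shape of the OpenML
# payload): values arrive as strings, hence the float-then-int round trip.
#
#   >>> _get_num_samples([{'name': 'NumberOfInstances', 'value': '150.0'}])
#   150
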
def _load_arff_response(url, data_home, return_type, encode_nominal,
                        parse_arff):
    """Load arff data from url and parse the arff response with
    parse_arff."""
    response = _open_openml_url(url, data_home)

    with closing(response):
        # Note that if the data is dense, no reading is done until the data
        # generator is iterated.
        arff = _arff.load((line.decode('utf-8') for line in response),
                          return_type=return_type,
                          encode_nominal=encode_nominal)
        return parse_arff(arff)
def _download_data_to_bunch(url, sparse, data_home, *,
                            as_frame, features_list, data_columns,
                            target_columns, shape):
    """Download OpenML ARFF and convert to Bunch of data."""
    # NB: this function is long in order to handle retry for any failure
    # during the streaming parse of the ARFF.

    # Prepare which columns and data types should be returned for the X and y
    features_dict = {feature['name']: feature for feature in features_list}

    # XXX: col_slice_y should be all nominal or all numeric
    _verify_target_data_type(features_dict, target_columns)

    col_slice_y = [int(features_dict[col_name]['index'])
                   for col_name in target_columns]

    col_slice_x = [int(features_dict[col_name]['index'])
                   for col_name in data_columns]
    for col_idx in col_slice_y:
        feat = features_list[col_idx]
        nr_missing = int(feat['number_of_missing_values'])
        if nr_missing > 0:
            raise ValueError('Target column {} has {} missing values. '
                             'Missing values are not supported for target '
                             'columns. '.format(feat['name'], nr_missing))

    # Access an ARFF file on the OpenML server. Documentation:
    # https://www.openml.org/api_data_docs#!/data/get_download_id

    if sparse is True:
        return_type = _arff.COO
    else:
        return_type = _arff.DENSE_GEN

    frame = nominal_attributes = None
    if as_frame:
        columns = data_columns + target_columns
        parse_arff = partial(_convert_arff_data_dataframe, columns=columns,
                             features_dict=features_dict)

        def postprocess(frame):  # type: ignore
            X = frame[data_columns]
            if len(target_columns) >= 2:
                y = frame[target_columns]
            elif len(target_columns) == 1:
                y = frame[target_columns[0]]
            else:
                y = None
            return X, y, frame, nominal_attributes
    else:
        def parse_arff(arff):
            X, y = _convert_arff_data(arff, col_slice_x, col_slice_y, shape)
            # nominal attributes is a dict mapping from the attribute name
            # to the possible values. Includes also the target column (which
            # will be popped off below, before it will be packed in the
            # Bunch object)
            nominal_attributes = {k: v for k, v in arff['attributes']
                                  if isinstance(v, list) and
                                  k in data_columns + target_columns}
            return X, y, nominal_attributes

        def postprocess(X, y, nominal_attributes):  # type: ignore
            is_classification = {col_name in nominal_attributes
                                 for col_name in target_columns}
            if not is_classification:
                # No target
                pass
            elif all(is_classification):
                y = np.hstack([
                    np.take(
                        np.asarray(nominal_attributes.pop(col_name),
                                   dtype='O'),
                        y[:, i:i + 1].astype(int, copy=False))
                    for i, col_name in enumerate(target_columns)
                ])
            elif any(is_classification):
                raise ValueError('Mix of nominal and non-nominal targets is '
                                 'not currently supported')

            # reshape y back to 1-D array, if there is only 1 target column;
            # back to None if there are no target columns
            if y.shape[1] == 1:
                y = y.reshape((-1,))
            elif y.shape[1] == 0:
                y = None
            return X, y, frame, nominal_attributes

    out = _retry_with_clean_cache(url, data_home)(
        _load_arff_response)(url, data_home,
                             return_type=return_type,
                             encode_nominal=not as_frame,
                             parse_arff=parse_arff)
    X, y, frame, nominal_attributes = postprocess(*out)

    return Bunch(data=X, target=y, frame=frame,
                 categories=nominal_attributes,
                 feature_names=data_columns,
                 target_names=target_columns)
def _verify_target_data_type(features_dict, target_columns):
    # verifies the data type of the y array in case there are multiple
    # targets (throws an error if these targets do not comply with sklearn
    # support)
    if not isinstance(target_columns, list):
        raise ValueError('target_column should be list, '
                         'got: %s' % type(target_columns))
    found_types = set()
    for target_column in target_columns:
        if target_column not in features_dict:
            raise KeyError('Could not find target_column={}'
                           .format(target_column))
        if features_dict[target_column]['data_type'] == "numeric":
            found_types.add(np.float64)
        else:
            found_types.add(object)

        # note: we compare to a string, not boolean
        if features_dict[target_column]['is_ignore'] == 'true':
            warn('target_column={} has flag is_ignore.'.format(
                target_column))
        if features_dict[target_column]['is_row_identifier'] == 'true':
            warn('target_column={} has flag is_row_identifier.'.format(
                target_column))
    if len(found_types) > 1:
        raise ValueError('Can only handle homogeneous multi-target datasets, '
                         'i.e., all targets are either numeric or '
                         'categorical.')
def _valid_data_column_names(features_list, target_columns):
    # logic for determining which columns can be learned on. Note that from
    # the OpenML guide it follows that columns carrying the
    # `is_row_identifier` or `is_ignore` flag cannot be learned on. Target
    # columns are excluded as well.
    valid_data_column_names = []
    for feature in features_list:
        if (feature['name'] not in target_columns
                and feature['is_ignore'] != 'true'
                and feature['is_row_identifier'] != 'true'):
            valid_data_column_names.append(feature['name'])
    return valid_data_column_names
@_deprecate_positional_args
def fetch_openml(name=None, *, version='active', data_id=None, data_home=None,
                 target_column='default-target', cache=True, return_X_y=False,
                 as_frame=False):
    """Fetch dataset from openml by name or dataset id.

    Datasets are uniquely identified by either an integer ID or by a
    combination of name and version (i.e. there might be multiple
    versions of the 'iris' dataset). Please give either name or data_id
    (not both). In case a name is given, a version can also be
    provided.

    Read more in the :ref:`User Guide <openml>`.

    .. versionadded:: 0.20

    .. note:: EXPERIMENTAL

        The API is experimental (particularly the return value structure),
        and might have small backward-incompatible changes in future
        releases.

    Parameters
    ----------
    name : str or None
        String identifier of the dataset. Note that OpenML can have multiple
        datasets with the same name.

    version : integer or 'active', default='active'
        Version of the dataset. Can only be provided if also ``name`` is
        given. If 'active' the oldest version that's still active is used.
        Since there may be more than one active version of a dataset, and
        those versions may fundamentally be different from one another,
        setting an exact version is highly recommended.

    data_id : int or None
        OpenML ID of the dataset. The most specific way of retrieving a
        dataset. If data_id is not given, name (and potential version) are
        used to obtain a dataset.

    data_home : string or None, default=None
        Specify another download and cache folder for the data sets. By
        default all scikit-learn data is stored in '~/scikit_learn_data'
        subfolders.

    target_column : string, list or None, default='default-target'
        Specify the column name in the data to use as target. If
        'default-target', the standard target column as stored on the server
        is used. If ``None``, all columns are returned as data and the
        target is ``None``. If list (of strings), all columns with these
        names are returned as multi-target (Note: not all scikit-learn
        classifiers can handle all types of multi-output combinations).

    cache : boolean, default=True
        Whether to cache downloaded datasets using joblib.

    return_X_y : boolean, default=False
        If True, returns ``(data, target)`` instead of a Bunch object. See
        below for more information about the `data` and `target` objects.

    as_frame : boolean, default=False
        If True, the data is a pandas DataFrame including columns with
        appropriate dtypes (numeric, string or categorical). The target is
        a pandas DataFrame or Series depending on the number of
        target_columns. The Bunch will contain a ``frame`` attribute with
        the target and the data. If ``return_X_y`` is True, then
        ``(data, target)`` will be pandas DataFrames or Series as described
        above.

    Returns
    -------
    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        data : np.array, scipy.sparse.csr_matrix of floats, or pandas DataFrame
            The feature matrix. Categorical features are encoded as ordinals.
        target : np.array, pandas Series or DataFrame
            The regression target or classification labels, if applicable.
            Dtype is float if numeric, and object if categorical. If
            ``as_frame`` is True, ``target`` is a pandas object.
        DESCR : str
            The full description of the dataset
        feature_names : list
            The names of the dataset columns
        target_names : list
            The names of the target columns

            .. versionadded:: 0.22

        categories : dict or None
            Maps each categorical feature name to a list of values, such
            that the value encoded as i is ith in the list. If ``as_frame``
            is True, this is None.
        details : dict
            More metadata from OpenML
        frame : pandas DataFrame
            Only present when `as_frame=True`. DataFrame with ``data`` and
            ``target``.

    (data, target) : tuple if ``return_X_y`` is True

        .. note:: EXPERIMENTAL

            This interface is **experimental** and subsequent releases may
            change attributes without notice (although there should only be
            minor changes to ``data`` and ``target``).

    Missing values in the 'data' are represented as NaN's. Missing values
    in 'target' are represented as NaN's (numerical target) or None
    (categorical target).
    """
    data_home = get_data_home(data_home=data_home)
    data_home = join(data_home, 'openml')
    if cache is False:
        # no caching will be applied
        data_home = None

    # check valid function arguments. data_id XOR (name, version) should be
    # provided
    if name is not None:
        # OpenML is case-insensitive, but the caching mechanism is not;
        # convert all data names (str) to lower case
        name = name.lower()
        if data_id is not None:
            raise ValueError(
                "Dataset data_id={} and name={} passed, but you can only "
                "specify a numeric data_id or a name, not "
                "both.".format(data_id, name))
        data_info = _get_data_info_by_name(name, version, data_home)
        data_id = data_info['did']
    elif data_id is not None:
        # from the previous if statement, it is given that name is None
        if version != "active":
            raise ValueError(
                "Dataset data_id={} and version={} passed, but you can only "
                "specify a numeric data_id or a version, not "
                "both.".format(data_id, version))
    else:
        raise ValueError(
            "Neither name nor data_id are provided. Please provide name or "
            "data_id.")

    data_description = _get_data_description_by_id(data_id, data_home)
    if data_description['status'] != "active":
        warn("Version {} of dataset {} is inactive, meaning that issues have "
             "been found in the dataset. Try using a newer version from "
             "this URL: {}".format(
                 data_description['version'],
                 data_description['name'],
                 data_description['url']))
    if 'error' in data_description:
        warn("OpenML registered a problem with the dataset. It might be "
             "unusable. Error: {}".format(data_description['error']))
    if 'warning' in data_description:
        warn("OpenML raised a warning on the dataset. It might be "
             "unusable. Warning: {}".format(data_description['warning']))

    return_sparse = False
    if data_description['format'].lower() == 'sparse_arff':
        return_sparse = True

    if as_frame and return_sparse:
        raise ValueError('Cannot return dataframe with sparse data')

    # download data features, meta-info about column types
    features_list = _get_data_features(data_id, data_home)

    if not as_frame:
        for feature in features_list:
            if 'true' in (feature['is_ignore'], feature['is_row_identifier']):
                continue
            if feature['data_type'] == 'string':
                raise ValueError('STRING attributes are not supported for '
                                 'array representation. Try as_frame=True')

    if target_column == "default-target":
        # determines the default target based on the data feature results
        # (which is currently more reliable than the data description;
        # see issue: https://github.com/openml/OpenML/issues/768)
        target_columns = [feature['name'] for feature in features_list
                          if feature['is_target'] == 'true']
    elif isinstance(target_column, str):
        # for code-simplicity, make target_column by default a list
        target_columns = [target_column]
    elif target_column is None:
        target_columns = []
    elif isinstance(target_column, list):
        target_columns = target_column
    else:
        raise TypeError("Did not recognize type of target_column. "
                        "Should be str, list or None. Got: "
                        "{}".format(type(target_column)))
    data_columns = _valid_data_column_names(features_list,
                                            target_columns)

    # determine arff encoding to return
    if not return_sparse:
        # The shape must include the ignored features to keep the right
        # indexes during the arff data conversion.
        data_qualities = _get_data_qualities(data_id, data_home)
        shape = _get_num_samples(data_qualities), len(features_list)
    else:
        shape = None

    # obtain the data
    url = _DATA_FILE.format(data_description['file_id'])
    bunch = _download_data_to_bunch(url, return_sparse, data_home,
                                    as_frame=as_frame,
                                    features_list=features_list, shape=shape,
                                    target_columns=target_columns,
                                    data_columns=data_columns)

    if return_X_y:
        return bunch.data, bunch.target

    description = "{}\n\nDownloaded from openml.org.".format(
        data_description.pop('description'))

    bunch.update(
        DESCR=description, details=data_description,
        url="https://www.openml.org/d/{}".format(data_id))

    return bunch
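
# Usage sketch (runs against the live OpenML server, so treat the exact
# shapes as illustrative; 'mnist_784' is a well-known OpenML dataset name):
#
#   >>> from sklearn.datasets import fetch_openml
#   >>> mnist = fetch_openml('mnist_784', version=1)
#   >>> mnist.data.shape, mnist.target.shape
#   ((70000, 784), (70000,))
#   >>> X, y = fetch_openml('mnist_784', version=1, return_X_y=True)
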