Uploaded Test files

Batuhan Berk Başoğlu 2020-11-12 11:05:57 -05:00
parent f584ad9d97
commit 2e81cb7d99
16627 changed files with 2065359 additions and 102444 deletions

@@ -0,0 +1,158 @@
"""
The :mod:`sklearn.metrics` module includes score functions, performance
metrics, pairwise metrics, and distance computations.
"""
from ._ranking import auc
from ._ranking import average_precision_score
from ._ranking import coverage_error
from ._ranking import dcg_score
from ._ranking import label_ranking_average_precision_score
from ._ranking import label_ranking_loss
from ._ranking import ndcg_score
from ._ranking import precision_recall_curve
from ._ranking import roc_auc_score
from ._ranking import roc_curve
from ._classification import accuracy_score
from ._classification import balanced_accuracy_score
from ._classification import classification_report
from ._classification import cohen_kappa_score
from ._classification import confusion_matrix
from ._classification import f1_score
from ._classification import fbeta_score
from ._classification import hamming_loss
from ._classification import hinge_loss
from ._classification import jaccard_score
from ._classification import log_loss
from ._classification import matthews_corrcoef
from ._classification import precision_recall_fscore_support
from ._classification import precision_score
from ._classification import recall_score
from ._classification import zero_one_loss
from ._classification import brier_score_loss
from ._classification import multilabel_confusion_matrix
from . import cluster
from .cluster import adjusted_mutual_info_score
from .cluster import adjusted_rand_score
from .cluster import completeness_score
from .cluster import consensus_score
from .cluster import homogeneity_completeness_v_measure
from .cluster import homogeneity_score
from .cluster import mutual_info_score
from .cluster import normalized_mutual_info_score
from .cluster import fowlkes_mallows_score
from .cluster import silhouette_samples
from .cluster import silhouette_score
from .cluster import calinski_harabasz_score
from .cluster import v_measure_score
from .cluster import davies_bouldin_score
from .pairwise import euclidean_distances
from .pairwise import nan_euclidean_distances
from .pairwise import pairwise_distances
from .pairwise import pairwise_distances_argmin
from .pairwise import pairwise_distances_argmin_min
from .pairwise import pairwise_kernels
from .pairwise import pairwise_distances_chunked
from ._regression import explained_variance_score
from ._regression import max_error
from ._regression import mean_absolute_error
from ._regression import mean_squared_error
from ._regression import mean_squared_log_error
from ._regression import median_absolute_error
from ._regression import r2_score
from ._regression import mean_tweedie_deviance
from ._regression import mean_poisson_deviance
from ._regression import mean_gamma_deviance
from ._scorer import check_scoring
from ._scorer import make_scorer
from ._scorer import SCORERS
from ._scorer import get_scorer
from ._plot.roc_curve import plot_roc_curve
from ._plot.roc_curve import RocCurveDisplay
from ._plot.precision_recall_curve import plot_precision_recall_curve
from ._plot.precision_recall_curve import PrecisionRecallDisplay
from ._plot.confusion_matrix import plot_confusion_matrix
from ._plot.confusion_matrix import ConfusionMatrixDisplay
__all__ = [
'accuracy_score',
'adjusted_mutual_info_score',
'adjusted_rand_score',
'auc',
'average_precision_score',
'balanced_accuracy_score',
'calinski_harabasz_score',
'check_scoring',
'classification_report',
'cluster',
'cohen_kappa_score',
'completeness_score',
'ConfusionMatrixDisplay',
'confusion_matrix',
'consensus_score',
'coverage_error',
'dcg_score',
'davies_bouldin_score',
'euclidean_distances',
'explained_variance_score',
'f1_score',
'fbeta_score',
'fowlkes_mallows_score',
'get_scorer',
'hamming_loss',
'hinge_loss',
'homogeneity_completeness_v_measure',
'homogeneity_score',
'jaccard_score',
'label_ranking_average_precision_score',
'label_ranking_loss',
'log_loss',
'make_scorer',
'nan_euclidean_distances',
'matthews_corrcoef',
'max_error',
'mean_absolute_error',
'mean_squared_error',
'mean_squared_log_error',
'mean_poisson_deviance',
'mean_gamma_deviance',
'mean_tweedie_deviance',
'median_absolute_error',
'multilabel_confusion_matrix',
'mutual_info_score',
'ndcg_score',
'normalized_mutual_info_score',
'pairwise_distances',
'pairwise_distances_argmin',
'pairwise_distances_argmin_min',
'pairwise_distances_chunked',
'pairwise_kernels',
'plot_confusion_matrix',
'plot_precision_recall_curve',
'plot_roc_curve',
'PrecisionRecallDisplay',
'precision_recall_curve',
'precision_recall_fscore_support',
'precision_score',
'r2_score',
'recall_score',
'RocCurveDisplay',
'roc_auc_score',
'roc_curve',
'SCORERS',
'silhouette_samples',
'silhouette_score',
'v_measure_score',
'zero_one_loss',
'brier_score_loss',
]
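
A quick usage sketch (not part of the commit) of the names re-exported above; ``accuracy_score`` and ``confusion_matrix`` are both in the ``__all__`` list:

from sklearn.metrics import accuracy_score, confusion_matrix

y_true = [0, 1, 1, 0, 1]
y_pred = [0, 1, 0, 0, 1]
print(accuracy_score(y_true, y_pred))    # 0.8 (4 of 5 correct)
print(confusion_matrix(y_true, y_pred))  # [[2 0]
                                         #  [1 2]]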

@@ -0,0 +1,202 @@
"""
Common code for all metrics
"""
# Authors: Alexandre Gramfort <alexandre.gramfort@inria.fr>
# Mathieu Blondel <mathieu@mblondel.org>
# Olivier Grisel <olivier.grisel@ensta.org>
# Arnaud Joly <a.joly@ulg.ac.be>
# Jochen Wersdorfer <jochen@wersdoerfer.de>
# Lars Buitinck
# Joel Nothman <joel.nothman@gmail.com>
# Noel Dawe <noel@dawe.me>
# License: BSD 3 clause
from itertools import combinations
import numpy as np
from ..utils import check_array, check_consistent_length
from ..utils.multiclass import type_of_target
def _average_binary_score(binary_metric, y_true, y_score, average,
sample_weight=None):
"""Average a binary metric for multilabel classification
Parameters
----------
y_true : array, shape = [n_samples] or [n_samples, n_classes]
True binary labels in binary label indicators.
y_score : array, shape = [n_samples] or [n_samples, n_classes]
Target scores: probability estimates of the positive class,
confidence values, or binary decisions.
average : string, [None, 'micro', 'macro' (default), 'samples', 'weighted']
If ``None``, the scores for each class are returned. Otherwise,
this determines the type of averaging performed on the data:
``'micro'``:
Calculate metrics globally by considering each element of the label
indicator matrix as a label.
``'macro'``:
Calculate metrics for each label, and find their unweighted
mean. This does not take label imbalance into account.
``'weighted'``:
Calculate metrics for each label, and find their average, weighted
by support (the number of true instances for each label).
``'samples'``:
Calculate metrics for each instance, and find their average.
Will be ignored when ``y_true`` is binary.
sample_weight : array-like of shape (n_samples,), default=None
Sample weights.
binary_metric : callable, returns shape [n_classes]
The binary metric function to use.
Returns
-------
score : float or array of shape [n_classes]
If ``average`` is not ``None``, the averaged score; otherwise, the
score for each class.
"""
average_options = (None, 'micro', 'macro', 'weighted', 'samples')
if average not in average_options:
raise ValueError('average has to be one of {0}'
''.format(average_options))
y_type = type_of_target(y_true)
if y_type not in ("binary", "multilabel-indicator"):
raise ValueError("{0} format is not supported".format(y_type))
if y_type == "binary":
return binary_metric(y_true, y_score, sample_weight=sample_weight)
check_consistent_length(y_true, y_score, sample_weight)
y_true = check_array(y_true)
y_score = check_array(y_score)
not_average_axis = 1
score_weight = sample_weight
average_weight = None
if average == "micro":
if score_weight is not None:
score_weight = np.repeat(score_weight, y_true.shape[1])
y_true = y_true.ravel()
y_score = y_score.ravel()
elif average == 'weighted':
if score_weight is not None:
average_weight = np.sum(np.multiply(
y_true, np.reshape(score_weight, (-1, 1))), axis=0)
else:
average_weight = np.sum(y_true, axis=0)
if np.isclose(average_weight.sum(), 0.0):
return 0
elif average == 'samples':
# swap average_weight <-> score_weight
average_weight = score_weight
score_weight = None
not_average_axis = 0
if y_true.ndim == 1:
y_true = y_true.reshape((-1, 1))
if y_score.ndim == 1:
y_score = y_score.reshape((-1, 1))
n_classes = y_score.shape[not_average_axis]
score = np.zeros((n_classes,))
for c in range(n_classes):
y_true_c = y_true.take([c], axis=not_average_axis).ravel()
y_score_c = y_score.take([c], axis=not_average_axis).ravel()
score[c] = binary_metric(y_true_c, y_score_c,
sample_weight=score_weight)
# Average the results
if average is not None:
if average_weight is not None:
# Scores with 0 weights are forced to be 0, preventing the average
# score from being affected by 0-weighted NaN elements.
average_weight = np.asarray(average_weight)
score[average_weight == 0] = 0
return np.average(score, weights=average_weight)
else:
return score
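# Comment-only sketch of the macro path above: for a label-indicator
# ``y_true`` of shape (n_samples, n_classes), the result is equivalent to
#     np.mean([binary_metric(y_true[:, c], y_score[:, c])
#              for c in range(y_true.shape[1])])
# while 'weighted' replaces the uniform mean with weights equal to each
# column's positive-class support.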
def _average_multiclass_ovo_score(binary_metric, y_true, y_score,
average='macro'):
"""Average one-versus-one scores for multiclass classification.
Uses the binary metric for one-vs-one multiclass classification,
where the score is computed according to the Hand & Till (2001) algorithm.
Parameters
----------
binary_metric : callable
The binary metric function to use that accepts the following as input
y_true_target : array, shape = [n_samples_target]
Some sub-array of y_true for a pair of classes designated
positive and negative in the one-vs-one scheme.
y_score_target : array, shape = [n_samples_target]
Scores corresponding to the probability estimates
of a sample belonging to the designated positive class label
y_true : array-like of shape (n_samples,)
True multiclass labels.
y_score : array-like of shape (n_samples, n_classes)
Target scores corresponding to probability estimates of a sample
belonging to a particular class
average : 'macro' or 'weighted', optional (default='macro')
Determines the type of averaging performed on the pairwise binary
metric scores
``'macro'``:
Calculate metrics for each label, and find their unweighted
mean. This does not take label imbalance into account. Classes
are assumed to be uniformly distributed.
``'weighted'``:
Calculate metrics for each label, taking into account the
prevalence of the classes.
Returns
-------
score : float
Average of the pairwise binary metric scores
"""
check_consistent_length(y_true, y_score)
y_true_unique = np.unique(y_true)
n_classes = y_true_unique.shape[0]
n_pairs = n_classes * (n_classes - 1) // 2
pair_scores = np.empty(n_pairs)
is_weighted = average == "weighted"
prevalence = np.empty(n_pairs) if is_weighted else None
# Compute scores treating a as positive class and b as negative class,
# then b as positive class and a as negative class
for ix, (a, b) in enumerate(combinations(y_true_unique, 2)):
a_mask = y_true == a
b_mask = y_true == b
ab_mask = np.logical_or(a_mask, b_mask)
if is_weighted:
prevalence[ix] = np.average(ab_mask)
a_true = a_mask[ab_mask]
b_true = b_mask[ab_mask]
a_true_score = binary_metric(a_true, y_score[ab_mask, a])
b_true_score = binary_metric(b_true, y_score[ab_mask, b])
pair_scores[ix] = (a_true_score + b_true_score) / 2
return np.average(pair_scores, weights=prevalence)
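
A runnable sketch of the one-vs-one averaging above, with ``roc_auc_score`` as the binary metric; the private import path ``sklearn.metrics._base`` is an assumption inferred from the relative imports elsewhere in this commit:

import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.metrics._base import _average_multiclass_ovo_score

rng = np.random.RandomState(0)
y_true = rng.randint(0, 3, size=30)           # three classes: 0, 1, 2
y_score = rng.dirichlet(np.ones(3), size=30)  # probability rows summing to 1
macro_ovo = _average_multiclass_ovo_score(roc_auc_score, y_true, y_score)
# averages the three pairwise AUCs, each pair score itself being the mean of
# the (a positive, b negative) and (b positive, a negative) directions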

File diff suppressed because it is too large

@@ -0,0 +1,40 @@
def _check_classifer_response_method(estimator, response_method):
"""Return prediction method from the response_method
Parameters
----------
estimator: object
Classifier to check
response_method: {'auto', 'predict_proba', 'decision_function'}
Specifies whether to use :term:`predict_proba` or
:term:`decision_function` as the target response. If set to 'auto',
:term:`predict_proba` is tried first and if it does not exist
:term:`decision_function` is tried next.
Returns
-------
prediction_method: callable
prediction method of estimator
"""
if response_method not in ("predict_proba", "decision_function", "auto"):
raise ValueError("response_method must be 'predict_proba', "
"'decision_function' or 'auto'")
error_msg = "response method {} is not defined in {}"
if response_method != "auto":
prediction_method = getattr(estimator, response_method, None)
if prediction_method is None:
raise ValueError(error_msg.format(response_method,
estimator.__class__.__name__))
else:
predict_proba = getattr(estimator, 'predict_proba', None)
decision_function = getattr(estimator, 'decision_function', None)
prediction_method = predict_proba or decision_function
if prediction_method is None:
raise ValueError(error_msg.format(
"decision_function or predict_proba",
estimator.__class__.__name__))
return prediction_method
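
A short sketch of the resolution logic above (the module path ``sklearn.metrics._plot.base`` is an assumption, inferred from the ``from .base import ...`` lines in the plotting files of this commit): with 'auto', ``predict_proba`` wins when it exists, otherwise ``decision_function`` is used.

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics._plot.base import _check_classifer_response_method

X, y = make_classification(random_state=0)
clf = LogisticRegression().fit(X, y)
method = _check_classifer_response_method(clf, 'auto')
assert method == clf.predict_proba  # LogisticRegression has predict_proba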

@@ -0,0 +1,233 @@
from itertools import product
import numpy as np
from .. import confusion_matrix
from ...utils import check_matplotlib_support
from ...utils.validation import _deprecate_positional_args
from ...base import is_classifier
class ConfusionMatrixDisplay:
"""Confusion Matrix visualization.
It is recommended to use :func:`~sklearn.metrics.plot_confusion_matrix` to
create a :class:`ConfusionMatrixDisplay`. All parameters are stored as
attributes.
Read more in the :ref:`User Guide <visualizations>`.
Parameters
----------
confusion_matrix : ndarray of shape (n_classes, n_classes)
Confusion matrix.
display_labels : ndarray of shape (n_classes,), default=None
Display labels for plot. If None, display labels are set from 0 to
`n_classes - 1`.
Attributes
----------
im_ : matplotlib AxesImage
Image representing the confusion matrix.
text_ : ndarray of shape (n_classes, n_classes), dtype=matplotlib Text, \
or None
Array of matplotlib Text objects. `None` if `include_values` is false.
ax_ : matplotlib Axes
Axes with confusion matrix.
figure_ : matplotlib Figure
Figure containing the confusion matrix.
"""
def __init__(self, confusion_matrix, *, display_labels=None):
self.confusion_matrix = confusion_matrix
self.display_labels = display_labels
@_deprecate_positional_args
def plot(self, *, include_values=True, cmap='viridis',
xticks_rotation='horizontal', values_format=None, ax=None):
"""Plot visualization.
Parameters
----------
include_values : bool, default=True
Includes values in confusion matrix.
cmap : str or matplotlib Colormap, default='viridis'
Colormap recognized by matplotlib.
xticks_rotation : {'vertical', 'horizontal'} or float, \
default='horizontal'
Rotation of xtick labels.
values_format : str, default=None
Format specification for values in confusion matrix. If `None`,
the format specification is 'd' or '.2g' whichever is shorter.
ax : matplotlib axes, default=None
Axes object to plot on. If `None`, a new figure and axes is
created.
Returns
-------
display : :class:`~sklearn.metrics.ConfusionMatrixDisplay`
"""
check_matplotlib_support("ConfusionMatrixDisplay.plot")
import matplotlib.pyplot as plt
if ax is None:
fig, ax = plt.subplots()
else:
fig = ax.figure
cm = self.confusion_matrix
n_classes = cm.shape[0]
self.im_ = ax.imshow(cm, interpolation='nearest', cmap=cmap)
self.text_ = None
cmap_min, cmap_max = self.im_.cmap(0), self.im_.cmap(256)
if include_values:
self.text_ = np.empty_like(cm, dtype=object)
# print text with appropriate color depending on background
thresh = (cm.max() + cm.min()) / 2.0
for i, j in product(range(n_classes), range(n_classes)):
color = cmap_max if cm[i, j] < thresh else cmap_min
if values_format is None:
text_cm = format(cm[i, j], '.2g')
if cm.dtype.kind != 'f':
text_d = format(cm[i, j], 'd')
if len(text_d) < len(text_cm):
text_cm = text_d
else:
text_cm = format(cm[i, j], values_format)
self.text_[i, j] = ax.text(
j, i, text_cm,
ha="center", va="center",
color=color)
if self.display_labels is None:
display_labels = np.arange(n_classes)
else:
display_labels = self.display_labels
fig.colorbar(self.im_, ax=ax)
ax.set(xticks=np.arange(n_classes),
yticks=np.arange(n_classes),
xticklabels=display_labels,
yticklabels=display_labels,
ylabel="True label",
xlabel="Predicted label")
ax.set_ylim((n_classes - 0.5, -0.5))
plt.setp(ax.get_xticklabels(), rotation=xticks_rotation)
self.figure_ = fig
self.ax_ = ax
return self
@_deprecate_positional_args
def plot_confusion_matrix(estimator, X, y_true, *, labels=None,
sample_weight=None, normalize=None,
display_labels=None, include_values=True,
xticks_rotation='horizontal',
values_format=None,
cmap='viridis', ax=None):
"""Plot Confusion Matrix.
Read more in the :ref:`User Guide <confusion_matrix>`.
Parameters
----------
estimator : estimator instance
Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`
in which the last estimator is a classifier.
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Input values.
y_true : array-like of shape (n_samples,)
Target values.
labels : array-like of shape (n_classes,), default=None
List of labels to index the matrix. This may be used to reorder or
select a subset of labels. If `None` is given, those that appear at
least once in `y_true` or `y_pred` are used in sorted order.
sample_weight : array-like of shape (n_samples,), default=None
Sample weights.
normalize : {'true', 'pred', 'all'}, default=None
Normalizes confusion matrix over the true (rows), predicted (columns)
conditions or all the population. If None, confusion matrix will not be
normalized.
display_labels : array-like of shape (n_classes,), default=None
Target names used for plotting. By default, `labels` will be used if
it is defined, otherwise the unique labels of `y_true` and `y_pred`
will be used.
include_values : bool, default=True
Includes values in confusion matrix.
xticks_rotation : {'vertical', 'horizontal'} or float, \
default='horizontal'
Rotation of xtick labels.
values_format : str, default=None
Format specification for values in confusion matrix. If `None`,
the format specification is 'd' or '.2g' whichever is shorter.
cmap : str or matplotlib Colormap, default='viridis'
Colormap recognized by matplotlib.
ax : matplotlib Axes, default=None
Axes object to plot on. If `None`, a new figure and axes is
created.
Returns
-------
display : :class:`~sklearn.metrics.ConfusionMatrixDisplay`
Examples
--------
>>> import matplotlib.pyplot as plt # doctest: +SKIP
>>> from sklearn.datasets import make_classification
>>> from sklearn.metrics import plot_confusion_matrix
>>> from sklearn.model_selection import train_test_split
>>> from sklearn.svm import SVC
>>> X, y = make_classification(random_state=0)
>>> X_train, X_test, y_train, y_test = train_test_split(
... X, y, random_state=0)
>>> clf = SVC(random_state=0)
>>> clf.fit(X_train, y_train)
SVC(random_state=0)
>>> plot_confusion_matrix(clf, X_test, y_test) # doctest: +SKIP
>>> plt.show() # doctest: +SKIP
"""
check_matplotlib_support("plot_confusion_matrix")
if not is_classifier(estimator):
raise ValueError("plot_confusion_matrix only supports classifiers")
y_pred = estimator.predict(X)
cm = confusion_matrix(y_true, y_pred, sample_weight=sample_weight,
labels=labels, normalize=normalize)
if display_labels is None:
if labels is None:
display_labels = estimator.classes_
else:
display_labels = labels
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
display_labels=display_labels)
return disp.plot(include_values=include_values,
cmap=cmap, ax=ax, xticks_rotation=xticks_rotation,
values_format=values_format)
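
Besides the estimator-based helper above, the display can be built directly from a precomputed matrix; a minimal sketch (matplotlib assumed available, the labels are illustrative):

import numpy as np
from sklearn.metrics import ConfusionMatrixDisplay

cm = np.array([[5, 2], [1, 7]])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['neg', 'pos'])
disp.plot(values_format='d')  # disp.figure_ and disp.ax_ hold the artists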

@@ -0,0 +1,181 @@
from .base import _check_classifer_response_method
from .. import average_precision_score
from .. import precision_recall_curve
from ...utils import check_matplotlib_support
from ...utils.validation import _deprecate_positional_args
from ...base import is_classifier
class PrecisionRecallDisplay:
"""Precision Recall visualization.
It is recommended to use :func:`~sklearn.metrics.plot_precision_recall_curve`
to create a visualizer. All parameters are stored as attributes.
Read more in the :ref:`User Guide <visualizations>`.
Parameters
----------
precision : ndarray
Precision values.
recall : ndarray
Recall values.
average_precision : float, default=None
Average precision. If None, the average precision is not shown.
estimator_name : str, default=None
Name of estimator. If None, then the estimator name is not shown.
Attributes
----------
line_ : matplotlib Artist
Precision recall curve.
ax_ : matplotlib Axes
Axes with precision recall curve.
figure_ : matplotlib Figure
Figure containing the curve.
"""
def __init__(self, precision, recall, *,
average_precision=None, estimator_name=None):
self.precision = precision
self.recall = recall
self.average_precision = average_precision
self.estimator_name = estimator_name
@_deprecate_positional_args
def plot(self, ax=None, *, name=None, **kwargs):
"""Plot visualization.
Extra keyword arguments will be passed to matplotlib's `plot`.
Parameters
----------
ax : Matplotlib Axes, default=None
Axes object to plot on. If `None`, a new figure and axes is
created.
name : str, default=None
Name of precision recall curve for labeling. If `None`, use the
name of the estimator.
**kwargs : dict
Keyword arguments to be passed to matplotlib's `plot`.
Returns
-------
display : :class:`~sklearn.metrics.PrecisionRecallDisplay`
Object that stores computed values.
"""
check_matplotlib_support("PrecisionRecallDisplay.plot")
import matplotlib.pyplot as plt
if ax is None:
fig, ax = plt.subplots()
name = self.estimator_name if name is None else name
line_kwargs = {"drawstyle": "steps-post"}
if self.average_precision is not None and name is not None:
line_kwargs["label"] = (f"{name} (AP = "
f"{self.average_precision:0.2f})")
elif self.average_precision is not None:
line_kwargs["label"] = (f"AP = "
f"{self.average_precision:0.2f}")
elif name is not None:
line_kwargs["label"] = name
line_kwargs.update(**kwargs)
self.line_, = ax.plot(self.recall, self.precision, **line_kwargs)
ax.set(xlabel="Recall", ylabel="Precision")
if "label" in line_kwargs:
ax.legend(loc='lower left')
self.ax_ = ax
self.figure_ = ax.figure
return self
@_deprecate_positional_args
def plot_precision_recall_curve(estimator, X, y, *,
sample_weight=None, response_method="auto",
name=None, ax=None, **kwargs):
"""Plot Precision Recall Curve for binary classifiers.
Extra keyword arguments will be passed to matplotlib's `plot`.
Read more in the :ref:`User Guide <precision_recall_f_measure_metrics>`.
Parameters
----------
estimator : estimator instance
Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`
in which the last estimator is a classifier.
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Input values.
y : array-like of shape (n_samples,)
Binary target values.
sample_weight : array-like of shape (n_samples,), default=None
Sample weights.
response_method : {'predict_proba', 'decision_function', 'auto'}, \
default='auto'
Specifies whether to use :term:`predict_proba` or
:term:`decision_function` as the target response. If set to 'auto',
:term:`predict_proba` is tried first and if it does not exist
:term:`decision_function` is tried next.
name : str, default=None
Name for labeling curve. If `None`, the name of the
estimator is used.
ax : matplotlib axes, default=None
Axes object to plot on. If `None`, a new figure and axes is created.
**kwargs : dict
Keyword arguments to be passed to matplotlib's `plot`.
Returns
-------
display : :class:`~sklearn.metrics.PrecisionRecallDisplay`
Object that stores computed values.
"""
check_matplotlib_support("plot_precision_recall_curve")
classification_error = ("{} should be a binary classifier".format(
estimator.__class__.__name__))
if not is_classifier(estimator):
raise ValueError(classification_error)
prediction_method = _check_classifer_response_method(estimator,
response_method)
y_pred = prediction_method(X)
if y_pred.ndim != 1:
if y_pred.shape[1] != 2:
raise ValueError(classification_error)
else:
y_pred = y_pred[:, 1]
pos_label = estimator.classes_[1]
precision, recall, _ = precision_recall_curve(y, y_pred,
pos_label=pos_label,
sample_weight=sample_weight)
average_precision = average_precision_score(y, y_pred,
pos_label=pos_label,
sample_weight=sample_weight)
name = name if name is not None else estimator.__class__.__name__
viz = PrecisionRecallDisplay(
precision=precision, recall=recall,
average_precision=average_precision, estimator_name=name
)
return viz.plot(ax=ax, name=name, **kwargs)
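
A usage sketch for ``plot_precision_recall_curve`` on synthetic data (matplotlib assumed; the curve name is illustrative):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import plot_precision_recall_curve

X, y = make_classification(n_classes=2, random_state=0)
clf = LogisticRegression().fit(X, y)
disp = plot_precision_recall_curve(clf, X, y, name='LogReg')
# disp.average_precision is the AP shown in the legend label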

@@ -0,0 +1,203 @@
from .. import auc
from .. import roc_curve
from .base import _check_classifer_response_method
from ...utils import check_matplotlib_support
from ...base import is_classifier
from ...utils.validation import _deprecate_positional_args
class RocCurveDisplay:
"""ROC Curve visualization.
It is recommended to use :func:`~sklearn.metrics.plot_roc_curve` to create a
visualizer. All parameters are stored as attributes.
Read more in the :ref:`User Guide <visualizations>`.
Parameters
----------
fpr : ndarray
False positive rate.
tpr : ndarray
True positive rate.
roc_auc : float, default=None
Area under ROC curve. If None, the roc_auc score is not shown.
estimator_name : str, default=None
Name of estimator. If None, the estimator name is not shown.
Attributes
----------
line_ : matplotlib Artist
ROC Curve.
ax_ : matplotlib Axes
Axes with ROC Curve.
figure_ : matplotlib Figure
Figure containing the curve.
Examples
--------
>>> import matplotlib.pyplot as plt # doctest: +SKIP
>>> import numpy as np
>>> from sklearn import metrics
>>> y = np.array([0, 0, 1, 1])
>>> pred = np.array([0.1, 0.4, 0.35, 0.8])
>>> fpr, tpr, thresholds = metrics.roc_curve(y, pred)
>>> roc_auc = metrics.auc(fpr, tpr)
>>> display = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc,\
estimator_name='example estimator')
>>> display.plot() # doctest: +SKIP
>>> plt.show() # doctest: +SKIP
"""
def __init__(self, *, fpr, tpr, roc_auc=None, estimator_name=None):
self.fpr = fpr
self.tpr = tpr
self.roc_auc = roc_auc
self.estimator_name = estimator_name
@_deprecate_positional_args
def plot(self, ax=None, *, name=None, **kwargs):
"""Plot visualization
Extra keyword arguments will be passed to matplotlib's ``plot``.
Parameters
----------
ax : matplotlib axes, default=None
Axes object to plot on. If `None`, a new figure and axes is
created.
name : str, default=None
Name of ROC Curve for labeling. If `None`, use the name of the
estimator.
Returns
-------
display : :class:`~sklearn.metrics.RocCurveDisplay`
Object that stores computed values.
"""
check_matplotlib_support('RocCurveDisplay.plot')
import matplotlib.pyplot as plt
if ax is None:
fig, ax = plt.subplots()
name = self.estimator_name if name is None else name
line_kwargs = {}
if self.roc_auc is not None and name is not None:
line_kwargs["label"] = f"{name} (AUC = {self.roc_auc:0.2f})"
elif self.roc_auc is not None:
line_kwargs["label"] = f"AUC = {self.roc_auc:0.2f}"
elif name is not None:
line_kwargs["label"] = name
line_kwargs.update(**kwargs)
self.line_ = ax.plot(self.fpr, self.tpr, **line_kwargs)[0]
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
if "label" in line_kwargs:
ax.legend(loc='lower right')
self.ax_ = ax
self.figure_ = ax.figure
return self
@_deprecate_positional_args
def plot_roc_curve(estimator, X, y, *, sample_weight=None,
drop_intermediate=True, response_method="auto",
name=None, ax=None, **kwargs):
"""Plot Receiver operating characteristic (ROC) curve.
Extra keyword arguments will be passed to matplotlib's `plot`.
Read more in the :ref:`User Guide <visualizations>`.
Parameters
----------
estimator : estimator instance
Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`
in which the last estimator is a classifier.
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Input values.
y : array-like of shape (n_samples,)
Target values.
sample_weight : array-like of shape (n_samples,), default=None
Sample weights.
drop_intermediate : boolean, default=True
Whether to drop some suboptimal thresholds which would not appear
on a plotted ROC curve. This is useful in order to create lighter
ROC curves.
response_method : {'predict_proba', 'decision_function', 'auto'} \
default='auto'
Specifies whether to use :term:`predict_proba` or
:term:`decision_function` as the target response. If set to 'auto',
:term:`predict_proba` is tried first and if it does not exist
:term:`decision_function` is tried next.
name : str, default=None
Name of ROC Curve for labeling. If `None`, use the name of the
estimator.
ax : matplotlib axes, default=None
Axes object to plot on. If `None`, a new figure and axes is created.
Returns
-------
display : :class:`~sklearn.metrics.RocCurveDisplay`
Object that stores computed values.
Examples
--------
>>> import matplotlib.pyplot as plt # doctest: +SKIP
>>> from sklearn import datasets, metrics, model_selection, svm
>>> X, y = datasets.make_classification(random_state=0)
>>> X_train, X_test, y_train, y_test = model_selection.train_test_split(\
X, y, random_state=0)
>>> clf = svm.SVC(random_state=0)
>>> clf.fit(X_train, y_train)
SVC(random_state=0)
>>> metrics.plot_roc_curve(clf, X_test, y_test) # doctest: +SKIP
>>> plt.show() # doctest: +SKIP
"""
check_matplotlib_support('plot_roc_curve')
classification_error = (
"{} should be a binary classifier".format(estimator.__class__.__name__)
)
if not is_classifier(estimator):
raise ValueError(classification_error)
prediction_method = _check_classifer_response_method(estimator,
response_method)
y_pred = prediction_method(X)
if y_pred.ndim != 1:
if y_pred.shape[1] != 2:
raise ValueError(classification_error)
else:
y_pred = y_pred[:, 1]
pos_label = estimator.classes_[1]
fpr, tpr, _ = roc_curve(y, y_pred, pos_label=pos_label,
sample_weight=sample_weight,
drop_intermediate=drop_intermediate)
roc_auc = auc(fpr, tpr)
name = estimator.__class__.__name__ if name is None else name
viz = RocCurveDisplay(
fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name=name
)
return viz.plot(ax=ax, name=name, **kwargs)
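
The docstring example above covers a single curve; a common follow-on pattern, sketched here with two illustrative classifiers, reuses the returned axes to overlay curves:

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import plot_roc_curve

X, y = make_classification(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf1 = LogisticRegression().fit(X_train, y_train)
clf2 = DecisionTreeClassifier().fit(X_train, y_train)
disp = plot_roc_curve(clf1, X_test, y_test)
plot_roc_curve(clf2, X_test, y_test, ax=disp.ax_)  # both curves on one Axes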

@@ -0,0 +1,299 @@
import pytest
import numpy as np
from numpy.testing import assert_allclose
from numpy.testing import assert_array_equal
from sklearn.compose import make_column_transformer
from sklearn.datasets import make_classification
from sklearn.exceptions import NotFittedError
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, SVR
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
# TODO: Remove when https://github.com/numpy/numpy/issues/14397 is resolved
pytestmark = pytest.mark.filterwarnings(
"ignore:In future, it will be an error for 'np.bool_':DeprecationWarning:"
"matplotlib.*")
@pytest.fixture(scope="module")
def n_classes():
return 5
@pytest.fixture(scope="module")
def data(n_classes):
X, y = make_classification(n_samples=100, n_informative=5,
n_classes=n_classes, random_state=0)
return X, y
@pytest.fixture(scope="module")
def fitted_clf(data):
return SVC(kernel='linear', C=0.01).fit(*data)
@pytest.fixture(scope="module")
def y_pred(data, fitted_clf):
X, _ = data
return fitted_clf.predict(X)
def test_error_on_regressor(pyplot, data):
X, y = data
est = SVR().fit(X, y)
msg = "plot_confusion_matrix only supports classifiers"
with pytest.raises(ValueError, match=msg):
plot_confusion_matrix(est, X, y)
def test_error_on_invalid_option(pyplot, fitted_clf, data):
X, y = data
msg = (r"normalize must be one of \{'true', 'pred', 'all', "
r"None\}")
with pytest.raises(ValueError, match=msg):
plot_confusion_matrix(fitted_clf, X, y, normalize='invalid')
@pytest.mark.parametrize("with_labels", [True, False])
@pytest.mark.parametrize("with_display_labels", [True, False])
def test_plot_confusion_matrix_custom_labels(pyplot, data, y_pred, fitted_clf,
n_classes, with_labels,
with_display_labels):
X, y = data
ax = pyplot.gca()
labels = [2, 1, 0, 3, 4] if with_labels else None
display_labels = ['b', 'd', 'a', 'e', 'f'] if with_display_labels else None
cm = confusion_matrix(y, y_pred, labels=labels)
disp = plot_confusion_matrix(fitted_clf, X, y,
ax=ax, display_labels=display_labels,
labels=labels)
assert_allclose(disp.confusion_matrix, cm)
if with_display_labels:
expected_display_labels = display_labels
elif with_labels:
expected_display_labels = labels
else:
expected_display_labels = list(range(n_classes))
expected_display_labels_str = [str(name)
for name in expected_display_labels]
x_ticks = [tick.get_text() for tick in disp.ax_.get_xticklabels()]
y_ticks = [tick.get_text() for tick in disp.ax_.get_yticklabels()]
assert_array_equal(disp.display_labels, expected_display_labels)
assert_array_equal(x_ticks, expected_display_labels_str)
assert_array_equal(y_ticks, expected_display_labels_str)
@pytest.mark.parametrize("normalize", ['true', 'pred', 'all', None])
@pytest.mark.parametrize("include_values", [True, False])
def test_plot_confusion_matrix(pyplot, data, y_pred, n_classes, fitted_clf,
normalize, include_values):
X, y = data
ax = pyplot.gca()
cmap = 'plasma'
cm = confusion_matrix(y, y_pred)
disp = plot_confusion_matrix(fitted_clf, X, y,
normalize=normalize,
cmap=cmap, ax=ax,
include_values=include_values)
assert disp.ax_ == ax
if normalize == 'true':
cm = cm / cm.sum(axis=1, keepdims=True)
elif normalize == 'pred':
cm = cm / cm.sum(axis=0, keepdims=True)
elif normalize == 'all':
cm = cm / cm.sum()
assert_allclose(disp.confusion_matrix, cm)
import matplotlib as mpl
assert isinstance(disp.im_, mpl.image.AxesImage)
assert disp.im_.get_cmap().name == cmap
assert isinstance(disp.ax_, pyplot.Axes)
assert isinstance(disp.figure_, pyplot.Figure)
assert disp.ax_.get_ylabel() == "True label"
assert disp.ax_.get_xlabel() == "Predicted label"
x_ticks = [tick.get_text() for tick in disp.ax_.get_xticklabels()]
y_ticks = [tick.get_text() for tick in disp.ax_.get_yticklabels()]
expected_display_labels = list(range(n_classes))
expected_display_labels_str = [str(name)
for name in expected_display_labels]
assert_array_equal(disp.display_labels, expected_display_labels)
assert_array_equal(x_ticks, expected_display_labels_str)
assert_array_equal(y_ticks, expected_display_labels_str)
image_data = disp.im_.get_array().data
assert_allclose(image_data, cm)
if include_values:
assert disp.text_.shape == (n_classes, n_classes)
fmt = '.2g'
expected_text = np.array([format(v, fmt) for v in cm.ravel(order="C")])
text_text = np.array([
t.get_text() for t in disp.text_.ravel(order="C")])
assert_array_equal(expected_text, text_text)
else:
assert disp.text_ is None
def test_confusion_matrix_display(pyplot, data, fitted_clf, y_pred, n_classes):
X, y = data
cm = confusion_matrix(y, y_pred)
disp = plot_confusion_matrix(fitted_clf, X, y, normalize=None,
include_values=True, cmap='viridis',
xticks_rotation=45.0)
assert_allclose(disp.confusion_matrix, cm)
assert disp.text_.shape == (n_classes, n_classes)
rotations = [tick.get_rotation() for tick in disp.ax_.get_xticklabels()]
assert_allclose(rotations, 45.0)
image_data = disp.im_.get_array().data
assert_allclose(image_data, cm)
disp.plot(cmap='plasma')
assert disp.im_.get_cmap().name == 'plasma'
disp.plot(include_values=False)
assert disp.text_ is None
disp.plot(xticks_rotation=90.0)
rotations = [tick.get_rotation() for tick in disp.ax_.get_xticklabels()]
assert_allclose(rotations, 90.0)
disp.plot(values_format='e')
expected_text = np.array([format(v, 'e') for v in cm.ravel(order="C")])
text_text = np.array([
t.get_text() for t in disp.text_.ravel(order="C")])
assert_array_equal(expected_text, text_text)
def test_confusion_matrix_contrast(pyplot):
# make sure text color is appropriate depending on background
cm = np.eye(2) / 2
disp = ConfusionMatrixDisplay(cm, display_labels=[0, 1])
disp.plot(cmap=pyplot.cm.gray)
# diagonal text is black
assert_allclose(disp.text_[0, 0].get_color(), [0.0, 0.0, 0.0, 1.0])
assert_allclose(disp.text_[1, 1].get_color(), [0.0, 0.0, 0.0, 1.0])
# off-diagonal text is white
assert_allclose(disp.text_[0, 1].get_color(), [1.0, 1.0, 1.0, 1.0])
assert_allclose(disp.text_[1, 0].get_color(), [1.0, 1.0, 1.0, 1.0])
disp.plot(cmap=pyplot.cm.gray_r)
# diagonal text is white
assert_allclose(disp.text_[0, 1].get_color(), [0.0, 0.0, 0.0, 1.0])
assert_allclose(disp.text_[1, 0].get_color(), [0.0, 0.0, 0.0, 1.0])
# off-diagonal text is black
assert_allclose(disp.text_[0, 0].get_color(), [1.0, 1.0, 1.0, 1.0])
assert_allclose(disp.text_[1, 1].get_color(), [1.0, 1.0, 1.0, 1.0])
# Regression test for #15920
cm = np.array([[19, 34], [32, 58]])
disp = ConfusionMatrixDisplay(cm, display_labels=[0, 1])
disp.plot(cmap=pyplot.cm.Blues)
min_color = pyplot.cm.Blues(0)
max_color = pyplot.cm.Blues(255)
assert_allclose(disp.text_[0, 0].get_color(), max_color)
assert_allclose(disp.text_[0, 1].get_color(), max_color)
assert_allclose(disp.text_[1, 0].get_color(), max_color)
assert_allclose(disp.text_[1, 1].get_color(), min_color)
@pytest.mark.parametrize(
"clf", [LogisticRegression(),
make_pipeline(StandardScaler(), LogisticRegression()),
make_pipeline(make_column_transformer((StandardScaler(), [0, 1])),
LogisticRegression())])
def test_confusion_matrix_pipeline(pyplot, clf, data, n_classes):
X, y = data
with pytest.raises(NotFittedError):
plot_confusion_matrix(clf, X, y)
clf.fit(X, y)
y_pred = clf.predict(X)
disp = plot_confusion_matrix(clf, X, y)
cm = confusion_matrix(y, y_pred)
assert_allclose(disp.confusion_matrix, cm)
assert disp.text_.shape == (n_classes, n_classes)
@pytest.mark.parametrize("values_format", ['e', 'n'])
def test_confusion_matrix_text_format(pyplot, data, y_pred, n_classes,
fitted_clf, values_format):
# Make sure plot text is formatted with 'values_format'.
X, y = data
cm = confusion_matrix(y, y_pred)
disp = plot_confusion_matrix(fitted_clf, X, y,
include_values=True,
values_format=values_format)
assert disp.text_.shape == (n_classes, n_classes)
expected_text = np.array([format(v, values_format)
for v in cm.ravel()])
text_text = np.array([
t.get_text() for t in disp.text_.ravel()])
assert_array_equal(expected_text, text_text)
def test_confusion_matrix_standard_format(pyplot):
cm = np.array([[10000000, 0], [123456, 12345678]])
plotted_text = ConfusionMatrixDisplay(
cm, display_labels=[False, True]).plot().text_
# Values should be shown as whole numbers ('d' format), except the first
# number, rendered as '1e+07', and the last, rendered as '1.2e+07', because
# for those the 'd' representation would be longer than '.2g'.
test = [t.get_text() for t in plotted_text.ravel()]
assert test == ['1e+07', '0', '123456', '1.2e+07']
cm = np.array([[0.1, 10], [100, 0.525]])
plotted_text = ConfusionMatrixDisplay(
cm, display_labels=[False, True]).plot().text_
# Values should now be formatted as '.2g', since the matrix contains a
# float; values keep at most two significant figures (e.g. 100 becomes 1e+02).
test = [t.get_text() for t in plotted_text.ravel()]
assert test == ['0.1', '10', '1e+02', '0.53']
@pytest.mark.parametrize("display_labels, expected_labels", [
(None, ["0", "1"]),
(["cat", "dog"], ["cat", "dog"]),
])
def test_default_labels(pyplot, display_labels, expected_labels):
cm = np.array([[10, 0], [12, 120]])
disp = ConfusionMatrixDisplay(cm, display_labels=display_labels).plot()
x_ticks = [tick.get_text() for tick in disp.ax_.get_xticklabels()]
y_ticks = [tick.get_text() for tick in disp.ax_.get_yticklabels()]
assert_array_equal(x_ticks, expected_labels)
assert_array_equal(y_ticks, expected_labels)
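
These tests depend on the ``pyplot`` fixture provided by scikit-learn's conftest; a sketch of running them in a checkout (the file path is an assumption, since the diff view does not show filenames):

pytest -q sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py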

@@ -0,0 +1,192 @@
import pytest
import numpy as np
from numpy.testing import assert_allclose
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import PrecisionRecallDisplay
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.datasets import make_classification
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.exceptions import NotFittedError
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer
# TODO: Remove when https://github.com/numpy/numpy/issues/14397 is resolved
pytestmark = pytest.mark.filterwarnings(
"ignore:In future, it will be an error for 'np.bool_':DeprecationWarning:"
"matplotlib.*")
def test_errors(pyplot):
X, y_multiclass = make_classification(n_classes=3, n_samples=50,
n_informative=3,
random_state=0)
y_binary = y_multiclass == 0
# Unfitted classifier
binary_clf = DecisionTreeClassifier()
with pytest.raises(NotFittedError):
plot_precision_recall_curve(binary_clf, X, y_binary)
binary_clf.fit(X, y_binary)
multi_clf = DecisionTreeClassifier().fit(X, y_multiclass)
# Fitted multiclass classifier with binary data
msg = "DecisionTreeClassifier should be a binary classifier"
with pytest.raises(ValueError, match=msg):
plot_precision_recall_curve(multi_clf, X, y_binary)
reg = DecisionTreeRegressor().fit(X, y_multiclass)
msg = "DecisionTreeRegressor should be a binary classifier"
with pytest.raises(ValueError, match=msg):
plot_precision_recall_curve(reg, X, y_binary)
@pytest.mark.parametrize(
"response_method, msg",
[("predict_proba", "response method predict_proba is not defined in "
"MyClassifier"),
("decision_function", "response method decision_function is not defined "
"in MyClassifier"),
("auto", "response method decision_function or predict_proba is not "
"defined in MyClassifier"),
("bad_method", "response_method must be 'predict_proba', "
"'decision_function' or 'auto'")])
def test_error_bad_response(pyplot, response_method, msg):
X, y = make_classification(n_classes=2, n_samples=50, random_state=0)
class MyClassifier(BaseEstimator, ClassifierMixin):
def fit(self, X, y):
self.fitted_ = True
self.classes_ = [0, 1]
return self
clf = MyClassifier().fit(X, y)
with pytest.raises(ValueError, match=msg):
plot_precision_recall_curve(clf, X, y, response_method=response_method)
@pytest.mark.parametrize("response_method",
["predict_proba", "decision_function"])
@pytest.mark.parametrize("with_sample_weight", [True, False])
def test_plot_precision_recall(pyplot, response_method, with_sample_weight):
X, y = make_classification(n_classes=2, n_samples=50, random_state=0)
lr = LogisticRegression().fit(X, y)
if with_sample_weight:
rng = np.random.RandomState(42)
sample_weight = rng.randint(0, 4, size=X.shape[0])
else:
sample_weight = None
disp = plot_precision_recall_curve(lr, X, y, alpha=0.8,
response_method=response_method,
sample_weight=sample_weight)
y_score = getattr(lr, response_method)(X)
if response_method == 'predict_proba':
y_score = y_score[:, 1]
prec, recall, _ = precision_recall_curve(y, y_score,
sample_weight=sample_weight)
avg_prec = average_precision_score(y, y_score, sample_weight=sample_weight)
assert_allclose(disp.precision, prec)
assert_allclose(disp.recall, recall)
assert disp.average_precision == pytest.approx(avg_prec)
assert disp.estimator_name == "LogisticRegression"
# cannot fail thanks to pyplot fixture
import matplotlib as mpl # noqa
assert isinstance(disp.line_, mpl.lines.Line2D)
assert disp.line_.get_alpha() == 0.8
assert isinstance(disp.ax_, mpl.axes.Axes)
assert isinstance(disp.figure_, mpl.figure.Figure)
expected_label = "LogisticRegression (AP = {:0.2f})".format(avg_prec)
assert disp.line_.get_label() == expected_label
assert disp.ax_.get_xlabel() == "Recall"
assert disp.ax_.get_ylabel() == "Precision"
# draw again with another label
disp.plot(name="MySpecialEstimator")
expected_label = "MySpecialEstimator (AP = {:0.2f})".format(avg_prec)
assert disp.line_.get_label() == expected_label
@pytest.mark.parametrize(
"clf", [make_pipeline(StandardScaler(), LogisticRegression()),
make_pipeline(make_column_transformer((StandardScaler(), [0, 1])),
LogisticRegression())])
def test_precision_recall_curve_pipeline(pyplot, clf):
X, y = make_classification(n_classes=2, n_samples=50, random_state=0)
with pytest.raises(NotFittedError):
plot_precision_recall_curve(clf, X, y)
clf.fit(X, y)
disp = plot_precision_recall_curve(clf, X, y)
assert disp.estimator_name == clf.__class__.__name__
def test_precision_recall_curve_string_labels(pyplot):
# regression test #15738
cancer = load_breast_cancer()
X = cancer.data
y = cancer.target_names[cancer.target]
lr = make_pipeline(StandardScaler(), LogisticRegression())
lr.fit(X, y)
for klass in cancer.target_names:
assert klass in lr.classes_
disp = plot_precision_recall_curve(lr, X, y)
y_pred = lr.predict_proba(X)[:, 1]
avg_prec = average_precision_score(y, y_pred,
pos_label=lr.classes_[1])
assert disp.average_precision == pytest.approx(avg_prec)
assert disp.estimator_name == lr.__class__.__name__
def test_plot_precision_recall_curve_estimator_name_multiple_calls(pyplot):
# non-regression test checking that the `name` used when calling
# `plot_roc_curve` is used as well when calling `disp.plot()`
X, y = make_classification(n_classes=2, n_samples=50, random_state=0)
clf_name = "my hand-crafted name"
clf = LogisticRegression().fit(X, y)
disp = plot_precision_recall_curve(clf, X, y, name=clf_name)
assert disp.estimator_name == clf_name
pyplot.close("all")
disp.plot()
assert clf_name in disp.line_.get_label()
pyplot.close("all")
clf_name = "another_name"
disp.plot(name=clf_name)
assert clf_name in disp.line_.get_label()
@pytest.mark.parametrize(
"average_precision, estimator_name, expected_label",
[
(0.9, None, "AP = 0.90"),
(None, "my_est", "my_est"),
(0.8, "my_est2", "my_est2 (AP = 0.80)"),
]
)
def test_default_labels(pyplot, average_precision, estimator_name,
expected_label):
prec = np.array([1, 0.5, 0])
recall = np.array([0, 0.5, 1])
disp = PrecisionRecallDisplay(prec, recall,
average_precision=average_precision,
estimator_name=estimator_name)
disp.plot()
assert disp.line_.get_label() == expected_label

@@ -0,0 +1,170 @@
import pytest
from numpy.testing import assert_allclose
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import RocCurveDisplay
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc
from sklearn.base import ClassifierMixin
from sklearn.exceptions import NotFittedError
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer
# TODO: Remove when https://github.com/numpy/numpy/issues/14397 is resolved
pytestmark = pytest.mark.filterwarnings(
"ignore:In future, it will be an error for 'np.bool_':DeprecationWarning:"
"matplotlib.*")
@pytest.fixture(scope="module")
def data():
return load_iris(return_X_y=True)
@pytest.fixture(scope="module")
def data_binary(data):
X, y = data
return X[y < 2], y[y < 2]
def test_plot_roc_curve_error_non_binary(pyplot, data):
X, y = data
clf = DecisionTreeClassifier()
clf.fit(X, y)
msg = "DecisionTreeClassifier should be a binary classifier"
with pytest.raises(ValueError, match=msg):
plot_roc_curve(clf, X, y)
@pytest.mark.parametrize(
"response_method, msg",
[("predict_proba", "response method predict_proba is not defined in "
"MyClassifier"),
("decision_function", "response method decision_function is not defined "
"in MyClassifier"),
("auto", "response method decision_function or predict_proba is not "
"defined in MyClassifier"),
("bad_method", "response_method must be 'predict_proba', "
"'decision_function' or 'auto'")])
def test_plot_roc_curve_error_no_response(pyplot, data_binary, response_method,
msg):
X, y = data_binary
class MyClassifier(ClassifierMixin):
def fit(self, X, y):
self.classes_ = [0, 1]
return self
clf = MyClassifier().fit(X, y)
with pytest.raises(ValueError, match=msg):
plot_roc_curve(clf, X, y, response_method=response_method)
@pytest.mark.parametrize("response_method",
["predict_proba", "decision_function"])
@pytest.mark.parametrize("with_sample_weight", [True, False])
@pytest.mark.parametrize("drop_intermediate", [True, False])
@pytest.mark.parametrize("with_strings", [True, False])
def test_plot_roc_curve(pyplot, response_method, data_binary,
with_sample_weight, drop_intermediate,
with_strings):
X, y = data_binary
pos_label = None
if with_strings:
y = np.array(["c", "b"])[y]
pos_label = "c"
if with_sample_weight:
rng = np.random.RandomState(42)
sample_weight = rng.randint(1, 4, size=(X.shape[0]))
else:
sample_weight = None
lr = LogisticRegression()
lr.fit(X, y)
viz = plot_roc_curve(lr, X, y, alpha=0.8, sample_weight=sample_weight,
drop_intermediate=drop_intermediate)
y_pred = getattr(lr, response_method)(X)
if y_pred.ndim == 2:
y_pred = y_pred[:, 1]
fpr, tpr, _ = roc_curve(y, y_pred, sample_weight=sample_weight,
drop_intermediate=drop_intermediate,
pos_label=pos_label)
assert_allclose(viz.roc_auc, auc(fpr, tpr))
assert_allclose(viz.fpr, fpr)
assert_allclose(viz.tpr, tpr)
assert viz.estimator_name == "LogisticRegression"
# cannot fail thanks to pyplot fixture
import matplotlib as mpl  # noqa
assert isinstance(viz.line_, mpl.lines.Line2D)
assert viz.line_.get_alpha() == 0.8
assert isinstance(viz.ax_, mpl.axes.Axes)
assert isinstance(viz.figure_, mpl.figure.Figure)
expected_label = "LogisticRegression (AUC = {:0.2f})".format(viz.roc_auc)
assert viz.line_.get_label() == expected_label
assert viz.ax_.get_ylabel() == "True Positive Rate"
assert viz.ax_.get_xlabel() == "False Positive Rate"
@pytest.mark.parametrize(
"clf", [LogisticRegression(),
make_pipeline(StandardScaler(), LogisticRegression()),
make_pipeline(make_column_transformer((StandardScaler(), [0, 1])),
LogisticRegression())])
def test_roc_curve_not_fitted_errors(pyplot, data_binary, clf):
X, y = data_binary
with pytest.raises(NotFittedError):
plot_roc_curve(clf, X, y)
clf.fit(X, y)
disp = plot_roc_curve(clf, X, y)
assert clf.__class__.__name__ in disp.line_.get_label()
assert disp.estimator_name == clf.__class__.__name__
def test_plot_roc_curve_estimator_name_multiple_calls(pyplot, data_binary):
# non-regression test checking that the `name` used when calling
# `plot_roc_curve` is used as well when calling `disp.plot()`
X, y = data_binary
clf_name = "my hand-crafted name"
clf = LogisticRegression().fit(X, y)
disp = plot_roc_curve(clf, X, y, name=clf_name)
assert disp.estimator_name == clf_name
pyplot.close("all")
disp.plot()
assert clf_name in disp.line_.get_label()
pyplot.close("all")
clf_name = "another_name"
disp.plot(name=clf_name)
assert clf_name in disp.line_.get_label()
@pytest.mark.parametrize(
"roc_auc, estimator_name, expected_label",
[
(0.9, None, "AUC = 0.90"),
(None, "my_est", "my_est"),
(0.8, "my_est2", "my_est2 (AUC = 0.80)")
]
)
def test_default_labels(pyplot, roc_auc, estimator_name,
expected_label):
fpr = np.array([0, 0.5, 1])
tpr = np.array([0, 0.5, 1])
disp = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc,
estimator_name=estimator_name).plot()
assert disp.line_.get_label() == expected_label

File diff suppressed because it is too large

@@ -0,0 +1,810 @@
"""Metrics to assess performance on regression task
Functions named as ``*_score`` return a scalar value to maximize: the higher
the better
Function named as ``*_error`` or ``*_loss`` return a scalar value to minimize:
the lower the better
"""
# Authors: Alexandre Gramfort <alexandre.gramfort@inria.fr>
# Mathieu Blondel <mathieu@mblondel.org>
# Olivier Grisel <olivier.grisel@ensta.org>
# Arnaud Joly <a.joly@ulg.ac.be>
# Jochen Wersdorfer <jochen@wersdoerfer.de>
# Lars Buitinck
# Joel Nothman <joel.nothman@gmail.com>
# Karan Desai <karandesai281196@gmail.com>
# Noel Dawe <noel@dawe.me>
# Manoj Kumar <manojkumarsivaraj334@gmail.com>
# Michael Eickenberg <michael.eickenberg@gmail.com>
# Konstantin Shmelkov <konstantin.shmelkov@polytechnique.edu>
# Christian Lorentzen <lorentzen.ch@googlemail.com>
# License: BSD 3 clause
import numpy as np
import warnings
from .._loss.glm_distribution import TweedieDistribution
from ..utils.validation import (check_array, check_consistent_length,
_num_samples)
from ..utils.validation import column_or_1d
from ..utils.validation import _deprecate_positional_args
from ..exceptions import UndefinedMetricWarning
__all__ = [
"max_error",
"mean_absolute_error",
"mean_squared_error",
"mean_squared_log_error",
"median_absolute_error",
"r2_score",
"explained_variance_score",
"mean_tweedie_deviance",
"mean_poisson_deviance",
"mean_gamma_deviance",
]
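# Comment-only note on the convention above: for model selection, a loss can
# be wrapped so that "greater is better" holds uniformly, e.g.
#     make_scorer(mean_squared_error, greater_is_better=False)
# which negates the loss (make_scorer is part of sklearn.metrics).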
def _check_reg_targets(y_true, y_pred, multioutput, dtype="numeric"):
"""Check that y_true and y_pred belong to the same regression task
Parameters
----------
y_true : array-like
y_pred : array-like
multioutput : array-like or string in ['raw_values', 'uniform_average',
'variance_weighted'] or None
None is accepted due to backward compatibility of r2_score().
Returns
-------
type_true : one of {'continuous', 'continuous-multioutput'}
The type of the true target data, as output by
'utils.multiclass.type_of_target'
y_true : array-like of shape (n_samples, n_outputs)
Ground truth (correct) target values.
y_pred : array-like of shape (n_samples, n_outputs)
Estimated target values.
multioutput : array-like of shape (n_outputs) or string in ['raw_values',
'uniform_average', 'variance_weighted'] or None
Custom output weights if ``multioutput`` is array-like or
just the corresponding argument if ``multioutput`` is a
correct keyword.
dtype : str or list, default="numeric"
The dtype argument passed to check_array.
"""
check_consistent_length(y_true, y_pred)
y_true = check_array(y_true, ensure_2d=False, dtype=dtype)
y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)
if y_true.ndim == 1:
y_true = y_true.reshape((-1, 1))
if y_pred.ndim == 1:
y_pred = y_pred.reshape((-1, 1))
if y_true.shape[1] != y_pred.shape[1]:
raise ValueError("y_true and y_pred have different number of outputs "
"({0}!={1})".format(y_true.shape[1], y_pred.shape[1]))
n_outputs = y_true.shape[1]
allowed_multioutput_str = ('raw_values', 'uniform_average',
'variance_weighted')
if isinstance(multioutput, str):
if multioutput not in allowed_multioutput_str:
raise ValueError("Allowed 'multioutput' string values are {}. "
"You provided multioutput={!r}".format(
allowed_multioutput_str,
multioutput))
elif multioutput is not None:
multioutput = check_array(multioutput, ensure_2d=False)
if n_outputs == 1:
raise ValueError("Custom weights are useful only in "
"multi-output cases.")
elif n_outputs != len(multioutput):
raise ValueError(("There must be equally many custom weights "
"(%d) as outputs (%d).") %
(len(multioutput), n_outputs))
y_type = 'continuous' if n_outputs == 1 else 'continuous-multioutput'
return y_type, y_true, y_pred, multioutput
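# Comment-only sketch: with y_true=[3, -0.5], y_pred=[2.5, 0.0] and
# multioutput='uniform_average', the helper reshapes both arrays to (2, 1),
# passes the string option through unchanged, and returns
# y_type='continuous'; an array-like multioutput is instead validated to
# have one weight per output.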
@_deprecate_positional_args
def mean_absolute_error(y_true, y_pred, *,
sample_weight=None,
multioutput='uniform_average'):
"""Mean absolute error regression loss
Read more in the :ref:`User Guide <mean_absolute_error>`.
Parameters
----------
y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)
Ground truth (correct) target values.
y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)
Estimated target values.
sample_weight : array-like of shape (n_samples,), optional
Sample weights.
multioutput : string in ['raw_values', 'uniform_average'] \
or array-like of shape (n_outputs)
Defines aggregating of multiple output values.
Array-like value defines weights used to average errors.
'raw_values' :
Returns a full set of errors in case of multioutput input.
'uniform_average' :
Errors of all outputs are averaged with uniform weight.
Returns
-------
loss : float or ndarray of floats
If multioutput is 'raw_values', then mean absolute error is returned
for each output separately.
If multioutput is 'uniform_average' or an ndarray of weights, then the
weighted average of all output errors is returned.
MAE output is non-negative floating point. The best value is 0.0.
Examples
--------
>>> from sklearn.metrics import mean_absolute_error
>>> y_true = [3, -0.5, 2, 7]
>>> y_pred = [2.5, 0.0, 2, 8]
>>> mean_absolute_error(y_true, y_pred)
0.5
>>> y_true = [[0.5, 1], [-1, 1], [7, -6]]
>>> y_pred = [[0, 2], [-1, 2], [8, -5]]
>>> mean_absolute_error(y_true, y_pred)
0.75
>>> mean_absolute_error(y_true, y_pred, multioutput='raw_values')
array([0.5, 1. ])
>>> mean_absolute_error(y_true, y_pred, multioutput=[0.3, 0.7])
0.85...
"""
y_type, y_true, y_pred, multioutput = _check_reg_targets(
y_true, y_pred, multioutput)
check_consistent_length(y_true, y_pred, sample_weight)
output_errors = np.average(np.abs(y_pred - y_true),
weights=sample_weight, axis=0)
if isinstance(multioutput, str):
if multioutput == 'raw_values':
return output_errors
elif multioutput == 'uniform_average':
# pass None as weights to np.average: uniform mean
multioutput = None
return np.average(output_errors, weights=multioutput)
@_deprecate_positional_args
def mean_squared_error(y_true, y_pred, *,
sample_weight=None,
multioutput='uniform_average', squared=True):
"""Mean squared error regression loss
Read more in the :ref:`User Guide <mean_squared_error>`.
Parameters
----------
y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)
Ground truth (correct) target values.
y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)
Estimated target values.
sample_weight : array-like of shape (n_samples,), optional
Sample weights.
multioutput : string in ['raw_values', 'uniform_average'] \
or array-like of shape (n_outputs)
Defines aggregating of multiple output values.
Array-like value defines weights used to average errors.
'raw_values' :
Returns a full set of errors in case of multioutput input.
'uniform_average' :
Errors of all outputs are averaged with uniform weight.
squared : boolean value, optional (default = True)
If True returns MSE value, if False returns RMSE value.
Returns
-------
loss : float or ndarray of floats
A non-negative floating point value (the best value is 0.0), or an
array of floating point values, one for each individual target.
Examples
--------
>>> from sklearn.metrics import mean_squared_error
>>> y_true = [3, -0.5, 2, 7]
>>> y_pred = [2.5, 0.0, 2, 8]
>>> mean_squared_error(y_true, y_pred)
0.375
>>> y_true = [3, -0.5, 2, 7]
>>> y_pred = [2.5, 0.0, 2, 8]
>>> mean_squared_error(y_true, y_pred, squared=False)
0.612...
>>> y_true = [[0.5, 1],[-1, 1],[7, -6]]
>>> y_pred = [[0, 2],[-1, 2],[8, -5]]
>>> mean_squared_error(y_true, y_pred)
0.708...
>>> mean_squared_error(y_true, y_pred, squared=False)
0.822...
>>> mean_squared_error(y_true, y_pred, multioutput='raw_values')
array([0.41666667, 1. ])
>>> mean_squared_error(y_true, y_pred, multioutput=[0.3, 0.7])
0.825...
"""
y_type, y_true, y_pred, multioutput = _check_reg_targets(
y_true, y_pred, multioutput)
check_consistent_length(y_true, y_pred, sample_weight)
output_errors = np.average((y_true - y_pred) ** 2, axis=0,
weights=sample_weight)
if not squared:
output_errors = np.sqrt(output_errors)
if isinstance(multioutput, str):
if multioutput == 'raw_values':
return output_errors
elif multioutput == 'uniform_average':
# pass None as weights to np.average: uniform mean
multioutput = None
return np.average(output_errors, weights=multioutput)
@_deprecate_positional_args
def mean_squared_log_error(y_true, y_pred, *,
sample_weight=None,
multioutput='uniform_average'):
"""Mean squared logarithmic error regression loss
Read more in the :ref:`User Guide <mean_squared_log_error>`.
Parameters
----------
y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)
Ground truth (correct) target values.
y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)
Estimated target values.
sample_weight : array-like of shape (n_samples,), optional
Sample weights.
multioutput : string in ['raw_values', 'uniform_average'] \
or array-like of shape (n_outputs)
Defines aggregating of multiple output values.
Array-like value defines weights used to average errors.
'raw_values' :
Returns a full set of errors when the input is of multioutput
format.
'uniform_average' :
Errors of all outputs are averaged with uniform weight.
Returns
-------
loss : float or ndarray of floats
A non-negative floating point value (the best value is 0.0), or an
array of floating point values, one for each individual target.
Examples
--------
>>> from sklearn.metrics import mean_squared_log_error
>>> y_true = [3, 5, 2.5, 7]
>>> y_pred = [2.5, 5, 4, 8]
>>> mean_squared_log_error(y_true, y_pred)
0.039...
>>> y_true = [[0.5, 1], [1, 2], [7, 6]]
>>> y_pred = [[0.5, 2], [1, 2.5], [8, 8]]
>>> mean_squared_log_error(y_true, y_pred)
0.044...
>>> mean_squared_log_error(y_true, y_pred, multioutput='raw_values')
array([0.00462428, 0.08377444])
>>> mean_squared_log_error(y_true, y_pred, multioutput=[0.3, 0.7])
0.060...
"""
y_type, y_true, y_pred, multioutput = _check_reg_targets(
y_true, y_pred, multioutput)
check_consistent_length(y_true, y_pred, sample_weight)
if (y_true < 0).any() or (y_pred < 0).any():
raise ValueError("Mean Squared Logarithmic Error cannot be used when "
"targets contain negative values.")
return mean_squared_error(np.log1p(y_true), np.log1p(y_pred),
sample_weight=sample_weight,
multioutput=multioutput)
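# As the implementation above shows, MSLE is simply the MSE computed on
# log1p-transformed targets, so the two calls below should agree:
#
#   >>> mean_squared_log_error([3, 5, 2.5, 7], [2.5, 5, 4, 8])
#   0.039...
#   >>> mean_squared_error(np.log1p([3, 5, 2.5, 7]), np.log1p([2.5, 5, 4, 8]))
#   0.039...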
@_deprecate_positional_args
def median_absolute_error(y_true, y_pred, *, multioutput='uniform_average'):
"""Median absolute error regression loss
Median absolute error output is non-negative floating point. The best value
is 0.0. Read more in the :ref:`User Guide <median_absolute_error>`.
Parameters
----------
    y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)
        Ground truth (correct) target values.
    y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)
        Estimated target values.
multioutput : {'raw_values', 'uniform_average'} or array-like of shape \
(n_outputs,)
Defines aggregating of multiple output values. Array-like value defines
weights used to average errors.
'raw_values' :
Returns a full set of errors in case of multioutput input.
'uniform_average' :
Errors of all outputs are averaged with uniform weight.
Returns
-------
loss : float or ndarray of floats
        If multioutput is 'raw_values', then median absolute error is
        returned for each output separately.
If multioutput is 'uniform_average' or an ndarray of weights, then the
weighted average of all output errors is returned.
Examples
--------
>>> from sklearn.metrics import median_absolute_error
>>> y_true = [3, -0.5, 2, 7]
>>> y_pred = [2.5, 0.0, 2, 8]
>>> median_absolute_error(y_true, y_pred)
0.5
>>> y_true = [[0.5, 1], [-1, 1], [7, -6]]
>>> y_pred = [[0, 2], [-1, 2], [8, -5]]
>>> median_absolute_error(y_true, y_pred)
0.75
>>> median_absolute_error(y_true, y_pred, multioutput='raw_values')
array([0.5, 1. ])
>>> median_absolute_error(y_true, y_pred, multioutput=[0.3, 0.7])
0.85
"""
y_type, y_true, y_pred, multioutput = _check_reg_targets(
y_true, y_pred, multioutput)
output_errors = np.median(np.abs(y_pred - y_true), axis=0)
if isinstance(multioutput, str):
if multioutput == 'raw_values':
return output_errors
elif multioutput == 'uniform_average':
# pass None as weights to np.average: uniform mean
multioutput = None
return np.average(output_errors, weights=multioutput)
@_deprecate_positional_args
def explained_variance_score(y_true, y_pred, *,
sample_weight=None,
multioutput='uniform_average'):
"""Explained variance regression score function
Best possible score is 1.0, lower values are worse.
Read more in the :ref:`User Guide <explained_variance_score>`.
Parameters
----------
y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)
Ground truth (correct) target values.
y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)
Estimated target values.
sample_weight : array-like of shape (n_samples,), optional
Sample weights.
multioutput : string in ['raw_values', 'uniform_average', \
'variance_weighted'] or array-like of shape (n_outputs)
Defines aggregating of multiple output scores.
Array-like value defines weights used to average scores.
'raw_values' :
Returns a full set of scores in case of multioutput input.
'uniform_average' :
Scores of all outputs are averaged with uniform weight.
'variance_weighted' :
Scores of all outputs are averaged, weighted by the variances
of each individual output.
Returns
-------
score : float or ndarray of floats
The explained variance or ndarray if 'multioutput' is 'raw_values'.
Notes
-----
This is not a symmetric function.
Examples
--------
>>> from sklearn.metrics import explained_variance_score
>>> y_true = [3, -0.5, 2, 7]
>>> y_pred = [2.5, 0.0, 2, 8]
>>> explained_variance_score(y_true, y_pred)
0.957...
>>> y_true = [[0.5, 1], [-1, 1], [7, -6]]
>>> y_pred = [[0, 2], [-1, 2], [8, -5]]
>>> explained_variance_score(y_true, y_pred, multioutput='uniform_average')
0.983...
"""
y_type, y_true, y_pred, multioutput = _check_reg_targets(
y_true, y_pred, multioutput)
check_consistent_length(y_true, y_pred, sample_weight)
y_diff_avg = np.average(y_true - y_pred, weights=sample_weight, axis=0)
numerator = np.average((y_true - y_pred - y_diff_avg) ** 2,
weights=sample_weight, axis=0)
y_true_avg = np.average(y_true, weights=sample_weight, axis=0)
denominator = np.average((y_true - y_true_avg) ** 2,
weights=sample_weight, axis=0)
nonzero_numerator = numerator != 0
nonzero_denominator = denominator != 0
valid_score = nonzero_numerator & nonzero_denominator
output_scores = np.ones(y_true.shape[1])
output_scores[valid_score] = 1 - (numerator[valid_score] /
denominator[valid_score])
output_scores[nonzero_numerator & ~nonzero_denominator] = 0.
if isinstance(multioutput, str):
if multioutput == 'raw_values':
# return scores individually
return output_scores
elif multioutput == 'uniform_average':
            # passing None as weights to np.average() results in a uniform mean
avg_weights = None
elif multioutput == 'variance_weighted':
avg_weights = denominator
else:
avg_weights = multioutput
return np.average(output_scores, weights=avg_weights)
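# Unlike R^2 (see r2_score below), explained variance ignores a constant
# offset in the predictions: a pure bias leaves the residuals with zero
# variance, so the score stays perfect.
#
#   >>> explained_variance_score([1, 2, 3], [2, 3, 4])
#   1.0
#   >>> r2_score([1, 2, 3], [2, 3, 4])
#   -0.5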
@_deprecate_positional_args
def r2_score(y_true, y_pred, *, sample_weight=None,
multioutput="uniform_average"):
"""R^2 (coefficient of determination) regression score function.
Best possible score is 1.0 and it can be negative (because the
model can be arbitrarily worse). A constant model that always
predicts the expected value of y, disregarding the input features,
would get a R^2 score of 0.0.
Read more in the :ref:`User Guide <r2_score>`.
Parameters
----------
y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)
Ground truth (correct) target values.
y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)
Estimated target values.
sample_weight : array-like of shape (n_samples,), optional
Sample weights.
multioutput : string in ['raw_values', 'uniform_average', \
'variance_weighted'] or None or array-like of shape (n_outputs)
Defines aggregating of multiple output scores.
Array-like value defines weights used to average scores.
Default is "uniform_average".
'raw_values' :
Returns a full set of scores in case of multioutput input.
'uniform_average' :
Scores of all outputs are averaged with uniform weight.
'variance_weighted' :
Scores of all outputs are averaged, weighted by the variances
of each individual output.
.. versionchanged:: 0.19
Default value of multioutput is 'uniform_average'.
Returns
-------
z : float or ndarray of floats
The R^2 score or ndarray of scores if 'multioutput' is
'raw_values'.
Notes
-----
This is not a symmetric function.
Unlike most other scores, R^2 score may be negative (it need not actually
be the square of a quantity R).
This metric is not well-defined for single samples and will return a NaN
value if n_samples is less than two.
References
----------
.. [1] `Wikipedia entry on the Coefficient of determination
<https://en.wikipedia.org/wiki/Coefficient_of_determination>`_
Examples
--------
>>> from sklearn.metrics import r2_score
>>> y_true = [3, -0.5, 2, 7]
>>> y_pred = [2.5, 0.0, 2, 8]
>>> r2_score(y_true, y_pred)
0.948...
>>> y_true = [[0.5, 1], [-1, 1], [7, -6]]
>>> y_pred = [[0, 2], [-1, 2], [8, -5]]
>>> r2_score(y_true, y_pred,
... multioutput='variance_weighted')
0.938...
>>> y_true = [1, 2, 3]
>>> y_pred = [1, 2, 3]
>>> r2_score(y_true, y_pred)
1.0
>>> y_true = [1, 2, 3]
>>> y_pred = [2, 2, 2]
>>> r2_score(y_true, y_pred)
0.0
>>> y_true = [1, 2, 3]
>>> y_pred = [3, 2, 1]
>>> r2_score(y_true, y_pred)
-3.0
"""
y_type, y_true, y_pred, multioutput = _check_reg_targets(
y_true, y_pred, multioutput)
check_consistent_length(y_true, y_pred, sample_weight)
if _num_samples(y_pred) < 2:
msg = "R^2 score is not well-defined with less than two samples."
warnings.warn(msg, UndefinedMetricWarning)
return float('nan')
if sample_weight is not None:
sample_weight = column_or_1d(sample_weight)
weight = sample_weight[:, np.newaxis]
else:
weight = 1.
numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0,
dtype=np.float64)
denominator = (weight * (y_true - np.average(
y_true, axis=0, weights=sample_weight)) ** 2).sum(axis=0,
dtype=np.float64)
nonzero_denominator = denominator != 0
nonzero_numerator = numerator != 0
valid_score = nonzero_denominator & nonzero_numerator
output_scores = np.ones([y_true.shape[1]])
output_scores[valid_score] = 1 - (numerator[valid_score] /
denominator[valid_score])
    # arbitrarily set to zero to avoid -inf scores; a constant
    # y_true is not interesting for scoring a regression anyway
output_scores[nonzero_numerator & ~nonzero_denominator] = 0.
if isinstance(multioutput, str):
if multioutput == 'raw_values':
# return scores individually
return output_scores
elif multioutput == 'uniform_average':
            # passing None as weights results in a uniform mean
avg_weights = None
elif multioutput == 'variance_weighted':
avg_weights = denominator
# avoid fail on constant y or one-element arrays
if not np.any(nonzero_denominator):
if not np.any(nonzero_numerator):
return 1.0
else:
return 0.0
else:
avg_weights = multioutput
return np.average(output_scores, weights=avg_weights)
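# Per-output scores for the multioutput example in the docstring; the
# 'variance_weighted' value of 0.938... is the denominator-weighted average
# of these two raw scores.
#
#   >>> y_true = [[0.5, 1], [-1, 1], [7, -6]]
#   >>> y_pred = [[0, 2], [-1, 2], [8, -5]]
#   >>> r2_score(y_true, y_pred, multioutput='raw_values')
#   array([0.965..., 0.908...])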
def max_error(y_true, y_pred):
"""
max_error metric calculates the maximum residual error.
Read more in the :ref:`User Guide <max_error>`.
Parameters
----------
y_true : array-like of shape (n_samples,)
Ground truth (correct) target values.
y_pred : array-like of shape (n_samples,)
Estimated target values.
Returns
-------
max_error : float
A positive floating point value (the best value is 0.0).
Examples
--------
>>> from sklearn.metrics import max_error
>>> y_true = [3, 2, 7, 1]
>>> y_pred = [4, 2, 7, 1]
>>> max_error(y_true, y_pred)
1
"""
y_type, y_true, y_pred, _ = _check_reg_targets(y_true, y_pred, None)
if y_type == 'continuous-multioutput':
raise ValueError("Multioutput not supported in max_error")
return np.max(np.abs(y_true - y_pred))
@_deprecate_positional_args
def mean_tweedie_deviance(y_true, y_pred, *, sample_weight=None, power=0):
"""Mean Tweedie deviance regression loss.
Read more in the :ref:`User Guide <mean_tweedie_deviance>`.
Parameters
----------
y_true : array-like of shape (n_samples,)
Ground truth (correct) target values.
y_pred : array-like of shape (n_samples,)
Estimated target values.
sample_weight : array-like of shape (n_samples,), default=None
Sample weights.
power : float, default=0
Tweedie power parameter. Either power <= 0 or power >= 1.
        The higher `power`, the less weight is given to extreme
deviations between true and predicted targets.
- power < 0: Extreme stable distribution. Requires: y_pred > 0.
- power = 0 : Normal distribution, output corresponds to
mean_squared_error. y_true and y_pred can be any real numbers.
- power = 1 : Poisson distribution. Requires: y_true >= 0 and
y_pred > 0.
- 1 < p < 2 : Compound Poisson distribution. Requires: y_true >= 0
and y_pred > 0.
- power = 2 : Gamma distribution. Requires: y_true > 0 and y_pred > 0.
- power = 3 : Inverse Gaussian distribution. Requires: y_true > 0
and y_pred > 0.
- otherwise : Positive stable distribution. Requires: y_true > 0
and y_pred > 0.
Returns
-------
loss : float
A non-negative floating point value (the best value is 0.0).
Examples
--------
>>> from sklearn.metrics import mean_tweedie_deviance
>>> y_true = [2, 0, 1, 4]
>>> y_pred = [0.5, 0.5, 2., 2.]
>>> mean_tweedie_deviance(y_true, y_pred, power=1)
1.4260...
"""
y_type, y_true, y_pred, _ = _check_reg_targets(
y_true, y_pred, None, dtype=[np.float64, np.float32])
if y_type == 'continuous-multioutput':
raise ValueError("Multioutput not supported in mean_tweedie_deviance")
check_consistent_length(y_true, y_pred, sample_weight)
if sample_weight is not None:
sample_weight = column_or_1d(sample_weight)
sample_weight = sample_weight[:, np.newaxis]
dist = TweedieDistribution(power=power)
dev = dist.unit_deviance(y_true, y_pred, check_input=True)
return np.average(dev, weights=sample_weight)
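# As the docstring above notes, power=0 reduces the Tweedie deviance to the
# squared error, so it should match mean_squared_error on the same data:
#
#   >>> mean_tweedie_deviance([2, 0, 1, 4], [0.5, 0.5, 2., 2.], power=0)
#   1.875
#   >>> mean_squared_error([2, 0, 1, 4], [0.5, 0.5, 2., 2.])
#   1.875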
@_deprecate_positional_args
def mean_poisson_deviance(y_true, y_pred, *, sample_weight=None):
"""Mean Poisson deviance regression loss.
Poisson deviance is equivalent to the Tweedie deviance with
the power parameter `power=1`.
Read more in the :ref:`User Guide <mean_tweedie_deviance>`.
Parameters
----------
y_true : array-like of shape (n_samples,)
Ground truth (correct) target values. Requires y_true >= 0.
y_pred : array-like of shape (n_samples,)
Estimated target values. Requires y_pred > 0.
sample_weight : array-like of shape (n_samples,), default=None
Sample weights.
Returns
-------
loss : float
A non-negative floating point value (the best value is 0.0).
Examples
--------
>>> from sklearn.metrics import mean_poisson_deviance
>>> y_true = [2, 0, 1, 4]
>>> y_pred = [0.5, 0.5, 2., 2.]
>>> mean_poisson_deviance(y_true, y_pred)
1.4260...
"""
return mean_tweedie_deviance(
y_true, y_pred, sample_weight=sample_weight, power=1
)
@_deprecate_positional_args
def mean_gamma_deviance(y_true, y_pred, *, sample_weight=None):
"""Mean Gamma deviance regression loss.
Gamma deviance is equivalent to the Tweedie deviance with
the power parameter `power=2`. It is invariant to scaling of
the target variable, and measures relative errors.
Read more in the :ref:`User Guide <mean_tweedie_deviance>`.
Parameters
----------
y_true : array-like of shape (n_samples,)
Ground truth (correct) target values. Requires y_true > 0.
y_pred : array-like of shape (n_samples,)
Estimated target values. Requires y_pred > 0.
sample_weight : array-like of shape (n_samples,), default=None
Sample weights.
Returns
-------
loss : float
A non-negative floating point value (the best value is 0.0).
Examples
--------
>>> from sklearn.metrics import mean_gamma_deviance
>>> y_true = [2, 0.5, 1, 4]
>>> y_pred = [0.5, 0.5, 2., 2.]
>>> mean_gamma_deviance(y_true, y_pred)
1.0568...
"""
return mean_tweedie_deviance(
y_true, y_pred, sample_weight=sample_weight, power=2
)
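# The Gamma deviance is scale invariant (see the docstring above): rescaling
# both targets by the same positive factor should leave the loss unchanged.
#
#   >>> y_true = [2, 0.5, 1, 4]
#   >>> y_pred = [0.5, 0.5, 2., 2.]
#   >>> mean_gamma_deviance(y_true, y_pred)
#   1.0568...
#   >>> mean_gamma_deviance([10 * y for y in y_true], [10 * y for y in y_pred])
#   1.0568...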

View file

@@ -0,0 +1,725 @@
"""
The :mod:`sklearn.metrics.scorer` submodule implements a flexible
interface for model selection and evaluation using
arbitrary score functions.
A scorer object is a callable that can be passed to
:class:`sklearn.model_selection.GridSearchCV` or
:func:`sklearn.model_selection.cross_val_score` as the ``scoring``
parameter, to specify how a model should be evaluated.
The signature of the call is ``(estimator, X, y)`` where ``estimator``
is the model to be evaluated, ``X`` is the test data and ``y`` is the
ground truth labeling (or ``None`` in the case of unsupervised models).
"""
# Authors: Andreas Mueller <amueller@ais.uni-bonn.de>
# Lars Buitinck
# Arnaud Joly <arnaud.v.joly@gmail.com>
# License: Simplified BSD
from collections.abc import Iterable
from functools import partial
from collections import Counter
import warnings
import numpy as np
from . import (r2_score, median_absolute_error, max_error, mean_absolute_error,
mean_squared_error, mean_squared_log_error,
mean_poisson_deviance, mean_gamma_deviance, accuracy_score,
f1_score, roc_auc_score, average_precision_score,
precision_score, recall_score, log_loss,
balanced_accuracy_score, explained_variance_score,
brier_score_loss, jaccard_score)
from .cluster import adjusted_rand_score
from .cluster import homogeneity_score
from .cluster import completeness_score
from .cluster import v_measure_score
from .cluster import mutual_info_score
from .cluster import adjusted_mutual_info_score
from .cluster import normalized_mutual_info_score
from .cluster import fowlkes_mallows_score
from ..utils.multiclass import type_of_target
from ..utils.validation import _deprecate_positional_args
from ..base import is_regressor
def _cached_call(cache, estimator, method, *args, **kwargs):
"""Call estimator with method and args and kwargs."""
if cache is None:
return getattr(estimator, method)(*args, **kwargs)
try:
return cache[method]
except KeyError:
result = getattr(estimator, method)(*args, **kwargs)
cache[method] = result
return result
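# Sketch of the caching contract ('estimator' and 'X' stand for any fitted
# estimator and test data): with a shared dict, the second request for the
# same method name returns the stored result instead of calling the
# estimator again. Note the cache is keyed on the method name only.
#
#   >>> cache = {}
#   >>> first = _cached_call(cache, estimator, "predict", X)   # computes
#   >>> second = _cached_call(cache, estimator, "predict", X)  # cache hit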
class _MultimetricScorer:
"""Callable for multimetric scoring used to avoid repeated calls
to `predict_proba`, `predict`, and `decision_function`.
`_MultimetricScorer` will return a dictionary of scores corresponding to
the scorers in the dictionary. Note that `_MultimetricScorer` can be
created with a dictionary with one key (i.e. only one actual scorer).
Parameters
----------
scorers : dict
Dictionary mapping names to callable scorers.
"""
def __init__(self, **scorers):
self._scorers = scorers
def __call__(self, estimator, *args, **kwargs):
"""Evaluate predicted target values."""
scores = {}
cache = {} if self._use_cache(estimator) else None
cached_call = partial(_cached_call, cache)
for name, scorer in self._scorers.items():
if isinstance(scorer, _BaseScorer):
score = scorer._score(cached_call, estimator,
*args, **kwargs)
else:
score = scorer(estimator, *args, **kwargs)
scores[name] = score
return scores
def _use_cache(self, estimator):
"""Return True if using a cache is beneficial.
Caching may be beneficial when one of these conditions holds:
- `_ProbaScorer` will be called twice.
- `_PredictScorer` will be called twice.
- `_ThresholdScorer` will be called twice.
- `_ThresholdScorer` and `_PredictScorer` are called and
estimator is a regressor.
- `_ThresholdScorer` and `_ProbaScorer` are called and
estimator does not have a `decision_function` attribute.
"""
if len(self._scorers) == 1: # Only one scorer
return False
counter = Counter([type(v) for v in self._scorers.values()])
if any(counter[known_type] > 1 for known_type in
[_PredictScorer, _ProbaScorer, _ThresholdScorer]):
return True
if counter[_ThresholdScorer]:
if is_regressor(estimator) and counter[_PredictScorer]:
return True
elif (counter[_ProbaScorer] and
not hasattr(estimator, "decision_function")):
return True
return False
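# For example, two _PredictScorer instances both need estimator.predict, so
# caching pays off and predict runs only once per call ('some_regressor' is
# a placeholder for any fitted regressor):
#
#   >>> scorers = {'mae': SCORERS['neg_mean_absolute_error'],
#   ...            'mse': SCORERS['neg_mean_squared_error']}
#   >>> _MultimetricScorer(**scorers)._use_cache(some_regressor)
#   True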
class _BaseScorer:
def __init__(self, score_func, sign, kwargs):
self._kwargs = kwargs
self._score_func = score_func
self._sign = sign
# XXX After removing the deprecated scorers (v0.24) remove the
# XXX deprecation_msg property again and remove __call__'s body again
self._deprecation_msg = None
def __repr__(self):
kwargs_string = "".join([", %s=%s" % (str(k), str(v))
for k, v in self._kwargs.items()])
return ("make_scorer(%s%s%s%s)"
% (self._score_func.__name__,
"" if self._sign > 0 else ", greater_is_better=False",
self._factory_args(), kwargs_string))
def __call__(self, estimator, X, y_true, sample_weight=None):
"""Evaluate predicted target values for X relative to y_true.
Parameters
----------
estimator : object
            Trained estimator to use for scoring. The prediction method used
            (predict, predict_proba or decision_function) depends on the
            scorer type.
X : array-like or sparse matrix
Test data that will be fed to estimator.predict.
y_true : array-like
Gold standard target values for X.
sample_weight : array-like, optional (default=None)
Sample weights.
Returns
-------
score : float
Score function applied to prediction of estimator on X.
"""
if self._deprecation_msg is not None:
warnings.warn(self._deprecation_msg,
category=FutureWarning,
stacklevel=2)
return self._score(partial(_cached_call, None), estimator, X, y_true,
sample_weight=sample_weight)
def _factory_args(self):
"""Return non-default make_scorer arguments for repr."""
return ""
class _PredictScorer(_BaseScorer):
def _score(self, method_caller, estimator, X, y_true, sample_weight=None):
"""Evaluate predicted target values for X relative to y_true.
Parameters
----------
method_caller : callable
Returns predictions given an estimator, method name, and other
arguments, potentially caching results.
estimator : object
            Trained estimator to use for scoring. Must have a predict
            method; the output of that is used to compute the score.
X : array-like or sparse matrix
Test data that will be fed to estimator.predict.
y_true : array-like
Gold standard target values for X.
sample_weight : array-like, optional (default=None)
Sample weights.
Returns
-------
score : float
Score function applied to prediction of estimator on X.
"""
y_pred = method_caller(estimator, "predict", X)
if sample_weight is not None:
return self._sign * self._score_func(y_true, y_pred,
sample_weight=sample_weight,
**self._kwargs)
else:
return self._sign * self._score_func(y_true, y_pred,
**self._kwargs)
class _ProbaScorer(_BaseScorer):
def _score(self, method_caller, clf, X, y, sample_weight=None):
"""Evaluate predicted probabilities for X relative to y_true.
Parameters
----------
method_caller : callable
Returns predictions given an estimator, method name, and other
arguments, potentially caching results.
clf : object
Trained classifier to use for scoring. Must have a predict_proba
method; the output of that is used to compute the score.
X : array-like or sparse matrix
Test data that will be fed to clf.predict_proba.
y : array-like
Gold standard target values for X. These must be class labels,
not probabilities.
sample_weight : array-like, optional (default=None)
Sample weights.
Returns
-------
score : float
Score function applied to prediction of estimator on X.
"""
y_type = type_of_target(y)
y_pred = method_caller(clf, "predict_proba", X)
if y_type == "binary":
if y_pred.shape[1] == 2:
y_pred = y_pred[:, 1]
elif y_pred.shape[1] == 1: # not multiclass
raise ValueError('got predict_proba of shape {},'
' but need classifier with two'
' classes for {} scoring'.format(
y_pred.shape, self._score_func.__name__))
if sample_weight is not None:
return self._sign * self._score_func(y, y_pred,
sample_weight=sample_weight,
**self._kwargs)
else:
return self._sign * self._score_func(y, y_pred, **self._kwargs)
def _factory_args(self):
return ", needs_proba=True"
class _ThresholdScorer(_BaseScorer):
def _score(self, method_caller, clf, X, y, sample_weight=None):
"""Evaluate decision function output for X relative to y_true.
Parameters
----------
method_caller : callable
Returns predictions given an estimator, method name, and other
arguments, potentially caching results.
clf : object
Trained classifier to use for scoring. Must have either a
decision_function method or a predict_proba method; the output of
that is used to compute the score.
X : array-like or sparse matrix
Test data that will be fed to clf.decision_function or
clf.predict_proba.
y : array-like
Gold standard target values for X. These must be class labels,
not decision function values.
sample_weight : array-like, optional (default=None)
Sample weights.
Returns
-------
score : float
Score function applied to prediction of estimator on X.
"""
y_type = type_of_target(y)
if y_type not in ("binary", "multilabel-indicator"):
raise ValueError("{0} format is not supported".format(y_type))
if is_regressor(clf):
y_pred = method_caller(clf, "predict", X)
else:
try:
y_pred = method_caller(clf, "decision_function", X)
# For multi-output multi-class estimator
if isinstance(y_pred, list):
                y_pred = np.vstack(y_pred).T
except (NotImplementedError, AttributeError):
y_pred = method_caller(clf, "predict_proba", X)
if y_type == "binary":
if y_pred.shape[1] == 2:
y_pred = y_pred[:, 1]
else:
raise ValueError('got predict_proba of shape {},'
' but need classifier with two'
' classes for {} scoring'.format(
y_pred.shape,
self._score_func.__name__))
elif isinstance(y_pred, list):
y_pred = np.vstack([p[:, -1] for p in y_pred]).T
if sample_weight is not None:
return self._sign * self._score_func(y, y_pred,
sample_weight=sample_weight,
**self._kwargs)
else:
return self._sign * self._score_func(y, y_pred, **self._kwargs)
def _factory_args(self):
return ", needs_threshold=True"
def get_scorer(scoring):
"""Get a scorer from string.
Read more in the :ref:`User Guide <scoring_parameter>`.
Parameters
----------
scoring : str | callable
        Scoring method as a string. If a callable is passed, it is returned
        as-is.
Returns
-------
scorer : callable
The scorer.
"""
if isinstance(scoring, str):
try:
if scoring == 'brier_score_loss':
# deprecated
scorer = brier_score_loss_scorer
else:
scorer = SCORERS[scoring]
except KeyError:
raise ValueError('%r is not a valid scoring value. '
'Use sorted(sklearn.metrics.SCORERS.keys()) '
'to get valid options.' % scoring)
else:
scorer = scoring
return scorer
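# Typical usage sketch ('clf', 'X_test' and 'y_test' are placeholders for a
# fitted classifier and held-out data); for a _PredictScorer such as
# 'accuracy' this is equivalent to accuracy_score(y_test, clf.predict(X_test)):
#
#   >>> scorer = get_scorer('accuracy')
#   >>> score = scorer(clf, X_test, y_test)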
def _passthrough_scorer(estimator, *args, **kwargs):
"""Function that wraps estimator.score"""
return estimator.score(*args, **kwargs)
@_deprecate_positional_args
def check_scoring(estimator, scoring=None, *, allow_none=False):
"""Determine scorer from user options.
A TypeError will be thrown if the estimator cannot be scored.
Parameters
----------
estimator : estimator object implementing 'fit'
The object to use to fit the data.
scoring : string, callable or None, optional, default: None
A string (see model evaluation documentation) or
a scorer callable object / function with signature
``scorer(estimator, X, y)``.
allow_none : boolean, optional, default: False
If no scoring is specified and the estimator has no score function, we
can either return None or raise an exception.
Returns
-------
scoring : callable
A scorer callable object / function with signature
``scorer(estimator, X, y)``.
"""
if not hasattr(estimator, 'fit'):
raise TypeError("estimator should be an estimator implementing "
"'fit' method, %r was passed" % estimator)
if isinstance(scoring, str):
return get_scorer(scoring)
elif callable(scoring):
# Heuristic to ensure user has not passed a metric
module = getattr(scoring, '__module__', None)
if hasattr(module, 'startswith') and \
module.startswith('sklearn.metrics.') and \
not module.startswith('sklearn.metrics._scorer') and \
not module.startswith('sklearn.metrics.tests.'):
raise ValueError('scoring value %r looks like it is a metric '
'function rather than a scorer. A scorer should '
'require an estimator as its first parameter. '
'Please use `make_scorer` to convert a metric '
'to a scorer.' % scoring)
return get_scorer(scoring)
elif scoring is None:
if hasattr(estimator, 'score'):
return _passthrough_scorer
elif allow_none:
return None
else:
raise TypeError(
"If no scoring is specified, the estimator passed should "
"have a 'score' method. The estimator %r does not."
% estimator)
elif isinstance(scoring, Iterable):
raise ValueError("For evaluating multiple scores, use "
"sklearn.model_selection.cross_validate instead. "
"{0} was passed.".format(scoring))
else:
raise ValueError("scoring value should either be a callable, string or"
" None. %r was passed" % scoring)
def _check_multimetric_scoring(estimator, scoring=None):
"""Check the scoring parameter in cases when multiple metrics are allowed
Parameters
----------
estimator : sklearn estimator instance
The estimator for which the scoring will be applied.
scoring : string, callable, list/tuple, dict or None, default: None
A single string (see :ref:`scoring_parameter`) or a callable
(see :ref:`scoring`) to evaluate the predictions on the test set.
For evaluating multiple metrics, either give a list of (unique) strings
or a dict with names as keys and callables as values.
NOTE that when using custom scorers, each scorer should return a single
value. Metric functions returning a list/array of values can be wrapped
into multiple scorers that return one value each.
See :ref:`multimetric_grid_search` for an example.
If None the estimator's score method is used.
The return value in that case will be ``{'score': <default_scorer>}``.
If the estimator's score method is not available, a ``TypeError``
is raised.
Returns
-------
scorers_dict : dict
A dict mapping each scorer name to its validated scorer.
is_multimetric : bool
True if scorer is a list/tuple or dict of callables
False if scorer is None/str/callable
"""
if callable(scoring) or scoring is None or isinstance(scoring,
str):
scorers = {"score": check_scoring(estimator, scoring=scoring)}
return scorers, False
else:
err_msg_generic = ("scoring should either be a single string or "
"callable for single metric evaluation or a "
"list/tuple of strings or a dict of scorer name "
"mapped to the callable for multiple metric "
"evaluation. Got %s of type %s"
% (repr(scoring), type(scoring)))
if isinstance(scoring, (list, tuple, set)):
err_msg = ("The list/tuple elements must be unique "
"strings of predefined scorers. ")
invalid = False
try:
keys = set(scoring)
except TypeError:
invalid = True
if invalid:
raise ValueError(err_msg)
if len(keys) != len(scoring):
raise ValueError(err_msg + "Duplicate elements were found in"
" the given list. %r" % repr(scoring))
elif len(keys) > 0:
if not all(isinstance(k, str) for k in keys):
if any(callable(k) for k in keys):
raise ValueError(err_msg +
"One or more of the elements were "
"callables. Use a dict of score name "
"mapped to the scorer callable. "
"Got %r" % repr(scoring))
else:
raise ValueError(err_msg +
"Non-string types were found in "
"the given list. Got %r"
% repr(scoring))
scorers = {scorer: check_scoring(estimator, scoring=scorer)
for scorer in scoring}
else:
raise ValueError(err_msg +
"Empty list was given. %r" % repr(scoring))
elif isinstance(scoring, dict):
keys = set(scoring)
if not all(isinstance(k, str) for k in keys):
raise ValueError("Non-string types were found in the keys of "
"the given dict. scoring=%r" % repr(scoring))
if len(keys) == 0:
raise ValueError("An empty dict was passed. %r"
% repr(scoring))
scorers = {key: check_scoring(estimator, scoring=scorer)
for key, scorer in scoring.items()}
else:
raise ValueError(err_msg_generic)
return scorers, True
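# Shape of the return value, sketched ('est' is any estimator; scorer reprs
# abbreviated): a single string yields one entry under the key 'score' with
# is_multimetric=False, while a list of names yields one entry per name with
# is_multimetric=True.
#
#   >>> _check_multimetric_scoring(est, 'accuracy')
#   ({'score': <accuracy scorer>}, False)
#   >>> _check_multimetric_scoring(est, ['accuracy', 'recall'])
#   ({'accuracy': <scorer>, 'recall': <scorer>}, True)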
@_deprecate_positional_args
def make_scorer(score_func, *, greater_is_better=True, needs_proba=False,
needs_threshold=False, **kwargs):
"""Make a scorer from a performance metric or loss function.
This factory function wraps scoring functions for use in GridSearchCV
and cross_val_score. It takes a score function, such as ``accuracy_score``,
    ``mean_squared_error``, ``adjusted_rand_score`` or ``average_precision``
and returns a callable that scores an estimator's output.
Read more in the :ref:`User Guide <scoring>`.
Parameters
----------
score_func : callable,
Score function (or loss function) with signature
``score_func(y, y_pred, **kwargs)``.
greater_is_better : boolean, default=True
Whether score_func is a score function (default), meaning high is good,
or a loss function, meaning low is good. In the latter case, the
scorer object will sign-flip the outcome of the score_func.
needs_proba : boolean, default=False
Whether score_func requires predict_proba to get probability estimates
out of a classifier.
If True, for binary `y_true`, the score function is supposed to accept
a 1D `y_pred` (i.e., probability of the positive class, shape
`(n_samples,)`).
needs_threshold : boolean, default=False
Whether score_func takes a continuous decision certainty.
This only works for binary classification using estimators that
have either a decision_function or predict_proba method.
If True, for binary `y_true`, the score function is supposed to accept
a 1D `y_pred` (i.e., probability of the positive class or the decision
function, shape `(n_samples,)`).
For example ``average_precision`` or the area under the roc curve
can not be computed using discrete predictions alone.
**kwargs : additional arguments
Additional parameters to be passed to score_func.
Returns
-------
scorer : callable
Callable object that returns a scalar score; greater is better.
Examples
--------
>>> from sklearn.metrics import fbeta_score, make_scorer
>>> ftwo_scorer = make_scorer(fbeta_score, beta=2)
>>> ftwo_scorer
make_scorer(fbeta_score, beta=2)
>>> from sklearn.model_selection import GridSearchCV
>>> from sklearn.svm import LinearSVC
>>> grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]},
... scoring=ftwo_scorer)
Notes
-----
If `needs_proba=False` and `needs_threshold=False`, the score
function is supposed to accept the output of :term:`predict`. If
`needs_proba=True`, the score function is supposed to accept the
output of :term:`predict_proba` (For binary `y_true`, the score function is
supposed to accept probability of the positive class). If
`needs_threshold=True`, the score function is supposed to accept the
output of :term:`decision_function`.
"""
sign = 1 if greater_is_better else -1
if needs_proba and needs_threshold:
raise ValueError("Set either needs_proba or needs_threshold to True,"
" but not both.")
if needs_proba:
cls = _ProbaScorer
elif needs_threshold:
cls = _ThresholdScorer
else:
cls = _PredictScorer
return cls(score_func, sign, kwargs)
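# Sign-flip sketch for loss functions: with greater_is_better=False the
# returned scorer negates the metric, so greater is still better, e.g.
# neg_mse(fitted_est, X, y) == -mean_squared_error(y, fitted_est.predict(X)).
#
#   >>> neg_mse = make_scorer(mean_squared_error, greater_is_better=False)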
# Standard regression scores
explained_variance_scorer = make_scorer(explained_variance_score)
r2_scorer = make_scorer(r2_score)
max_error_scorer = make_scorer(max_error,
greater_is_better=False)
neg_mean_squared_error_scorer = make_scorer(mean_squared_error,
greater_is_better=False)
neg_mean_squared_log_error_scorer = make_scorer(mean_squared_log_error,
greater_is_better=False)
neg_mean_absolute_error_scorer = make_scorer(mean_absolute_error,
greater_is_better=False)
neg_median_absolute_error_scorer = make_scorer(median_absolute_error,
greater_is_better=False)
neg_root_mean_squared_error_scorer = make_scorer(mean_squared_error,
greater_is_better=False,
squared=False)
neg_mean_poisson_deviance_scorer = make_scorer(
mean_poisson_deviance, greater_is_better=False
)
neg_mean_gamma_deviance_scorer = make_scorer(
mean_gamma_deviance, greater_is_better=False
)
# Standard Classification Scores
accuracy_scorer = make_scorer(accuracy_score)
balanced_accuracy_scorer = make_scorer(balanced_accuracy_score)
# Score functions that need decision values
roc_auc_scorer = make_scorer(roc_auc_score, greater_is_better=True,
needs_threshold=True)
average_precision_scorer = make_scorer(average_precision_score,
needs_threshold=True)
roc_auc_ovo_scorer = make_scorer(roc_auc_score, needs_proba=True,
multi_class='ovo')
roc_auc_ovo_weighted_scorer = make_scorer(roc_auc_score, needs_proba=True,
multi_class='ovo',
average='weighted')
roc_auc_ovr_scorer = make_scorer(roc_auc_score, needs_proba=True,
multi_class='ovr')
roc_auc_ovr_weighted_scorer = make_scorer(roc_auc_score, needs_proba=True,
multi_class='ovr',
average='weighted')
# Score function for probabilistic classification
neg_log_loss_scorer = make_scorer(log_loss, greater_is_better=False,
needs_proba=True)
neg_brier_score_scorer = make_scorer(brier_score_loss,
greater_is_better=False,
needs_proba=True)
brier_score_loss_scorer = make_scorer(brier_score_loss,
greater_is_better=False,
needs_proba=True)
deprecation_msg = ('Scoring method brier_score_loss was renamed to '
'neg_brier_score in version 0.22 and will '
'be removed in 0.24.')
brier_score_loss_scorer._deprecation_msg = deprecation_msg
# Clustering scores
adjusted_rand_scorer = make_scorer(adjusted_rand_score)
homogeneity_scorer = make_scorer(homogeneity_score)
completeness_scorer = make_scorer(completeness_score)
v_measure_scorer = make_scorer(v_measure_score)
mutual_info_scorer = make_scorer(mutual_info_score)
adjusted_mutual_info_scorer = make_scorer(adjusted_mutual_info_score)
normalized_mutual_info_scorer = make_scorer(normalized_mutual_info_score)
fowlkes_mallows_scorer = make_scorer(fowlkes_mallows_score)
SCORERS = dict(explained_variance=explained_variance_scorer,
r2=r2_scorer,
max_error=max_error_scorer,
neg_median_absolute_error=neg_median_absolute_error_scorer,
neg_mean_absolute_error=neg_mean_absolute_error_scorer,
neg_mean_squared_error=neg_mean_squared_error_scorer,
neg_mean_squared_log_error=neg_mean_squared_log_error_scorer,
neg_root_mean_squared_error=neg_root_mean_squared_error_scorer,
neg_mean_poisson_deviance=neg_mean_poisson_deviance_scorer,
neg_mean_gamma_deviance=neg_mean_gamma_deviance_scorer,
accuracy=accuracy_scorer, roc_auc=roc_auc_scorer,
roc_auc_ovr=roc_auc_ovr_scorer,
roc_auc_ovo=roc_auc_ovo_scorer,
roc_auc_ovr_weighted=roc_auc_ovr_weighted_scorer,
roc_auc_ovo_weighted=roc_auc_ovo_weighted_scorer,
balanced_accuracy=balanced_accuracy_scorer,
average_precision=average_precision_scorer,
neg_log_loss=neg_log_loss_scorer,
neg_brier_score=neg_brier_score_scorer,
# Cluster metrics that use supervised evaluation
adjusted_rand_score=adjusted_rand_scorer,
homogeneity_score=homogeneity_scorer,
completeness_score=completeness_scorer,
v_measure_score=v_measure_scorer,
mutual_info_score=mutual_info_scorer,
adjusted_mutual_info_score=adjusted_mutual_info_scorer,
normalized_mutual_info_score=normalized_mutual_info_scorer,
fowlkes_mallows_score=fowlkes_mallows_scorer)
for name, metric in [('precision', precision_score),
('recall', recall_score), ('f1', f1_score),
('jaccard', jaccard_score)]:
SCORERS[name] = make_scorer(metric, average='binary')
for average in ['macro', 'micro', 'samples', 'weighted']:
qualified_name = '{0}_{1}'.format(name, average)
SCORERS[qualified_name] = make_scorer(metric, pos_label=None,
average=average)
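# The loop above registers the binary scorer plus four averaged variants per
# metric, e.g. 'precision', 'precision_macro', 'precision_micro',
# 'precision_samples' and 'precision_weighted'.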

View file

@@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _base # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.metrics.base'
correct_import_path = 'sklearn.metrics'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_base, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)
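# Behaviour sketch: importing through the deprecated path emits a
# deprecation warning (suppressed under pytest) and __getattr__ then
# forwards lookups, so sklearn.metrics.base.<name> resolves to
# sklearn.metrics._base.<name>.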

View file

@@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _classification # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.metrics.classification'
correct_import_path = 'sklearn.metrics'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_classification, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)

View file

@@ -0,0 +1,32 @@
"""
The :mod:`sklearn.metrics.cluster` submodule contains evaluation metrics for
cluster analysis results. There are two forms of evaluation:
- supervised, which uses ground truth class values for each sample.
- unsupervised, which does not and measures the 'quality' of the model itself.
"""
from ._supervised import adjusted_mutual_info_score
from ._supervised import normalized_mutual_info_score
from ._supervised import adjusted_rand_score
from ._supervised import completeness_score
from ._supervised import contingency_matrix
from ._supervised import expected_mutual_information
from ._supervised import homogeneity_completeness_v_measure
from ._supervised import homogeneity_score
from ._supervised import mutual_info_score
from ._supervised import v_measure_score
from ._supervised import fowlkes_mallows_score
from ._supervised import entropy
from ._unsupervised import silhouette_samples
from ._unsupervised import silhouette_score
from ._unsupervised import calinski_harabasz_score
from ._unsupervised import davies_bouldin_score
from ._bicluster import consensus_score
__all__ = ["adjusted_mutual_info_score", "normalized_mutual_info_score",
"adjusted_rand_score", "completeness_score", "contingency_matrix",
"expected_mutual_information", "homogeneity_completeness_v_measure",
"homogeneity_score", "mutual_info_score", "v_measure_score",
"fowlkes_mallows_score", "entropy", "silhouette_samples",
"silhouette_score", "calinski_harabasz_score",
"davies_bouldin_score", "consensus_score"]

View file

@@ -0,0 +1,86 @@
import numpy as np
from scipy.optimize import linear_sum_assignment
from ...utils.validation import check_consistent_length, check_array
from ...utils.validation import _deprecate_positional_args
__all__ = ["consensus_score"]
def _check_rows_and_columns(a, b):
"""Unpacks the row and column arrays and checks their shape."""
check_consistent_length(*a)
check_consistent_length(*b)
checks = lambda x: check_array(x, ensure_2d=False)
a_rows, a_cols = map(checks, a)
b_rows, b_cols = map(checks, b)
return a_rows, a_cols, b_rows, b_cols
def _jaccard(a_rows, a_cols, b_rows, b_cols):
"""Jaccard coefficient on the elements of the two biclusters."""
intersection = ((a_rows * b_rows).sum() *
(a_cols * b_cols).sum())
a_size = a_rows.sum() * a_cols.sum()
b_size = b_rows.sum() * b_cols.sum()
return intersection / (a_size + b_size - intersection)
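# Quick check of the Jaccard helper: identical biclusters overlap
# completely, so the coefficient is 1.0.
#
#   >>> rows = np.array([True, True, False])
#   >>> cols = np.array([True, False])
#   >>> _jaccard(rows, cols, rows, cols)
#   1.0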
def _pairwise_similarity(a, b, similarity):
"""Computes pairwise similarity matrix.
result[i, j] is the Jaccard coefficient of a's bicluster i and b's
bicluster j.
"""
a_rows, a_cols, b_rows, b_cols = _check_rows_and_columns(a, b)
n_a = a_rows.shape[0]
n_b = b_rows.shape[0]
result = np.array(list(list(similarity(a_rows[i], a_cols[i],
b_rows[j], b_cols[j])
for j in range(n_b))
for i in range(n_a)))
return result
@_deprecate_positional_args
def consensus_score(a, b, *, similarity="jaccard"):
"""The similarity of two sets of biclusters.
Similarity between individual biclusters is computed. Then the
best matching between sets is found using the Hungarian algorithm.
The final score is the sum of similarities divided by the size of
the larger set.
Read more in the :ref:`User Guide <biclustering>`.
Parameters
----------
a : (rows, columns)
Tuple of row and column indicators for a set of biclusters.
b : (rows, columns)
Another set of biclusters like ``a``.
similarity : string or function, optional, default: "jaccard"
May be the string "jaccard" to use the Jaccard coefficient, or
any function that takes four arguments, each of which is a 1d
indicator vector: (a_rows, a_columns, b_rows, b_columns).
References
----------
    * Hochreiter, Bodenhofer, et al., 2010. `FABIA: factor analysis
for bicluster acquisition
<https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2881408/>`__.
"""
if similarity == "jaccard":
similarity = _jaccard
matrix = _pairwise_similarity(a, b, similarity)
row_indices, col_indices = linear_sum_assignment(1. - matrix)
n_a = len(a[0])
n_b = len(b[0])
return matrix[row_indices, col_indices].sum() / max(n_a, n_b)
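# Usage sketch: comparing a set of biclusters with itself gives a perfect
# score, since the Hungarian matching pairs each bicluster with its copy.
#
#   >>> a = (np.array([[True, True, False]]), np.array([[True, False]]))
#   >>> consensus_score(a, a)
#   1.0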

View file

@@ -0,0 +1,980 @@
"""Utilities to evaluate the clustering performance of models.
Functions named as *_score return a scalar value to maximize: the higher the
better.
"""
# Authors: Olivier Grisel <olivier.grisel@ensta.org>
# Wei LI <kuantkid@gmail.com>
# Diego Molla <dmolla-aliod@gmail.com>
# Arnaud Fouchet <foucheta@gmail.com>
# Thierry Guillemot <thierry.guillemot.work@gmail.com>
# Gregory Stupp <stuppie@gmail.com>
# Joel Nothman <joel.nothman@gmail.com>
# Arya McCarthy <arya@jhu.edu>
# License: BSD 3 clause
from math import log
import numpy as np
from scipy import sparse as sp
from scipy.special import comb
from ._expected_mutual_info_fast import expected_mutual_information
from ...utils.validation import check_array, check_consistent_length
from ...utils.validation import _deprecate_positional_args
from ...utils.fixes import _astype_copy_false
def _comb2(n):
# the exact version is faster for k == 2: use it by default globally in
# this module instead of the float approximate variant
return comb(n, 2, exact=1)
def check_clusterings(labels_true, labels_pred):
"""Check that the labels arrays are 1D and of same dimension.
Parameters
----------
labels_true : array-like of shape (n_samples,)
The true labels.
labels_pred : array-like of shape (n_samples,)
The predicted labels.
"""
labels_true = check_array(
labels_true, ensure_2d=False, ensure_min_samples=0, dtype=None,
)
labels_pred = check_array(
labels_pred, ensure_2d=False, ensure_min_samples=0, dtype=None,
)
# input checks
if labels_true.ndim != 1:
raise ValueError(
"labels_true must be 1D: shape is %r" % (labels_true.shape,))
if labels_pred.ndim != 1:
raise ValueError(
"labels_pred must be 1D: shape is %r" % (labels_pred.shape,))
check_consistent_length(labels_true, labels_pred)
return labels_true, labels_pred
def _generalized_average(U, V, average_method):
"""Return a particular mean of two numbers."""
if average_method == "min":
return min(U, V)
elif average_method == "geometric":
return np.sqrt(U * V)
elif average_method == "arithmetic":
return np.mean([U, V])
elif average_method == "max":
return max(U, V)
else:
raise ValueError("'average_method' must be 'min', 'geometric', "
"'arithmetic', or 'max'")
@_deprecate_positional_args
def contingency_matrix(labels_true, labels_pred, *, eps=None, sparse=False):
"""Build a contingency matrix describing the relationship between labels.
Parameters
----------
labels_true : int array, shape = [n_samples]
Ground truth class labels to be used as a reference
labels_pred : array-like of shape (n_samples,)
Cluster labels to evaluate
eps : None or float, optional.
If a float, that value is added to all values in the contingency
matrix. This helps to stop NaN propagation.
If ``None``, nothing is adjusted.
sparse : boolean, optional.
        If True, return a sparse CSR contingency matrix. If ``eps`` is not
        None and ``sparse`` is True, a ValueError is raised.
.. versionadded:: 0.18
Returns
-------
contingency : {array-like, sparse}, shape=[n_classes_true, n_classes_pred]
Matrix :math:`C` such that :math:`C_{i, j}` is the number of samples in
true class :math:`i` and in predicted class :math:`j`. If
``eps is None``, the dtype of this array will be integer. If ``eps`` is
given, the dtype will be float.
Will be a ``scipy.sparse.csr_matrix`` if ``sparse=True``.
"""
if eps is not None and sparse:
raise ValueError("Cannot set 'eps' when sparse=True")
classes, class_idx = np.unique(labels_true, return_inverse=True)
clusters, cluster_idx = np.unique(labels_pred, return_inverse=True)
n_classes = classes.shape[0]
n_clusters = clusters.shape[0]
# Using coo_matrix to accelerate simple histogram calculation,
# i.e. bins are consecutive integers
# Currently, coo_matrix is faster than histogram2d for simple cases
contingency = sp.coo_matrix((np.ones(class_idx.shape[0]),
(class_idx, cluster_idx)),
shape=(n_classes, n_clusters),
dtype=np.int)
if sparse:
contingency = contingency.tocsr()
contingency.sum_duplicates()
else:
contingency = contingency.toarray()
if eps is not None:
# don't use += as contingency is integer
contingency = contingency + eps
return contingency
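# Small example of the contingency layout: rows are true classes, columns
# are predicted clusters.
#
#   >>> contingency_matrix([0, 0, 1, 1], [0, 1, 1, 1])
#   array([[1, 1],
#          [0, 2]])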
# clustering measures
def adjusted_rand_score(labels_true, labels_pred):
"""Rand index adjusted for chance.
The Rand Index computes a similarity measure between two clusterings
by considering all pairs of samples and counting pairs that are
assigned in the same or different clusters in the predicted and
true clusterings.
The raw RI score is then "adjusted for chance" into the ARI score
using the following scheme::
ARI = (RI - Expected_RI) / (max(RI) - Expected_RI)
The adjusted Rand index is thus ensured to have a value close to
0.0 for random labeling independently of the number of clusters and
samples and exactly 1.0 when the clusterings are identical (up to
a permutation).
ARI is a symmetric measure::
adjusted_rand_score(a, b) == adjusted_rand_score(b, a)
Read more in the :ref:`User Guide <adjusted_rand_score>`.
Parameters
----------
labels_true : int array, shape = [n_samples]
Ground truth class labels to be used as a reference
labels_pred : array-like of shape (n_samples,)
Cluster labels to evaluate
Returns
-------
ari : float
Similarity score between -1.0 and 1.0. Random labelings have an ARI
close to 0.0. 1.0 stands for perfect match.
Examples
--------
    Perfectly matching labelings have a score of 1 even when the label
    values are permuted::
>>> from sklearn.metrics.cluster import adjusted_rand_score
>>> adjusted_rand_score([0, 0, 1, 1], [0, 0, 1, 1])
1.0
>>> adjusted_rand_score([0, 0, 1, 1], [1, 1, 0, 0])
1.0
    Labelings that assign all class members to the same clusters
    are complete but not always pure, hence penalized::
>>> adjusted_rand_score([0, 0, 1, 2], [0, 0, 1, 1])
0.57...
ARI is symmetric, so labelings that have pure clusters with members
coming from the same classes but unnecessary splits are penalized::
>>> adjusted_rand_score([0, 0, 1, 1], [0, 0, 1, 2])
0.57...
    If class members are completely split across different clusters, the
assignment is totally incomplete, hence the ARI is very low::
>>> adjusted_rand_score([0, 0, 0, 0], [0, 1, 2, 3])
0.0
References
----------
.. [Hubert1985] L. Hubert and P. Arabie, Comparing Partitions,
Journal of Classification 1985
https://link.springer.com/article/10.1007%2FBF01908075
.. [wk] https://en.wikipedia.org/wiki/Rand_index#Adjusted_Rand_index
See also
--------
adjusted_mutual_info_score: Adjusted Mutual Information
"""
labels_true, labels_pred = check_clusterings(labels_true, labels_pred)
n_samples = labels_true.shape[0]
n_classes = np.unique(labels_true).shape[0]
n_clusters = np.unique(labels_pred).shape[0]
# Special limit cases: no clustering since the data is not split;
    # or trivial clustering where each sample is assigned a unique cluster.
# These are perfect matches hence return 1.0.
if (n_classes == n_clusters == 1 or
n_classes == n_clusters == 0 or
n_classes == n_clusters == n_samples):
return 1.0
# Compute the ARI using the contingency data
contingency = contingency_matrix(labels_true, labels_pred, sparse=True)
sum_comb_c = sum(_comb2(n_c) for n_c in np.ravel(contingency.sum(axis=1)))
sum_comb_k = sum(_comb2(n_k) for n_k in np.ravel(contingency.sum(axis=0)))
sum_comb = sum(_comb2(n_ij) for n_ij in contingency.data)
prod_comb = (sum_comb_c * sum_comb_k) / _comb2(n_samples)
mean_comb = (sum_comb_k + sum_comb_c) / 2.
return (sum_comb - prod_comb) / (mean_comb - prod_comb)
@_deprecate_positional_args
def homogeneity_completeness_v_measure(labels_true, labels_pred, *, beta=1.0):
"""Compute the homogeneity and completeness and V-Measure scores at once.
Those metrics are based on normalized conditional entropy measures of
the clustering labeling to evaluate given the knowledge of a Ground
Truth class labels of the same samples.
A clustering result satisfies homogeneity if all of its clusters
contain only data points which are members of a single class.
A clustering result satisfies completeness if all the data points
that are members of a given class are elements of the same cluster.
Both scores have positive values between 0.0 and 1.0, larger values
being desirable.
Those 3 metrics are independent of the absolute values of the labels:
a permutation of the class or cluster label values won't change the
score values in any way.
V-Measure is furthermore symmetric: swapping ``labels_true`` and
    ``labels_pred`` will give the same score. This does not hold for
homogeneity and completeness. V-Measure is identical to
:func:`normalized_mutual_info_score` with the arithmetic averaging
method.
Read more in the :ref:`User Guide <homogeneity_completeness>`.
Parameters
----------
labels_true : int array, shape = [n_samples]
ground truth class labels to be used as a reference
labels_pred : array-like of shape (n_samples,)
cluster labels to evaluate
beta : float
Ratio of weight attributed to ``homogeneity`` vs ``completeness``.
If ``beta`` is greater than 1, ``completeness`` is weighted more
strongly in the calculation. If ``beta`` is less than 1,
``homogeneity`` is weighted more strongly.
Returns
-------
homogeneity : float
score between 0.0 and 1.0. 1.0 stands for perfectly homogeneous labeling
completeness : float
score between 0.0 and 1.0. 1.0 stands for perfectly complete labeling
v_measure : float
harmonic mean of the first two
See also
--------
homogeneity_score
completeness_score
v_measure_score
"""
labels_true, labels_pred = check_clusterings(labels_true, labels_pred)
if len(labels_true) == 0:
return 1.0, 1.0, 1.0
entropy_C = entropy(labels_true)
entropy_K = entropy(labels_pred)
contingency = contingency_matrix(labels_true, labels_pred, sparse=True)
MI = mutual_info_score(None, None, contingency=contingency)
homogeneity = MI / (entropy_C) if entropy_C else 1.0
completeness = MI / (entropy_K) if entropy_K else 1.0
if homogeneity + completeness == 0.0:
v_measure_score = 0.0
else:
v_measure_score = ((1 + beta) * homogeneity * completeness
/ (beta * homogeneity + completeness))
return homogeneity, completeness, v_measure_score
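# Splitting one class across two clusters stays perfectly homogeneous but
# loses completeness; the V-measure is the harmonic mean of the two.
#
#   >>> homogeneity_completeness_v_measure([0, 0, 1, 1], [0, 0, 1, 2])
#   (1.0, 0.666..., 0.8...)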
def homogeneity_score(labels_true, labels_pred):
"""Homogeneity metric of a cluster labeling given a ground truth.
A clustering result satisfies homogeneity if all of its clusters
contain only data points which are members of a single class.
This metric is independent of the absolute values of the labels:
a permutation of the class or cluster label values won't change the
score value in any way.
    This metric is not symmetric: switching ``labels_true`` with ``labels_pred``
will return the :func:`completeness_score` which will be different in
general.
Read more in the :ref:`User Guide <homogeneity_completeness>`.
Parameters
----------
labels_true : int array, shape = [n_samples]
ground truth class labels to be used as a reference
labels_pred : array-like of shape (n_samples,)
cluster labels to evaluate
Returns
-------
homogeneity : float
score between 0.0 and 1.0. 1.0 stands for perfectly homogeneous labeling
References
----------
.. [1] `Andrew Rosenberg and Julia Hirschberg, 2007. V-Measure: A
conditional entropy-based external cluster evaluation measure
<https://aclweb.org/anthology/D/D07/D07-1043.pdf>`_
See also
--------
completeness_score
v_measure_score
Examples
--------
Perfect labelings are homogeneous::
>>> from sklearn.metrics.cluster import homogeneity_score
>>> homogeneity_score([0, 0, 1, 1], [1, 1, 0, 0])
1.0
Non-perfect labelings that further split classes into more clusters can be
perfectly homogeneous::
>>> print("%.6f" % homogeneity_score([0, 0, 1, 1], [0, 0, 1, 2]))
1.000000
>>> print("%.6f" % homogeneity_score([0, 0, 1, 1], [0, 1, 2, 3]))
1.000000
    Clusters that include samples from different classes do not make for a
    homogeneous labeling::
>>> print("%.6f" % homogeneity_score([0, 0, 1, 1], [0, 1, 0, 1]))
0.0...
>>> print("%.6f" % homogeneity_score([0, 0, 1, 1], [0, 0, 0, 0]))
0.0...
"""
return homogeneity_completeness_v_measure(labels_true, labels_pred)[0]
def completeness_score(labels_true, labels_pred):
"""Completeness metric of a cluster labeling given a ground truth.
A clustering result satisfies completeness if all the data points
that are members of a given class are elements of the same cluster.
This metric is independent of the absolute values of the labels:
a permutation of the class or cluster label values won't change the
score value in any way.
This metric is not symmetric: switching ``labels_true`` with ``labels_pred``
will return the :func:`homogeneity_score`, which will be different in
general.
Read more in the :ref:`User Guide <homogeneity_completeness>`.
Parameters
----------
labels_true : int array, shape = [n_samples]
ground truth class labels to be used as a reference
labels_pred : array-like of shape (n_samples,)
cluster labels to evaluate
Returns
-------
completeness : float
score between 0.0 and 1.0. 1.0 stands for perfectly complete labeling
References
----------
.. [1] `Andrew Rosenberg and Julia Hirschberg, 2007. V-Measure: A
conditional entropy-based external cluster evaluation measure
<https://aclweb.org/anthology/D/D07/D07-1043.pdf>`_
See also
--------
homogeneity_score
v_measure_score
Examples
--------
Perfect labelings are complete::
>>> from sklearn.metrics.cluster import completeness_score
>>> completeness_score([0, 0, 1, 1], [1, 1, 0, 0])
1.0
Non-perfect labelings that assign all class members to the same clusters
are still complete::
>>> print(completeness_score([0, 0, 1, 1], [0, 0, 0, 0]))
1.0
>>> print(completeness_score([0, 1, 2, 3], [0, 0, 1, 1]))
0.999...
If class members are split across different clusters, the
assignment cannot be complete::
>>> print(completeness_score([0, 0, 1, 1], [0, 1, 0, 1]))
0.0
>>> print(completeness_score([0, 0, 0, 0], [0, 1, 2, 3]))
0.0
"""
return homogeneity_completeness_v_measure(labels_true, labels_pred)[1]
@_deprecate_positional_args
def v_measure_score(labels_true, labels_pred, *, beta=1.0):
"""V-measure cluster labeling given a ground truth.
This score is identical to :func:`normalized_mutual_info_score` with
the ``'arithmetic'`` option for averaging.
The V-measure is the weighted harmonic mean of homogeneity and completeness::
v = (1 + beta) * homogeneity * completeness
/ (beta * homogeneity + completeness)
This metric is independent of the absolute values of the labels:
a permutation of the class or cluster label values won't change the
score value in any way.
This metric is furthermore symmetric: switching ``labels_true`` with
``labels_pred`` will return the same score value. This can be useful to
measure the agreement of two independent label assignment strategies
on the same dataset when the real ground truth is not known.
Read more in the :ref:`User Guide <homogeneity_completeness>`.
Parameters
----------
labels_true : int array, shape = [n_samples]
ground truth class labels to be used as a reference
labels_pred : array-like of shape (n_samples,)
cluster labels to evaluate
beta : float
Ratio of weight attributed to ``homogeneity`` vs ``completeness``.
If ``beta`` is greater than 1, ``completeness`` is weighted more
strongly in the calculation. If ``beta`` is less than 1,
``homogeneity`` is weighted more strongly.
Returns
-------
v_measure : float
score between 0.0 and 1.0. 1.0 stands for a perfectly complete and homogeneous labeling
References
----------
.. [1] `Andrew Rosenberg and Julia Hirschberg, 2007. V-Measure: A
conditional entropy-based external cluster evaluation measure
<https://aclweb.org/anthology/D/D07/D07-1043.pdf>`_
See also
--------
homogeneity_score
completeness_score
normalized_mutual_info_score
Examples
--------
Perfect labelings are both homogeneous and complete, hence have score 1.0::
>>> from sklearn.metrics.cluster import v_measure_score
>>> v_measure_score([0, 0, 1, 1], [0, 0, 1, 1])
1.0
>>> v_measure_score([0, 0, 1, 1], [1, 1, 0, 0])
1.0
Labelings that assign all class members to the same clusters
are complete but not homogeneous, hence penalized::
>>> print("%.6f" % v_measure_score([0, 0, 1, 2], [0, 0, 1, 1]))
0.8...
>>> print("%.6f" % v_measure_score([0, 1, 2, 3], [0, 0, 1, 1]))
0.66...
Labelings that have pure clusters with members coming from the same
classes are homogeneous, but unnecessary splits harm completeness
and thus penalize the V-measure as well::
>>> print("%.6f" % v_measure_score([0, 0, 1, 1], [0, 0, 1, 2]))
0.8...
>>> print("%.6f" % v_measure_score([0, 0, 1, 1], [0, 1, 2, 3]))
0.66...
If class members are completely split across different clusters,
the assignment is totally incomplete, hence the V-Measure is null::
>>> print("%.6f" % v_measure_score([0, 0, 0, 0], [0, 1, 2, 3]))
0.0...
Clusters that include samples from entirely different classes destroy the
homogeneity of the labeling, hence::
>>> print("%.6f" % v_measure_score([0, 0, 1, 1], [0, 0, 0, 0]))
0.0...
"""
return homogeneity_completeness_v_measure(labels_true, labels_pred,
beta=beta)[2]
@_deprecate_positional_args
def mutual_info_score(labels_true, labels_pred, *, contingency=None):
"""Mutual Information between two clusterings.
The Mutual Information is a measure of the similarity between two labelings of
the same data. Where :math:`|U_i|` is the number of the samples
in cluster :math:`U_i` and :math:`|V_j|` is the number of the
samples in cluster :math:`V_j`, the Mutual Information
between clusterings :math:`U` and :math:`V` is given as:
.. math::
MI(U,V)=\\sum_{i=1}^{|U|} \\sum_{j=1}^{|V|} \\frac{|U_i\\cap V_j|}{N}
\\log\\frac{N|U_i \\cap V_j|}{|U_i||V_j|}
This metric is independent of the absolute values of the labels:
a permutation of the class or cluster label values won't change the
score value in any way.
This metric is furthermore symmetric: switching ``labels_true`` with
``labels_pred`` will return the same score value. This can be useful to
measure the agreement of two independent label assignment strategies
on the same dataset when the real ground truth is not known.
Read more in the :ref:`User Guide <mutual_info_score>`.
Parameters
----------
labels_true : int array, shape = [n_samples]
A clustering of the data into disjoint subsets.
labels_pred : int array-like of shape (n_samples,)
A clustering of the data into disjoint subsets.
contingency : {None, array, sparse matrix}, \
shape = [n_classes_true, n_classes_pred]
A contingency matrix given by the :func:`contingency_matrix` function.
If value is ``None``, it will be computed, otherwise the given value is
used, with ``labels_true`` and ``labels_pred`` ignored.
Returns
-------
mi : float
Mutual information, a non-negative value
Notes
-----
The logarithm used is the natural logarithm (base-e).
See also
--------
adjusted_mutual_info_score: Adjusted against chance Mutual Information
normalized_mutual_info_score: Normalized Mutual Information
"""
if contingency is None:
labels_true, labels_pred = check_clusterings(labels_true, labels_pred)
contingency = contingency_matrix(labels_true, labels_pred, sparse=True)
else:
contingency = check_array(contingency,
accept_sparse=['csr', 'csc', 'coo'],
dtype=[int, np.int32, np.int64])
if isinstance(contingency, np.ndarray):
# For an array
nzx, nzy = np.nonzero(contingency)
nz_val = contingency[nzx, nzy]
elif sp.issparse(contingency):
# For a sparse matrix
nzx, nzy, nz_val = sp.find(contingency)
else:
raise ValueError("Unsupported type for 'contingency': %s" %
type(contingency))
contingency_sum = contingency.sum()
pi = np.ravel(contingency.sum(axis=1))
pj = np.ravel(contingency.sum(axis=0))
log_contingency_nm = np.log(nz_val)
contingency_nm = nz_val / contingency_sum
# No need to calculate the full outer product; only the non-zero entries are needed
outer = (pi.take(nzx).astype(np.int64, copy=False)
* pj.take(nzy).astype(np.int64, copy=False))
log_outer = -np.log(outer) + log(pi.sum()) + log(pj.sum())
mi = (contingency_nm * (log_contingency_nm - log(contingency_sum)) +
contingency_nm * log_outer)
return np.clip(mi.sum(), 0.0, None)
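# --- Usage sketch (illustrative, not part of the module above) ---
# For identical labelings MI(U, U) reduces to the entropy H(U); two balanced
# classes over four samples give ln(2) ~= 0.6931 (natural logarithm).
import numpy as np
from sklearn.metrics.cluster import mutual_info_score

mi = mutual_info_score([0, 0, 1, 1], [0, 0, 1, 1])
print(mi, np.log(2))  # both ~0.6931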
@_deprecate_positional_args
def adjusted_mutual_info_score(labels_true, labels_pred, *,
average_method='arithmetic'):
"""Adjusted Mutual Information between two clusterings.
Adjusted Mutual Information (AMI) is an adjustment of the Mutual
Information (MI) score to account for chance. It accounts for the fact that
the MI is generally higher for two clusterings with a larger number of
clusters, regardless of whether there is actually more information shared.
For two clusterings :math:`U` and :math:`V`, the AMI is given as::
AMI(U, V) = [MI(U, V) - E(MI(U, V))] / [avg(H(U), H(V)) - E(MI(U, V))]
This metric is independent of the absolute values of the labels:
a permutation of the class or cluster label values won't change the
score value in any way.
This metric is furthermore symmetric: switching ``labels_true`` with
``labels_pred`` will return the same score value. This can be useful to
measure the agreement of two independent label assignment strategies
on the same dataset when the real ground truth is not known.
Be mindful that this function is an order of magnitude slower than other
metrics, such as the Adjusted Rand Index.
Read more in the :ref:`User Guide <mutual_info_score>`.
Parameters
----------
labels_true : int array, shape = [n_samples]
A clustering of the data into disjoint subsets.
labels_pred : int array-like of shape (n_samples,)
A clustering of the data into disjoint subsets.
average_method : string, optional (default: 'arithmetic')
How to compute the normalizer in the denominator. Possible options
are 'min', 'geometric', 'arithmetic', and 'max'.
.. versionadded:: 0.20
.. versionchanged:: 0.22
The default value of ``average_method`` changed from 'max' to
'arithmetic'.
Returns
-------
ami : float (upper-bounded by 1.0)
The AMI returns a value of 1 when the two partitions are identical
(i.e. perfectly matched). Random partitions (independent labellings) have
an expected AMI around 0 on average and can therefore be negative.
See also
--------
adjusted_rand_score: Adjusted Rand Index
mutual_info_score: Mutual Information (not adjusted for chance)
Examples
--------
Perfect labelings are both homogeneous and complete, hence have
score 1.0::
>>> from sklearn.metrics.cluster import adjusted_mutual_info_score
>>> adjusted_mutual_info_score([0, 0, 1, 1], [0, 0, 1, 1])
... # doctest: +SKIP
1.0
>>> adjusted_mutual_info_score([0, 0, 1, 1], [1, 1, 0, 0])
... # doctest: +SKIP
1.0
If class members are completely split across different clusters,
the assignment is totally incomplete, hence the AMI is null::
>>> adjusted_mutual_info_score([0, 0, 0, 0], [0, 1, 2, 3])
... # doctest: +SKIP
0.0
References
----------
.. [1] `Vinh, Epps, and Bailey, (2010). Information Theoretic Measures for
Clusterings Comparison: Variants, Properties, Normalization and
Correction for Chance, JMLR
<http://jmlr.csail.mit.edu/papers/volume11/vinh10a/vinh10a.pdf>`_
.. [2] `Wikipedia entry for the Adjusted Mutual Information
<https://en.wikipedia.org/wiki/Adjusted_Mutual_Information>`_
"""
labels_true, labels_pred = check_clusterings(labels_true, labels_pred)
n_samples = labels_true.shape[0]
classes = np.unique(labels_true)
clusters = np.unique(labels_pred)
# Special limit cases: no clustering since the data is not split.
# This is a perfect match hence return 1.0.
if (classes.shape[0] == clusters.shape[0] == 1 or
classes.shape[0] == clusters.shape[0] == 0):
return 1.0
contingency = contingency_matrix(labels_true, labels_pred, sparse=True)
contingency = contingency.astype(np.float64,
**_astype_copy_false(contingency))
# Calculate the MI for the two clusterings
mi = mutual_info_score(labels_true, labels_pred,
contingency=contingency)
# Calculate the expected value for the mutual information
emi = expected_mutual_information(contingency, n_samples)
# Calculate entropy for each labeling
h_true, h_pred = entropy(labels_true), entropy(labels_pred)
normalizer = _generalized_average(h_true, h_pred, average_method)
denominator = normalizer - emi
# Avoid 0.0 / 0.0 when expectation equals maximum, i.e a perfect match.
# normalizer should always be >= emi, but because of floating-point
# representation, sometimes emi is slightly larger. Correct this
# by preserving the sign.
if denominator < 0:
denominator = min(denominator, -np.finfo('float64').eps)
else:
denominator = max(denominator, np.finfo('float64').eps)
ami = (mi - emi) / denominator
return ami
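# --- Usage sketch (illustrative, not part of the module above) ---
# Unlike raw MI, AMI is close to 0 (possibly slightly negative) for two
# independent random labelings, and exactly 1.0 for identical partitions.
import numpy as np
from sklearn.metrics.cluster import (adjusted_mutual_info_score,
                                     mutual_info_score)

rng = np.random.RandomState(0)
a = rng.randint(10, size=1000)
b = rng.randint(10, size=1000)
print(adjusted_mutual_info_score(a, a))  # 1.0
print(adjusted_mutual_info_score(a, b))  # ~0
print(mutual_info_score(a, b))           # raw MI stays positive by chance alone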
@_deprecate_positional_args
def normalized_mutual_info_score(labels_true, labels_pred, *,
average_method='arithmetic'):
"""Normalized Mutual Information between two clusterings.
Normalized Mutual Information (NMI) is a normalization of the Mutual
Information (MI) score to scale the results between 0 (no mutual
information) and 1 (perfect correlation). In this function, mutual
information is normalized by some generalized mean of ``H(labels_true)``
and ``H(labels_pred)``, defined by the ``average_method``.
This measure is not adjusted for chance. Therefore
:func:`adjusted_mutual_info_score` might be preferred.
This metric is independent of the absolute values of the labels:
a permutation of the class or cluster label values won't change the
score value in any way.
This metric is furthermore symmetric: switching ``labels_true`` with
``labels_pred`` will return the same score value. This can be useful to
measure the agreement of two independent label assignment strategies
on the same dataset when the real ground truth is not known.
Read more in the :ref:`User Guide <mutual_info_score>`.
Parameters
----------
labels_true : int array, shape = [n_samples]
A clustering of the data into disjoint subsets.
labels_pred : int array-like of shape (n_samples,)
A clustering of the data into disjoint subsets.
average_method : string, optional (default: 'arithmetic')
How to compute the normalizer in the denominator. Possible options
are 'min', 'geometric', 'arithmetic', and 'max'.
.. versionadded:: 0.20
.. versionchanged:: 0.22
The default value of ``average_method`` changed from 'geometric' to
'arithmetic'.
Returns
-------
nmi : float
score between 0.0 and 1.0. 1.0 stands for a perfect match
See also
--------
v_measure_score: V-Measure (NMI with arithmetic mean option.)
adjusted_rand_score: Adjusted Rand Index
adjusted_mutual_info_score: Adjusted Mutual Information (adjusted
against chance)
Examples
--------
Perfect labelings are both homogeneous and complete, hence have
score 1.0::
>>> from sklearn.metrics.cluster import normalized_mutual_info_score
>>> normalized_mutual_info_score([0, 0, 1, 1], [0, 0, 1, 1])
... # doctest: +SKIP
1.0
>>> normalized_mutual_info_score([0, 0, 1, 1], [1, 1, 0, 0])
... # doctest: +SKIP
1.0
If class members are completely split across different clusters,
the assignment is totally incomplete, hence the NMI is null::
>>> normalized_mutual_info_score([0, 0, 0, 0], [0, 1, 2, 3])
... # doctest: +SKIP
0.0
"""
labels_true, labels_pred = check_clusterings(labels_true, labels_pred)
classes = np.unique(labels_true)
clusters = np.unique(labels_pred)
# Special limit cases: no clustering since the data is not split.
# This is a perfect match hence return 1.0.
if (classes.shape[0] == clusters.shape[0] == 1 or
classes.shape[0] == clusters.shape[0] == 0):
return 1.0
contingency = contingency_matrix(labels_true, labels_pred, sparse=True)
contingency = contingency.astype(np.float64,
**_astype_copy_false(contingency))
# Calculate the MI for the two clusterings
mi = mutual_info_score(labels_true, labels_pred,
contingency=contingency)
# Calculate the expected value for the mutual information
# Calculate entropy for each labeling
h_true, h_pred = entropy(labels_true), entropy(labels_pred)
normalizer = _generalized_average(h_true, h_pred, average_method)
# Avoid 0.0 / 0.0 when either entropy is zero.
normalizer = max(normalizer, np.finfo('float64').eps)
nmi = mi / normalizer
return nmi
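# --- Usage sketch (illustrative, not part of the module above) ---
# ``average_method`` only changes the normalizer; with 'arithmetic' (the
# default since 0.22) NMI coincides with v_measure_score at its default beta.
from sklearn.metrics.cluster import (normalized_mutual_info_score,
                                     v_measure_score)

lt, lp = [0, 0, 1, 1, 2, 2], [0, 0, 1, 2, 2, 2]
for method in ('min', 'geometric', 'arithmetic', 'max'):
    print(method, normalized_mutual_info_score(lt, lp, average_method=method))
print(v_measure_score(lt, lp))  # equals the 'arithmetic' value above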
@_deprecate_positional_args
def fowlkes_mallows_score(labels_true, labels_pred, *, sparse=False):
"""Measure the similarity of two clusterings of a set of points.
.. versionadded:: 0.18
The Fowlkes-Mallows index (FMI) is defined as the geometric mean of
the precision and recall::
FMI = TP / sqrt((TP + FP) * (TP + FN))
Where ``TP`` is the number of **True Positive** (i.e. the number of pairs of
points that belong to the same cluster in both ``labels_true`` and
``labels_pred``), ``FP`` is the number of **False Positive** (i.e. the
number of pairs of points that belong to the same cluster in
``labels_true`` but not in ``labels_pred``) and ``FN`` is the number of
**False Negative** (i.e. the number of pairs of points that belong to the
same cluster in ``labels_pred`` but not in ``labels_true``).
The score ranges from 0 to 1. A high value indicates a good similarity
between the two clusterings.
Read more in the :ref:`User Guide <fowlkes_mallows_scores>`.
Parameters
----------
labels_true : int array, shape = (``n_samples``,)
A clustering of the data into disjoint subsets.
labels_pred : array, shape = (``n_samples``, )
A clustering of the data into disjoint subsets.
sparse : bool
Whether to compute the contingency matrix internally as a sparse matrix.
Returns
-------
score : float
The resulting Fowlkes-Mallows score.
Examples
--------
Perfect labelings are both homogeneous and complete, hence have
score 1.0::
>>> from sklearn.metrics.cluster import fowlkes_mallows_score
>>> fowlkes_mallows_score([0, 0, 1, 1], [0, 0, 1, 1])
1.0
>>> fowlkes_mallows_score([0, 0, 1, 1], [1, 1, 0, 0])
1.0
If class members are completely split across different clusters,
the assignment is totally random, hence the FMI is null::
>>> fowlkes_mallows_score([0, 0, 0, 0], [0, 1, 2, 3])
0.0
References
----------
.. [1] `E. B. Fowlkes and C. L. Mallows, 1983. "A method for comparing two
hierarchical clusterings". Journal of the American Statistical
Association
<http://wildfire.stat.ucla.edu/pdflibrary/fowlkes.pdf>`_
.. [2] `Wikipedia entry for the Fowlkes-Mallows Index
<https://en.wikipedia.org/wiki/Fowlkes-Mallows_index>`_
"""
labels_true, labels_pred = check_clusterings(labels_true, labels_pred)
n_samples, = labels_true.shape
c = contingency_matrix(labels_true, labels_pred,
sparse=True)
c = c.astype(np.int64, **_astype_copy_false(c))
tk = np.dot(c.data, c.data) - n_samples
pk = np.sum(np.asarray(c.sum(axis=0)).ravel() ** 2) - n_samples
qk = np.sum(np.asarray(c.sum(axis=1)).ravel() ** 2) - n_samples
return np.sqrt(tk / pk) * np.sqrt(tk / qk) if tk != 0. else 0.
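# --- Usage sketch (illustrative, not part of the module above) ---
# Hand-check against the closed form above: for the labelings below the
# (doubled) pair counts are tk=4, pk=6, qk=12, so the score is
# sqrt(4/6) * sqrt(4/12) = 4 / sqrt(72).
import numpy as np
from sklearn.metrics.cluster import fowlkes_mallows_score

score = fowlkes_mallows_score([0, 0, 0, 1, 1, 1], [0, 0, 1, 1, 2, 2])
print(score, 4. / np.sqrt(12. * 6.))  # both ~0.4714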
def entropy(labels):
"""Calculates the entropy for a labeling.
Parameters
----------
labels : int array, shape = [n_samples]
The labels
Notes
-----
The logarithm used is the natural logarithm (base-e).
"""
if len(labels) == 0:
return 1.0
label_idx = np.unique(labels, return_inverse=True)[1]
pi = np.bincount(label_idx).astype(np.float64)
pi = pi[pi > 0]
pi_sum = np.sum(pi)
# log(a / b) should be calculated as log(a) - log(b) to
# avoid possible loss of precision
return -np.sum((pi / pi_sum) * (np.log(pi) - log(pi_sum)))
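# --- Usage sketch (illustrative, not part of the module above) ---
# A 2-to-1 split gives H = -(2/3)*ln(2/3) - (1/3)*ln(1/3) ~= 0.6365; the
# natural logarithm is used throughout this module.
from sklearn.metrics.cluster import entropy

print(entropy([0, 0, 1]))  # ~0.6365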
View file
@ -0,0 +1,363 @@
"""Unsupervised evaluation metrics."""
# Authors: Robert Layton <robertlayton@gmail.com>
# Arnaud Fouchet <foucheta@gmail.com>
# Thierry Guillemot <thierry.guillemot.work@gmail.com>
# License: BSD 3 clause
import functools
import numpy as np
from ...utils import check_random_state
from ...utils import check_X_y
from ...utils import _safe_indexing
from ..pairwise import pairwise_distances_chunked
from ..pairwise import pairwise_distances
from ...preprocessing import LabelEncoder
from ...utils.validation import _deprecate_positional_args
def check_number_of_labels(n_labels, n_samples):
"""Check that number of labels are valid.
Parameters
----------
n_labels : int
Number of labels
n_samples : int
Number of samples
"""
if not 1 < n_labels < n_samples:
raise ValueError("Number of labels is %d. Valid values are 2 "
"to n_samples - 1 (inclusive)" % n_labels)
@_deprecate_positional_args
def silhouette_score(X, labels, *, metric='euclidean', sample_size=None,
random_state=None, **kwds):
"""Compute the mean Silhouette Coefficient of all samples.
The Silhouette Coefficient is calculated using the mean intra-cluster
distance (``a``) and the mean nearest-cluster distance (``b``) for each
sample. The Silhouette Coefficient for a sample is ``(b - a) / max(a,
b)``. To clarify, ``b`` is the distance between a sample and the nearest
cluster that the sample is not a part of.
Note that Silhouette Coefficient is only defined if number of labels
is 2 <= n_labels <= n_samples - 1.
This function returns the mean Silhouette Coefficient over all samples.
To obtain the values for each sample, use :func:`silhouette_samples`.
The best value is 1 and the worst value is -1. Values near 0 indicate
overlapping clusters. Negative values generally indicate that a sample has
been assigned to the wrong cluster, as a different cluster is more similar.
Read more in the :ref:`User Guide <silhouette_coefficient>`.
Parameters
----------
X : array [n_samples_a, n_samples_a] if metric == "precomputed", or, \
[n_samples_a, n_features] otherwise
Array of pairwise distances between samples, or a feature array.
labels : array, shape = [n_samples]
Predicted labels for each sample.
metric : string, or callable
The metric to use when calculating distance between instances in a
feature array. If metric is a string, it must be one of the options
allowed by :func:`metrics.pairwise.pairwise_distances
<sklearn.metrics.pairwise.pairwise_distances>`. If X is the distance
array itself, use ``metric="precomputed"``.
sample_size : int or None
The size of the sample to use when computing the Silhouette Coefficient
on a random subset of the data.
If ``sample_size is None``, no sampling is used.
random_state : int, RandomState instance or None, optional (default=None)
Determines random number generation for selecting a subset of samples.
Used when ``sample_size is not None``.
Pass an int for reproducible results across multiple function calls.
See :term:`Glossary <random_state>`.
**kwds : optional keyword parameters
Any further parameters are passed directly to the distance function.
If using a scipy.spatial.distance metric, the parameters are still
metric dependent. See the scipy docs for usage examples.
Returns
-------
silhouette : float
Mean Silhouette Coefficient for all samples.
References
----------
.. [1] `Peter J. Rousseeuw (1987). "Silhouettes: a Graphical Aid to the
Interpretation and Validation of Cluster Analysis". Computational
and Applied Mathematics 20: 53-65.
<https://www.sciencedirect.com/science/article/pii/0377042787901257>`_
.. [2] `Wikipedia entry on the Silhouette Coefficient
<https://en.wikipedia.org/wiki/Silhouette_(clustering)>`_
"""
if sample_size is not None:
X, labels = check_X_y(X, labels, accept_sparse=['csc', 'csr'])
random_state = check_random_state(random_state)
indices = random_state.permutation(X.shape[0])[:sample_size]
if metric == "precomputed":
X, labels = X[indices].T[indices].T, labels[indices]
else:
X, labels = X[indices], labels[indices]
return np.mean(silhouette_samples(X, labels, metric=metric, **kwds))
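# --- Usage sketch (illustrative, not part of the module above) ---
# ``sample_size`` trades exactness for speed by scoring a random subsample;
# ``random_state`` makes that subsample reproducible.
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score

X, _ = make_blobs(n_samples=500, centers=3, random_state=0)
labels = KMeans(n_clusters=3, random_state=0).fit_predict(X)
print(silhouette_score(X, labels))                                   # full data
print(silhouette_score(X, labels, sample_size=100, random_state=0))  # subsample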
def _silhouette_reduce(D_chunk, start, labels, label_freqs):
"""Accumulate silhouette statistics for vertical chunk of X
Parameters
----------
D_chunk : shape (n_chunk_samples, n_samples)
precomputed distances for a chunk
start : int
first index in chunk
labels : array, shape (n_samples,)
corresponding cluster labels, encoded as {0, ..., n_clusters-1}
label_freqs : array
distribution of cluster labels in ``labels``
"""
# accumulate distances from each sample to each cluster
clust_dists = np.zeros((len(D_chunk), len(label_freqs)),
dtype=D_chunk.dtype)
for i in range(len(D_chunk)):
clust_dists[i] += np.bincount(labels, weights=D_chunk[i],
minlength=len(label_freqs))
# intra_index selects intra-cluster distances within clust_dists
intra_index = (np.arange(len(D_chunk)), labels[start:start + len(D_chunk)])
# intra_clust_dists are averaged over cluster size outside this function
intra_clust_dists = clust_dists[intra_index]
# of the remaining distances we normalise and extract the minimum
clust_dists[intra_index] = np.inf
clust_dists /= label_freqs
inter_clust_dists = clust_dists.min(axis=1)
return intra_clust_dists, inter_clust_dists
@_deprecate_positional_args
def silhouette_samples(X, labels, *, metric='euclidean', **kwds):
"""Compute the Silhouette Coefficient for each sample.
The Silhouette Coefficient is a measure of how well samples are clustered
with samples that are similar to themselves. Clustering models with a high
Silhouette Coefficient are said to be dense, where samples in the same
cluster are similar to each other, and well separated, where samples in
different clusters are not very similar to each other.
The Silhouette Coefficient is calculated using the mean intra-cluster
distance (``a``) and the mean nearest-cluster distance (``b``) for each
sample. The Silhouette Coefficient for a sample is ``(b - a) / max(a,
b)``.
Note that Silhouette Coefficient is only defined if number of labels
is 2 <= n_labels <= n_samples - 1.
This function returns the Silhouette Coefficient for each sample.
The best value is 1 and the worst value is -1. Values near 0 indicate
overlapping clusters.
Read more in the :ref:`User Guide <silhouette_coefficient>`.
Parameters
----------
X : array [n_samples_a, n_samples_a] if metric == "precomputed", or, \
[n_samples_a, n_features] otherwise
Array of pairwise distances between samples, or a feature array.
labels : array, shape = [n_samples]
label values for each sample
metric : string, or callable
The metric to use when calculating distance between instances in a
feature array. If metric is a string, it must be one of the options
allowed by :func:`sklearn.metrics.pairwise.pairwise_distances`. If X is
the distance array itself, use "precomputed" as the metric. Precomputed
distance matrices must have 0 along the diagonal.
`**kwds` : optional keyword parameters
Any further parameters are passed directly to the distance function.
If using a ``scipy.spatial.distance`` metric, the parameters are still
metric dependent. See the scipy docs for usage examples.
Returns
-------
silhouette : array, shape = [n_samples]
Silhouette Coefficient for each sample.
References
----------
.. [1] `Peter J. Rousseeuw (1987). "Silhouettes: a Graphical Aid to the
Interpretation and Validation of Cluster Analysis". Computational
and Applied Mathematics 20: 53-65.
<https://www.sciencedirect.com/science/article/pii/0377042787901257>`_
.. [2] `Wikipedia entry on the Silhouette Coefficient
<https://en.wikipedia.org/wiki/Silhouette_(clustering)>`_
"""
X, labels = check_X_y(X, labels, accept_sparse=['csc', 'csr'])
# Check for non-zero diagonal entries in precomputed distance matrix
if metric == 'precomputed':
atol = np.finfo(X.dtype).eps * 100
if np.any(np.abs(np.diagonal(X)) > atol):
raise ValueError(
'The precomputed distance matrix contains non-zero '
'elements on the diagonal. Use np.fill_diagonal(X, 0).'
)
le = LabelEncoder()
labels = le.fit_transform(labels)
n_samples = len(labels)
label_freqs = np.bincount(labels)
check_number_of_labels(len(le.classes_), n_samples)
kwds['metric'] = metric
reduce_func = functools.partial(_silhouette_reduce,
labels=labels, label_freqs=label_freqs)
results = zip(*pairwise_distances_chunked(X, reduce_func=reduce_func,
**kwds))
intra_clust_dists, inter_clust_dists = results
intra_clust_dists = np.concatenate(intra_clust_dists)
inter_clust_dists = np.concatenate(inter_clust_dists)
denom = (label_freqs - 1).take(labels, mode='clip')
with np.errstate(divide="ignore", invalid="ignore"):
intra_clust_dists /= denom
sil_samples = inter_clust_dists - intra_clust_dists
with np.errstate(divide="ignore", invalid="ignore"):
sil_samples /= np.maximum(intra_clust_dists, inter_clust_dists)
# nan values are for clusters of size 1, and should be 0
return np.nan_to_num(sil_samples)
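# --- Usage sketch (illustrative, not part of the module above) ---
# Per-sample values make the edge-case conventions visible: a singleton
# cluster scores 0, and duplicated points forming their own cluster score 1
# (see the nan handling above).
from sklearn.metrics import silhouette_samples

X = [[0.], [1.], [1.], [2.], [3.], [3.]]
labels = [0, 1, 1, 1, 2, 2]
print(silhouette_samples(X, labels))  # [0., 0.5, 0.5, 0., 1., 1.]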
def calinski_harabasz_score(X, labels):
"""Compute the Calinski and Harabasz score.
It is also known as the Variance Ratio Criterion.
The score is defined as the ratio of the between-cluster dispersion to
the within-cluster dispersion; higher values indicate better-defined clusters.
Read more in the :ref:`User Guide <calinski_harabasz_index>`.
Parameters
----------
X : array-like, shape (``n_samples``, ``n_features``)
List of ``n_features``-dimensional data points. Each row corresponds
to a single data point.
labels : array-like, shape (``n_samples``,)
Predicted labels for each sample.
Returns
-------
score : float
The resulting Calinski-Harabasz score.
References
----------
.. [1] `T. Calinski and J. Harabasz, 1974. "A dendrite method for cluster
analysis". Communications in Statistics
<https://www.tandfonline.com/doi/abs/10.1080/03610927408827101>`_
"""
X, labels = check_X_y(X, labels)
le = LabelEncoder()
labels = le.fit_transform(labels)
n_samples, _ = X.shape
n_labels = len(le.classes_)
check_number_of_labels(n_labels, n_samples)
extra_disp, intra_disp = 0., 0.
mean = np.mean(X, axis=0)
for k in range(n_labels):
cluster_k = X[labels == k]
mean_k = np.mean(cluster_k, axis=0)
extra_disp += len(cluster_k) * np.sum((mean_k - mean) ** 2)
intra_disp += np.sum((cluster_k - mean_k) ** 2)
return (1. if intra_disp == 0. else
extra_disp * (n_samples - n_labels) /
(intra_disp * (n_labels - 1.)))
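# --- Usage sketch (illustrative, not part of the module above) ---
# Higher is better: well-separated blobs score far above a random relabeling
# of the same points.
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.metrics import calinski_harabasz_score

X, y = make_blobs(n_samples=300, centers=3, random_state=0)
rng = np.random.RandomState(0)
print(calinski_harabasz_score(X, y))                   # large
print(calinski_harabasz_score(X, rng.permutation(y)))  # much smaller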
def davies_bouldin_score(X, labels):
"""Computes the Davies-Bouldin score.
The score is defined as the average similarity measure of each cluster with
its most similar cluster, where similarity is the ratio of within-cluster
distances to between-cluster distances. Thus, clusters which are farther
apart and less dispersed will result in a better score.
The minimum score is zero, with lower values indicating better clustering.
Read more in the :ref:`User Guide <davies-bouldin_index>`.
.. versionadded:: 0.20
Parameters
----------
X : array-like, shape (``n_samples``, ``n_features``)
List of ``n_features``-dimensional data points. Each row corresponds
to a single data point.
labels : array-like, shape (``n_samples``,)
Predicted labels for each sample.
Returns
-------
score : float
The resulting Davies-Bouldin score.
References
----------
.. [1] Davies, David L.; Bouldin, Donald W. (1979).
`"A Cluster Separation Measure"
<https://ieeexplore.ieee.org/document/4766909>`__.
IEEE Transactions on Pattern Analysis and Machine Intelligence.
PAMI-1 (2): 224-227
"""
X, labels = check_X_y(X, labels)
le = LabelEncoder()
labels = le.fit_transform(labels)
n_samples, _ = X.shape
n_labels = len(le.classes_)
check_number_of_labels(n_labels, n_samples)
intra_dists = np.zeros(n_labels)
centroids = np.zeros((n_labels, len(X[0])), dtype=float)
for k in range(n_labels):
cluster_k = _safe_indexing(X, labels == k)
centroid = cluster_k.mean(axis=0)
centroids[k] = centroid
intra_dists[k] = np.average(pairwise_distances(
cluster_k, [centroid]))
centroid_distances = pairwise_distances(centroids)
if np.allclose(intra_dists, 0) or np.allclose(centroid_distances, 0):
return 0.0
centroid_distances[centroid_distances == 0] = np.inf
combined_intra_dists = intra_dists[:, None] + intra_dists
scores = np.max(combined_intra_dists / centroid_distances, axis=1)
return np.mean(scores)
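# --- Usage sketch (illustrative, not part of the module above) ---
# Lower is better here (zero is the best possible value), the opposite sense
# of the Calinski-Harabasz score above.
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import davies_bouldin_score

X, _ = make_blobs(n_samples=300, centers=3, random_state=0)
for k in (2, 3, 5):
    labels = KMeans(n_clusters=k, random_state=0).fit_predict(X)
    print(k, davies_bouldin_score(X, labels))  # expect the minimum near k=3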
View file
@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _bicluster # type: ignore
from ...externals._pep562 import Pep562
from ...utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.metrics.cluster.bicluster'
correct_import_path = 'sklearn.metrics.cluster'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_bicluster, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)
View file
@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _expected_mutual_info_fast # type: ignore
from ...externals._pep562 import Pep562
from ...utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.metrics.cluster.expected_mutual_info_fast'
correct_import_path = 'sklearn.metrics.cluster'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_expected_mutual_info_fast, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)
View file
@ -0,0 +1,24 @@
import os
import numpy
from numpy.distutils.misc_util import Configuration
def configuration(parent_package="", top_path=None):
config = Configuration("cluster", parent_package, top_path)
libraries = []
if os.name == 'posix':
libraries.append('m')
config.add_extension("_expected_mutual_info_fast",
sources=["_expected_mutual_info_fast.pyx"],
include_dirs=[numpy.get_include()],
libraries=libraries)
config.add_subpackage("tests")
return config
if __name__ == "__main__":
from numpy.distutils.core import setup
setup(**configuration().todict())
View file
@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _supervised # type: ignore
from ...externals._pep562 import Pep562
from ...utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.metrics.cluster.supervised'
correct_import_path = 'sklearn.metrics.cluster'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_supervised, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)
View file
@ -0,0 +1,50 @@
"""Testing for bicluster metrics module"""
import numpy as np
from sklearn.utils._testing import assert_almost_equal
from sklearn.metrics.cluster._bicluster import _jaccard
from sklearn.metrics import consensus_score
def test_jaccard():
a1 = np.array([True, True, False, False])
a2 = np.array([True, True, True, True])
a3 = np.array([False, True, True, False])
a4 = np.array([False, False, True, True])
assert _jaccard(a1, a1, a1, a1) == 1
assert _jaccard(a1, a1, a2, a2) == 0.25
assert _jaccard(a1, a1, a3, a3) == 1.0 / 7
assert _jaccard(a1, a1, a4, a4) == 0
def test_consensus_score():
a = [[True, True, False, False],
[False, False, True, True]]
b = a[::-1]
assert consensus_score((a, a), (a, a)) == 1
assert consensus_score((a, a), (b, b)) == 1
assert consensus_score((a, b), (a, b)) == 1
assert consensus_score((a, b), (b, a)) == 1
assert consensus_score((a, a), (b, a)) == 0
assert consensus_score((a, a), (a, b)) == 0
assert consensus_score((b, b), (a, b)) == 0
assert consensus_score((b, b), (b, a)) == 0
def test_consensus_score_issue2445():
''' Different number of biclusters in A and B'''
a_rows = np.array([[True, True, False, False],
[False, False, True, True],
[False, False, False, True]])
a_cols = np.array([[True, True, False, False],
[False, False, True, True],
[False, False, False, True]])
idx = [0, 2]
s = consensus_score((a_rows, a_cols), (a_rows[idx], a_cols[idx]))
# B contains 2 of the 3 biclusters in A, so score should be 2/3
assert_almost_equal(s, 2.0/3.0)
View file
@ -0,0 +1,211 @@
from functools import partial
import pytest
import numpy as np
from sklearn.metrics.cluster import adjusted_mutual_info_score
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics.cluster import completeness_score
from sklearn.metrics.cluster import fowlkes_mallows_score
from sklearn.metrics.cluster import homogeneity_score
from sklearn.metrics.cluster import mutual_info_score
from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.metrics.cluster import v_measure_score
from sklearn.metrics.cluster import silhouette_score
from sklearn.metrics.cluster import calinski_harabasz_score
from sklearn.metrics.cluster import davies_bouldin_score
from sklearn.utils._testing import assert_allclose
# Dictionaries of metrics
# ------------------------
# The goal of having those dictionaries is to have an easy way to call a
# particular metric and associate a name to each function:
# - SUPERVISED_METRICS: all supervised cluster metrics - (when given a
# ground truth value)
# - UNSUPERVISED_METRICS: all unsupervised cluster metrics
#
# Those dictionaries will be used to test systematically some invariance
# properties, e.g. invariance to several input layouts.
#
SUPERVISED_METRICS = {
"adjusted_mutual_info_score": adjusted_mutual_info_score,
"adjusted_rand_score": adjusted_rand_score,
"completeness_score": completeness_score,
"homogeneity_score": homogeneity_score,
"mutual_info_score": mutual_info_score,
"normalized_mutual_info_score": normalized_mutual_info_score,
"v_measure_score": v_measure_score,
"fowlkes_mallows_score": fowlkes_mallows_score
}
UNSUPERVISED_METRICS = {
"silhouette_score": silhouette_score,
"silhouette_manhattan": partial(silhouette_score, metric='manhattan'),
"calinski_harabasz_score": calinski_harabasz_score,
"davies_bouldin_score": davies_bouldin_score
}
# Lists of metrics with common properties
# ---------------------------------------
# Lists of metrics with common properties are used to test systematically some
# functionalities and invariance, e.g. SYMMETRIC_METRICS lists all metrics
# that are symmetric with respect to their input argument y_true and y_pred.
#
# --------------------------------------------------------------------
# Symmetric with respect to their input arguments y_true and y_pred.
# Symmetric metrics only apply to supervised clusters.
SYMMETRIC_METRICS = [
"adjusted_rand_score", "v_measure_score",
"mutual_info_score", "adjusted_mutual_info_score",
"normalized_mutual_info_score", "fowlkes_mallows_score"
]
NON_SYMMETRIC_METRICS = ["homogeneity_score", "completeness_score"]
# Metrics whose upper bound is 1
NORMALIZED_METRICS = [
"adjusted_rand_score", "homogeneity_score", "completeness_score",
"v_measure_score", "adjusted_mutual_info_score", "fowlkes_mallows_score",
"normalized_mutual_info_score"
]
rng = np.random.RandomState(0)
y1 = rng.randint(3, size=30)
y2 = rng.randint(3, size=30)
def test_symmetric_non_symmetric_union():
assert (sorted(SYMMETRIC_METRICS + NON_SYMMETRIC_METRICS) ==
sorted(SUPERVISED_METRICS))
# 0.22 AMI and NMI changes
@pytest.mark.filterwarnings('ignore::FutureWarning')
@pytest.mark.parametrize(
'metric_name, y1, y2',
[(name, y1, y2) for name in SYMMETRIC_METRICS]
)
def test_symmetry(metric_name, y1, y2):
metric = SUPERVISED_METRICS[metric_name]
assert metric(y1, y2) == pytest.approx(metric(y2, y1))
@pytest.mark.parametrize(
'metric_name, y1, y2',
[(name, y1, y2) for name in NON_SYMMETRIC_METRICS]
)
def test_non_symmetry(metric_name, y1, y2):
metric = SUPERVISED_METRICS[metric_name]
assert metric(y1, y2) != pytest.approx(metric(y2, y1))
# 0.22 AMI and NMI changes
@pytest.mark.filterwarnings('ignore::FutureWarning')
@pytest.mark.parametrize("metric_name", NORMALIZED_METRICS)
def test_normalized_output(metric_name):
upper_bound_1 = [0, 0, 0, 1, 1, 1]
upper_bound_2 = [0, 0, 0, 1, 1, 1]
metric = SUPERVISED_METRICS[metric_name]
assert metric([0, 0, 0, 1, 1], [0, 0, 0, 1, 2]) > 0.0
assert metric([0, 0, 1, 1, 2], [0, 0, 1, 1, 1]) > 0.0
assert metric([0, 0, 0, 1, 2], [0, 1, 1, 1, 1]) < 1.0
assert metric([0, 0, 0, 1, 2], [0, 1, 1, 1, 1]) < 1.0
assert metric(upper_bound_1, upper_bound_2) == pytest.approx(1.0)
lower_bound_1 = [0, 0, 0, 0, 0, 0]
lower_bound_2 = [0, 1, 2, 3, 4, 5]
score = np.array([metric(lower_bound_1, lower_bound_2),
metric(lower_bound_2, lower_bound_1)])
assert not (score < 0).any()
# 0.22 AMI and NMI changes
@pytest.mark.filterwarnings('ignore::FutureWarning')
@pytest.mark.parametrize(
"metric_name", dict(SUPERVISED_METRICS, **UNSUPERVISED_METRICS)
)
def test_permute_labels(metric_name):
# All clustering metrics are invariant to permutations of the labels,
# e.g. when 0 and 1 are exchanged.
y_label = np.array([0, 0, 0, 1, 1, 0, 1])
y_pred = np.array([1, 0, 1, 0, 1, 1, 0])
if metric_name in SUPERVISED_METRICS:
metric = SUPERVISED_METRICS[metric_name]
score_1 = metric(y_pred, y_label)
assert_allclose(score_1, metric(1 - y_pred, y_label))
assert_allclose(score_1, metric(1 - y_pred, 1 - y_label))
assert_allclose(score_1, metric(y_pred, 1 - y_label))
else:
metric = UNSUPERVISED_METRICS[metric_name]
X = np.random.randint(10, size=(7, 10))
score_1 = metric(X, y_pred)
assert_allclose(score_1, metric(X, 1 - y_pred))
# 0.22 AMI and NMI changes
@pytest.mark.filterwarnings('ignore::FutureWarning')
@pytest.mark.parametrize(
"metric_name", dict(SUPERVISED_METRICS, **UNSUPERVISED_METRICS)
)
# For all clustering metrics, input parameters can be given as arrays or
# lists, with positive, negative or string labels
def test_format_invariance(metric_name):
y_true = [0, 0, 0, 0, 1, 1, 1, 1]
y_pred = [0, 1, 2, 3, 4, 5, 6, 7]
def generate_formats(y):
y = np.array(y)
yield y, 'array of ints'
yield y.tolist(), 'list of ints'
yield [str(x) + "-a" for x in y.tolist()], 'list of strs'
yield (np.array([str(x) + "-a" for x in y.tolist()], dtype=object),
'array of strs')
yield y - 1, 'including negative ints'
yield y + 1, 'strictly positive ints'
if metric_name in SUPERVISED_METRICS:
metric = SUPERVISED_METRICS[metric_name]
score_1 = metric(y_true, y_pred)
y_true_gen = generate_formats(y_true)
y_pred_gen = generate_formats(y_pred)
for (y_true_fmt, fmt_name), (y_pred_fmt, _) in zip(y_true_gen,
y_pred_gen):
assert score_1 == metric(y_true_fmt, y_pred_fmt)
else:
metric = UNSUPERVISED_METRICS[metric_name]
X = np.random.randint(10, size=(8, 10))
score_1 = metric(X, y_true)
assert score_1 == metric(X.astype(float), y_true)
y_true_gen = generate_formats(y_true)
for (y_true_fmt, fmt_name) in y_true_gen:
assert score_1 == metric(X, y_true_fmt)
@pytest.mark.parametrize("metric", SUPERVISED_METRICS.values())
def test_single_sample(metric):
# only the supervised metrics support single sample
for i, j in [(0, 0), (0, 1), (1, 0), (1, 1)]:
metric([i], [j])
@pytest.mark.parametrize(
"metric_name, metric_func",
dict(SUPERVISED_METRICS, **UNSUPERVISED_METRICS).items()
)
def test_inf_nan_input(metric_name, metric_func):
if metric_name in SUPERVISED_METRICS:
invalids = [([0, 1], [np.inf, np.inf]),
([0, 1], [np.nan, np.nan]),
([0, 1], [np.nan, np.inf])]
else:
X = np.random.randint(10, size=(2, 10))
invalids = [(X, [np.inf, np.inf]),
(X, [np.nan, np.nan]),
(X, [np.nan, np.inf])]
with pytest.raises(ValueError, match='contains NaN, infinity'):
for args in invalids:
metric_func(*args)
View file
@ -0,0 +1,358 @@
import numpy as np
import pytest
from sklearn.metrics.cluster import adjusted_mutual_info_score
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics.cluster import completeness_score
from sklearn.metrics.cluster import contingency_matrix
from sklearn.metrics.cluster import entropy
from sklearn.metrics.cluster import expected_mutual_information
from sklearn.metrics.cluster import fowlkes_mallows_score
from sklearn.metrics.cluster import homogeneity_completeness_v_measure
from sklearn.metrics.cluster import homogeneity_score
from sklearn.metrics.cluster import mutual_info_score
from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.metrics.cluster import v_measure_score
from sklearn.metrics.cluster._supervised import _generalized_average
from sklearn.utils import assert_all_finite
from sklearn.utils._testing import (
assert_almost_equal, ignore_warnings)
from numpy.testing import assert_array_almost_equal
score_funcs = [
adjusted_rand_score,
homogeneity_score,
completeness_score,
v_measure_score,
adjusted_mutual_info_score,
normalized_mutual_info_score,
]
@ignore_warnings(category=FutureWarning)
def test_error_messages_on_wrong_input():
for score_func in score_funcs:
expected = (r'Found input variables with inconsistent numbers '
r'of samples: \[2, 3\]')
with pytest.raises(ValueError, match=expected):
score_func([0, 1], [1, 1, 1])
expected = r"labels_true must be 1D: shape is \(2"
with pytest.raises(ValueError, match=expected):
score_func([[0, 1], [1, 0]], [1, 1, 1])
expected = r"labels_pred must be 1D: shape is \(2"
with pytest.raises(ValueError, match=expected):
score_func([0, 1, 0], [[1, 1], [0, 0]])
def test_generalized_average():
a, b = 1, 2
methods = ["min", "geometric", "arithmetic", "max"]
means = [_generalized_average(a, b, method) for method in methods]
assert means[0] <= means[1] <= means[2] <= means[3]
c, d = 12, 12
means = [_generalized_average(c, d, method) for method in methods]
assert means[0] == means[1] == means[2] == means[3]
@ignore_warnings(category=FutureWarning)
def test_perfect_matches():
for score_func in score_funcs:
assert score_func([], []) == pytest.approx(1.0)
assert score_func([0], [1]) == pytest.approx(1.0)
assert score_func([0, 0, 0], [0, 0, 0]) == pytest.approx(1.0)
assert score_func([0, 1, 0], [42, 7, 42]) == pytest.approx(1.0)
assert score_func([0., 1., 0.], [42., 7., 42.]) == pytest.approx(1.0)
assert score_func([0., 1., 2.], [42., 7., 2.]) == pytest.approx(1.0)
assert score_func([0, 1, 2], [42, 7, 2]) == pytest.approx(1.0)
score_funcs_with_changing_means = [
normalized_mutual_info_score,
adjusted_mutual_info_score,
]
means = {"min", "geometric", "arithmetic", "max"}
for score_func in score_funcs_with_changing_means:
for mean in means:
assert score_func([], [], mean) == pytest.approx(1.0)
assert score_func([0], [1], mean) == pytest.approx(1.0)
assert score_func([0, 0, 0], [0, 0, 0], mean) == pytest.approx(1.0)
assert score_func(
[0, 1, 0], [42, 7, 42], mean) == pytest.approx(1.0)
assert score_func(
[0., 1., 0.], [42., 7., 42.], mean) == pytest.approx(1.0)
assert score_func(
[0., 1., 2.], [42., 7., 2.], mean) == pytest.approx(1.0)
assert score_func(
[0, 1, 2], [42, 7, 2], mean) == pytest.approx(1.0)
def test_homogeneous_but_not_complete_labeling():
# homogeneous but not complete clustering
h, c, v = homogeneity_completeness_v_measure(
[0, 0, 0, 1, 1, 1],
[0, 0, 0, 1, 2, 2])
assert_almost_equal(h, 1.00, 2)
assert_almost_equal(c, 0.69, 2)
assert_almost_equal(v, 0.81, 2)
def test_complete_but_not_homogeneous_labeling():
# complete but not homogeneous clustering
h, c, v = homogeneity_completeness_v_measure(
[0, 0, 1, 1, 2, 2],
[0, 0, 1, 1, 1, 1])
assert_almost_equal(h, 0.58, 2)
assert_almost_equal(c, 1.00, 2)
assert_almost_equal(v, 0.73, 2)
def test_not_complete_and_not_homogeneous_labeling():
# neither complete nor homogeneous but not so bad either
h, c, v = homogeneity_completeness_v_measure(
[0, 0, 0, 1, 1, 1],
[0, 1, 0, 1, 2, 2])
assert_almost_equal(h, 0.67, 2)
assert_almost_equal(c, 0.42, 2)
assert_almost_equal(v, 0.52, 2)
def test_beta_parameter():
# test for when beta passed to
# homogeneity_completeness_v_measure
# and v_measure_score
beta_test = 0.2
h_test = 0.67
c_test = 0.42
v_test = ((1 + beta_test) * h_test * c_test
/ (beta_test * h_test + c_test))
h, c, v = homogeneity_completeness_v_measure(
[0, 0, 0, 1, 1, 1],
[0, 1, 0, 1, 2, 2],
beta=beta_test)
assert_almost_equal(h, h_test, 2)
assert_almost_equal(c, c_test, 2)
assert_almost_equal(v, v_test, 2)
v = v_measure_score(
[0, 0, 0, 1, 1, 1],
[0, 1, 0, 1, 2, 2],
beta=beta_test)
assert_almost_equal(v, v_test, 2)
def test_non_consecutive_labels():
# regression tests for labels with gaps
h, c, v = homogeneity_completeness_v_measure(
[0, 0, 0, 2, 2, 2],
[0, 1, 0, 1, 2, 2])
assert_almost_equal(h, 0.67, 2)
assert_almost_equal(c, 0.42, 2)
assert_almost_equal(v, 0.52, 2)
h, c, v = homogeneity_completeness_v_measure(
[0, 0, 0, 1, 1, 1],
[0, 4, 0, 4, 2, 2])
assert_almost_equal(h, 0.67, 2)
assert_almost_equal(c, 0.42, 2)
assert_almost_equal(v, 0.52, 2)
ari_1 = adjusted_rand_score([0, 0, 0, 1, 1, 1], [0, 1, 0, 1, 2, 2])
ari_2 = adjusted_rand_score([0, 0, 0, 1, 1, 1], [0, 4, 0, 4, 2, 2])
assert_almost_equal(ari_1, 0.24, 2)
assert_almost_equal(ari_2, 0.24, 2)
@ignore_warnings(category=FutureWarning)
def uniform_labelings_scores(score_func, n_samples, k_range, n_runs=10,
seed=42):
# Compute score for random uniform cluster labelings
random_labels = np.random.RandomState(seed).randint
scores = np.zeros((len(k_range), n_runs))
for i, k in enumerate(k_range):
for j in range(n_runs):
labels_a = random_labels(low=0, high=k, size=n_samples)
labels_b = random_labels(low=0, high=k, size=n_samples)
scores[i, j] = score_func(labels_a, labels_b)
return scores
@ignore_warnings(category=FutureWarning)
def test_adjustment_for_chance():
# Check that adjusted scores are almost zero on random labels
n_clusters_range = [2, 10, 50, 90]
n_samples = 100
n_runs = 10
scores = uniform_labelings_scores(
adjusted_rand_score, n_samples, n_clusters_range, n_runs)
max_abs_scores = np.abs(scores).max(axis=1)
assert_array_almost_equal(max_abs_scores, [0.02, 0.03, 0.03, 0.02], 2)
def test_adjusted_mutual_info_score():
# Compute the Adjusted Mutual Information and test against known values
labels_a = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3])
labels_b = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2])
# Mutual information
mi = mutual_info_score(labels_a, labels_b)
assert_almost_equal(mi, 0.41022, 5)
# with provided sparse contingency
C = contingency_matrix(labels_a, labels_b, sparse=True)
mi = mutual_info_score(labels_a, labels_b, contingency=C)
assert_almost_equal(mi, 0.41022, 5)
# with provided dense contingency
C = contingency_matrix(labels_a, labels_b)
mi = mutual_info_score(labels_a, labels_b, contingency=C)
assert_almost_equal(mi, 0.41022, 5)
# Expected mutual information
n_samples = C.sum()
emi = expected_mutual_information(C, n_samples)
assert_almost_equal(emi, 0.15042, 5)
# Adjusted mutual information
ami = adjusted_mutual_info_score(labels_a, labels_b)
assert_almost_equal(ami, 0.27821, 5)
ami = adjusted_mutual_info_score([1, 1, 2, 2], [2, 2, 3, 3])
assert ami == pytest.approx(1.0)
# Test with a very large array
a110 = np.array([list(labels_a) * 110]).flatten()
b110 = np.array([list(labels_b) * 110]).flatten()
ami = adjusted_mutual_info_score(a110, b110)
assert_almost_equal(ami, 0.38, 2)
def test_expected_mutual_info_overflow():
# Test for regression where contingency cell exceeds 2**16
# leading to overflow in np.outer, resulting in EMI > 1
assert expected_mutual_information(np.array([[70000]]), 70000) <= 1
def test_int_overflow_mutual_info_fowlkes_mallows_score():
# Test overflow in mutual_info_classif and fowlkes_mallows_score
x = np.array([1] * (52632 + 2529) + [2] * (14660 + 793) + [3] * (3271 +
204) + [4] * (814 + 39) + [5] * (316 + 20))
y = np.array([0] * 52632 + [1] * 2529 + [0] * 14660 + [1] * 793 +
[0] * 3271 + [1] * 204 + [0] * 814 + [1] * 39 + [0] * 316 +
[1] * 20)
assert_all_finite(mutual_info_score(x, y))
assert_all_finite(fowlkes_mallows_score(x, y))
def test_entropy():
ent = entropy([0, 0, 42.])
assert_almost_equal(ent, 0.6365141, 5)
assert_almost_equal(entropy([]), 1)
def test_contingency_matrix():
labels_a = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3])
labels_b = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2])
C = contingency_matrix(labels_a, labels_b)
C2 = np.histogram2d(labels_a, labels_b,
bins=(np.arange(1, 5),
np.arange(1, 5)))[0]
assert_array_almost_equal(C, C2)
C = contingency_matrix(labels_a, labels_b, eps=.1)
assert_array_almost_equal(C, C2 + .1)
def test_contingency_matrix_sparse():
labels_a = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3])
labels_b = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2])
C = contingency_matrix(labels_a, labels_b)
C_sparse = contingency_matrix(labels_a, labels_b, sparse=True).toarray()
assert_array_almost_equal(C, C_sparse)
with pytest.raises(ValueError, match="Cannot set 'eps' when sparse=True"):
contingency_matrix(labels_a, labels_b, eps=1e-10, sparse=True)
@ignore_warnings(category=FutureWarning)
def test_exactly_zero_info_score():
# Check numerical stability when information is exactly zero
for i in np.logspace(1, 4, 4).astype(int):
labels_a, labels_b = (np.ones(i, dtype=int),
np.arange(i, dtype=int))
assert normalized_mutual_info_score(
labels_a, labels_b) == pytest.approx(0.0)
assert v_measure_score(
labels_a, labels_b) == pytest.approx(0.0)
assert adjusted_mutual_info_score(
labels_a, labels_b) == pytest.approx(0.0)
assert normalized_mutual_info_score(
labels_a, labels_b) == pytest.approx(0.0)
for method in ["min", "geometric", "arithmetic", "max"]:
assert adjusted_mutual_info_score(
labels_a, labels_b, method) == pytest.approx(0.0)
assert normalized_mutual_info_score(
labels_a, labels_b, method) == pytest.approx(0.0)
def test_v_measure_and_mutual_information(seed=36):
# Check relation between v_measure, entropy and mutual information
for i in np.logspace(1, 4, 4).astype(int):
random_state = np.random.RandomState(seed)
labels_a, labels_b = (random_state.randint(0, 10, i),
random_state.randint(0, 10, i))
assert_almost_equal(v_measure_score(labels_a, labels_b),
2.0 * mutual_info_score(labels_a, labels_b) /
(entropy(labels_a) + entropy(labels_b)), 0)
avg = 'arithmetic'
assert_almost_equal(v_measure_score(labels_a, labels_b),
normalized_mutual_info_score(labels_a, labels_b,
average_method=avg)
)
def test_fowlkes_mallows_score():
# General case
score = fowlkes_mallows_score([0, 0, 0, 1, 1, 1],
[0, 0, 1, 1, 2, 2])
assert_almost_equal(score, 4. / np.sqrt(12. * 6.))
# Perfect match but where the label names changed
perfect_score = fowlkes_mallows_score([0, 0, 0, 1, 1, 1],
[1, 1, 1, 0, 0, 0])
assert_almost_equal(perfect_score, 1.)
# Worst case
worst_score = fowlkes_mallows_score([0, 0, 0, 0, 0, 0],
[0, 1, 2, 3, 4, 5])
assert_almost_equal(worst_score, 0.)
def test_fowlkes_mallows_score_properties():
# handcrafted example
labels_a = np.array([0, 0, 0, 1, 1, 2])
labels_b = np.array([1, 1, 2, 2, 0, 0])
expected = 1. / np.sqrt((1. + 3.) * (1. + 2.))
# FMI = TP / sqrt((TP + FP) * (TP + FN))
score_original = fowlkes_mallows_score(labels_a, labels_b)
assert_almost_equal(score_original, expected)
# symmetric property
score_symmetric = fowlkes_mallows_score(labels_b, labels_a)
assert_almost_equal(score_symmetric, expected)
# permutation property
score_permuted = fowlkes_mallows_score((labels_a + 1) % 3, labels_b)
assert_almost_equal(score_permuted, expected)
# symmetric and permutation(both together)
score_both = fowlkes_mallows_score(labels_b, (labels_a + 2) % 3)
assert_almost_equal(score_both, expected)
@pytest.mark.parametrize('labels_true, labels_pred', [
(['a'] * 6, [1, 1, 0, 0, 1, 1]),
([1] * 6, [1, 1, 0, 0, 1, 1]),
([1, 1, 0, 0, 1, 1], ['a'] * 6),
([1, 1, 0, 0, 1, 1], [1] * 6),
])
def test_mutual_info_score_positive_constant_label(labels_true, labels_pred):
# non-regression test for #16355
assert mutual_info_score(labels_true, labels_pred) >= 0
View file
@ -0,0 +1,252 @@
import numpy as np
import scipy.sparse as sp
import pytest
from scipy.sparse import csr_matrix
from sklearn import datasets
from sklearn.utils._testing import assert_array_equal
from sklearn.metrics.cluster import silhouette_score
from sklearn.metrics.cluster import silhouette_samples
from sklearn.metrics import pairwise_distances
from sklearn.metrics.cluster import calinski_harabasz_score
from sklearn.metrics.cluster import davies_bouldin_score
def test_silhouette():
# Tests the Silhouette Coefficient.
dataset = datasets.load_iris()
X_dense = dataset.data
X_csr = csr_matrix(X_dense)
X_dok = sp.dok_matrix(X_dense)
X_lil = sp.lil_matrix(X_dense)
y = dataset.target
for X in [X_dense, X_csr, X_dok, X_lil]:
D = pairwise_distances(X, metric='euclidean')
# Given that the actual labels are used, we can assume that S would be
# positive.
score_precomputed = silhouette_score(D, y, metric='precomputed')
assert score_precomputed > 0
# Test without calculating D
score_euclidean = silhouette_score(X, y, metric='euclidean')
assert score_precomputed == pytest.approx(score_euclidean)
if X is X_dense:
score_dense_without_sampling = score_precomputed
else:
assert score_euclidean == pytest.approx(score_dense_without_sampling)
# Test with sampling
score_precomputed = silhouette_score(D, y, metric='precomputed',
sample_size=int(X.shape[0] / 2),
random_state=0)
score_euclidean = silhouette_score(X, y, metric='euclidean',
sample_size=int(X.shape[0] / 2),
random_state=0)
assert score_precomputed > 0
assert score_euclidean > 0
assert score_euclidean == pytest.approx(score_precomputed)
if X is X_dense:
score_dense_with_sampling = score_precomputed
else:
assert score_euclidean == pytest.approx(score_dense_with_sampling)
def test_cluster_size_1():
# Assert Silhouette Coefficient == 0 when there is 1 sample in a cluster
# (cluster 0). We also test the case where there are identical samples
# as the only members of a cluster (cluster 2). To our knowledge, this case
# is not discussed in reference material, and we choose for it a sample
# score of 1.
X = [[0.], [1.], [1.], [2.], [3.], [3.]]
labels = np.array([0, 1, 1, 1, 2, 2])
# Cluster 0: 1 sample -> score of 0 by Rousseeuw's convention
# Cluster 1: intra-cluster = [.5, .5, 1]
# inter-cluster = [1, 1, 1]
# silhouette = [.5, .5, 0]
# Cluster 2: intra-cluster = [0, 0]
# inter-cluster = [arbitrary, arbitrary]
# silhouette = [1., 1.]
silhouette = silhouette_score(X, labels)
assert not np.isnan(silhouette)
ss = silhouette_samples(X, labels)
assert_array_equal(ss, [0, .5, .5, 0, 1, 1])
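# Editor sketch (illustration only): recompute the silhouette of the sample
# x = 2 in cluster 1 from the definition s = (b - a) / max(a, b); it matches
# ss[3] == 0 above.
def _silhouette_by_hand_sketch():
    a = np.mean([abs(2 - 1), abs(2 - 1)])       # mean intra-cluster distance
    b = min(np.mean([abs(2 - 0)]),              # mean distance to cluster 0
            np.mean([abs(2 - 3), abs(2 - 3)]))  # mean distance to cluster 2
    return (b - a) / max(a, b)                  # == 0.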
def test_silhouette_paper_example():
# Explicitly check per-sample results against Rousseeuw (1987)
# Data from Table 1
lower = [5.58,
7.00, 6.50,
7.08, 7.00, 3.83,
4.83, 5.08, 8.17, 5.83,
2.17, 5.75, 6.67, 6.92, 4.92,
6.42, 5.00, 5.58, 6.00, 4.67, 6.42,
3.42, 5.50, 6.42, 6.42, 5.00, 3.92, 6.17,
2.50, 4.92, 6.25, 7.33, 4.50, 2.25, 6.33, 2.75,
6.08, 6.67, 4.25, 2.67, 6.00, 6.17, 6.17, 6.92, 6.17,
5.25, 6.83, 4.50, 3.75, 5.75, 5.42, 6.08, 5.83, 6.67, 3.67,
4.75, 3.00, 6.08, 6.67, 5.00, 5.58, 4.83, 6.17, 5.67, 6.50, 6.92]
D = np.zeros((12, 12))
D[np.tril_indices(12, -1)] = lower
D += D.T
names = ['BEL', 'BRA', 'CHI', 'CUB', 'EGY', 'FRA', 'IND', 'ISR', 'USA',
'USS', 'YUG', 'ZAI']
# Data from Figure 2
labels1 = [1, 1, 2, 2, 1, 1, 2, 1, 1, 2, 2, 1]
expected1 = {'USA': .43, 'BEL': .39, 'FRA': .35, 'ISR': .30, 'BRA': .22,
'EGY': .20, 'ZAI': .19, 'CUB': .40, 'USS': .34, 'CHI': .33,
'YUG': .26, 'IND': -.04}
score1 = .28
# Data from Figure 3
labels2 = [1, 2, 3, 3, 1, 1, 2, 1, 1, 3, 3, 2]
expected2 = {'USA': .47, 'FRA': .44, 'BEL': .42, 'ISR': .37, 'EGY': .02,
'ZAI': .28, 'BRA': .25, 'IND': .17, 'CUB': .48, 'USS': .44,
'YUG': .31, 'CHI': .31}
score2 = .33
for labels, expected, score in [(labels1, expected1, score1),
(labels2, expected2, score2)]:
expected = [expected[name] for name in names]
# we check to 2dp because that's what's in the paper
        assert silhouette_samples(D, np.array(labels),
                                  metric='precomputed') == pytest.approx(
                                      expected, abs=1e-2)
        assert silhouette_score(D, np.array(labels),
                                metric='precomputed') == pytest.approx(
                                    score, abs=1e-2)
def test_correct_labelsize():
# Assert 1 < n_labels < n_samples
dataset = datasets.load_iris()
X = dataset.data
# n_labels = n_samples
y = np.arange(X.shape[0])
err_msg = (r'Number of labels is %d\. Valid values are 2 '
r'to n_samples - 1 \(inclusive\)' % len(np.unique(y)))
with pytest.raises(ValueError, match=err_msg):
silhouette_score(X, y)
# n_labels = 1
y = np.zeros(X.shape[0])
err_msg = (r'Number of labels is %d\. Valid values are 2 '
r'to n_samples - 1 \(inclusive\)' % len(np.unique(y)))
with pytest.raises(ValueError, match=err_msg):
silhouette_score(X, y)
def test_non_encoded_labels():
dataset = datasets.load_iris()
X = dataset.data
labels = dataset.target
assert (
silhouette_score(X, labels * 2 + 10) == silhouette_score(X, labels))
assert_array_equal(
silhouette_samples(X, labels * 2 + 10), silhouette_samples(X, labels))
def test_non_numpy_labels():
dataset = datasets.load_iris()
X = dataset.data
y = dataset.target
assert (
silhouette_score(list(X), list(y)) == silhouette_score(X, y))
@pytest.mark.parametrize('dtype', (np.float32, np.float64))
def test_silhouette_nonzero_diag(dtype):
    # Make sure silhouette_samples requires the diagonal to be zero.
# Non-regression test for #12178
# Construct a zero-diagonal matrix
dists = pairwise_distances(
np.array([[0.2, 0.1, 0.12, 1.34, 1.11, 1.6]], dtype=dtype).T)
labels = [0, 0, 0, 1, 1, 1]
# small values on the diagonal are OK
dists[2][2] = np.finfo(dists.dtype).eps * 10
silhouette_samples(dists, labels, metric='precomputed')
# values bigger than eps * 100 are not
dists[2][2] = np.finfo(dists.dtype).eps * 1000
with pytest.raises(ValueError, match='contains non-zero'):
silhouette_samples(dists, labels, metric='precomputed')
def assert_raises_on_only_one_label(func):
"""Assert message when there is only one label"""
rng = np.random.RandomState(seed=0)
with pytest.raises(ValueError, match="Number of labels is"):
func(rng.rand(10, 2), np.zeros(10))
def assert_raises_on_all_points_same_cluster(func):
"""Assert message when all point are in different clusters"""
rng = np.random.RandomState(seed=0)
with pytest.raises(ValueError, match="Number of labels is"):
func(rng.rand(10, 2), np.arange(10))
def test_calinski_harabasz_score():
assert_raises_on_only_one_label(calinski_harabasz_score)
assert_raises_on_all_points_same_cluster(calinski_harabasz_score)
    # Assert the value is 1. when all samples are equal
assert 1. == calinski_harabasz_score(np.ones((10, 2)),
[0] * 5 + [1] * 5)
    # Assert the value is 0. when all the cluster means are equal
assert 0. == calinski_harabasz_score([[-1, -1], [1, 1]] * 10,
[0] * 10 + [1] * 10)
    # General case (with non-numpy arrays)
X = ([[0, 0], [1, 1]] * 5 + [[3, 3], [4, 4]] * 5 +
[[0, 4], [1, 3]] * 5 + [[3, 1], [4, 0]] * 5)
labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
    assert calinski_harabasz_score(X, labels) == pytest.approx(
        45 * (40 - 4) / (5 * (4 - 1)))
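# Editor sketch (illustration only): the expected value above follows the
# definition CH = (B / (k - 1)) / (W / (n - k)) with n = 40 and k = 4. Each
# cluster has 10 points at squared distance 0.5 from its mean, so W = 20;
# each cluster mean lies at squared distance 4.5 from the grand mean [2, 2],
# so B = 180 and CH = (180 / 3) / (20 / 36) == 45 * (40 - 4) / (5 * (4 - 1)).
def _calinski_harabasz_by_hand_sketch():
    B, W, n, k = 180., 20., 40, 4
    return (B / (k - 1)) / (W / (n - k))    # == 108.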
def test_davies_bouldin_score():
assert_raises_on_only_one_label(davies_bouldin_score)
assert_raises_on_all_points_same_cluster(davies_bouldin_score)
    # Assert the value is 0. when all samples are equal
assert davies_bouldin_score(np.ones((10, 2)),
[0] * 5 + [1] * 5) == pytest.approx(0.0)
    # Assert the value is 0. when all the cluster means are equal
assert davies_bouldin_score([[-1, -1], [1, 1]] * 10,
[0] * 10 + [1] * 10) == pytest.approx(0.0)
    # General case (with non-numpy arrays)
X = ([[0, 0], [1, 1]] * 5 + [[3, 3], [4, 4]] * 5 +
[[0, 4], [1, 3]] * 5 + [[3, 1], [4, 0]] * 5)
labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
    assert davies_bouldin_score(X, labels) == pytest.approx(
        2 * np.sqrt(0.5) / 3)
# Ensure divide by zero warning is not raised in general case
with pytest.warns(None) as record:
davies_bouldin_score(X, labels)
div_zero_warnings = [
warning for warning in record
if "divide by zero encountered" in warning.message.args[0]
]
assert len(div_zero_warnings) == 0
    # General case - some clusters have only one sample
X = ([[0, 0], [2, 2], [3, 3], [5, 5]])
labels = [0, 0, 1, 2]
    assert davies_bouldin_score(X, labels) == pytest.approx((5. / 4) / 3)
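# Editor sketch (illustration only) of the singleton-cluster expectation
# above: the centroids are [1, 1], [3, 3] and [5, 5], and only cluster 0 has
# nonzero dispersion s_0 = sqrt(2). With R_ij = (s_i + s_j) / d_ij the
# per-cluster maxima are 0.5, 0.5 and 0.25, so DB = 1.25 / 3 == (5. / 4) / 3.
def _davies_bouldin_by_hand_sketch():
    s = np.array([np.sqrt(2.), 0., 0.])
    centroids = np.array([[1., 1.], [3., 3.], [5., 5.]])
    d = pairwise_distances(centroids)
    with np.errstate(divide='ignore', invalid='ignore'):
        R = (s[:, None] + s[None, :]) / d
    np.fill_diagonal(R, 0.)      # discard the ill-defined self-similarities
    return R.max(axis=1).mean()  # == (5. / 4) / 3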

View file

@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _unsupervised # type: ignore
from ...externals._pep562 import Pep562
from ...utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.metrics.cluster.unsupervised'
correct_import_path = 'sklearn.metrics.cluster'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_unsupervised, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)
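# Editor note (illustration only, hypothetical package `mypkg`): the shim
# above relies on the PEP 562 module-level __getattr__ (Python >= 3.7), which
# lazily forwards unknown attribute lookups to the private implementation
# module; the Pep562 call above backports the same hook to Python 3.6.
# The minimal pattern looks like:
#
#     # mypkg/legacy.py
#     from . import _impl
#
#     def __getattr__(name):   # called only for attributes not found here
#         return getattr(_impl, name)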

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _pairwise_fast # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.metrics.pairwise_fast'
correct_import_path = 'sklearn.metrics'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_pairwise_fast, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)

View file

@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _ranking # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.metrics.ranking'
correct_import_path = 'sklearn.metrics'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_ranking, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)

View file

@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _regression # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.metrics.regression'
correct_import_path = 'sklearn.metrics'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_regression, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)

View file

@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _scorer # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.metrics.scorer'
correct_import_path = 'sklearn.metrics'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_scorer, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)

View file

@ -0,0 +1,28 @@
import os
from numpy.distutils.misc_util import Configuration
def configuration(parent_package="", top_path=None):
config = Configuration("metrics", parent_package, top_path)
libraries = []
if os.name == 'posix':
libraries.append('m')
config.add_subpackage('_plot')
config.add_subpackage('_plot.tests')
config.add_subpackage('cluster')
config.add_extension("_pairwise_fast",
sources=["_pairwise_fast.pyx"],
libraries=libraries)
config.add_subpackage('tests')
return config
if __name__ == "__main__":
from numpy.distutils.core import setup
setup(**configuration().todict())
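# Editor note (illustration only): with numpy.distutils this configuration is
# typically exercised through the package build, e.g.
#
#     python setup.py build_ext --inplace
#
# which compiles the _pairwise_fast extension declared above and, on POSIX,
# links libm because of the 'm' entry appended to `libraries`.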

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,310 @@
import numpy as np
from numpy.testing import assert_allclose
from itertools import product
import pytest
from sklearn.utils._testing import assert_almost_equal
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import max_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_tweedie_deviance
from sklearn.metrics._regression import _check_reg_targets
from ...exceptions import UndefinedMetricWarning
def test_regression_metrics(n_samples=50):
y_true = np.arange(n_samples)
y_pred = y_true + 1
assert_almost_equal(mean_squared_error(y_true, y_pred), 1.)
assert_almost_equal(mean_squared_log_error(y_true, y_pred),
mean_squared_error(np.log(1 + y_true),
np.log(1 + y_pred)))
assert_almost_equal(mean_absolute_error(y_true, y_pred), 1.)
assert_almost_equal(median_absolute_error(y_true, y_pred), 1.)
assert_almost_equal(max_error(y_true, y_pred), 1.)
assert_almost_equal(r2_score(y_true, y_pred), 0.995, 2)
assert_almost_equal(explained_variance_score(y_true, y_pred), 1.)
assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, power=0),
mean_squared_error(y_true, y_pred))
    # Tweedie deviance needs strictly positive y_pred (except for p=0);
    # p >= 2 additionally needs strictly positive y_true.
# results evaluated by sympy
y_true = np.arange(1, 1 + n_samples)
y_pred = 2 * y_true
n = n_samples
assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, power=-1),
5/12 * n * (n**2 + 2 * n + 1))
assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, power=1),
(n + 1) * (1 - np.log(2)))
assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, power=2),
2 * np.log(2) - 1)
assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, power=3/2),
((6 * np.sqrt(2) - 8) / n) * np.sqrt(y_true).sum())
assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, power=3),
np.sum(1 / y_true) / (4 * n))
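# Editor sketch (illustration only), assuming the standard closed form of the
# unit Tweedie deviance for power p not in {0, 1, 2}; averaging it over the
# arrays above reproduces e.g. the power=3 expectation
# np.sum(1 / y_true) / (4 * n).
def _mean_tweedie_deviance_sketch(y, mu, p):
    y, mu = np.asarray(y, dtype=float), np.asarray(mu, dtype=float)
    dev = 2 * (np.power(y, 2 - p) / ((1 - p) * (2 - p))
               - y * np.power(mu, 1 - p) / (1 - p)
               + np.power(mu, 2 - p) / (2 - p))
    return dev.mean()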
def test_mean_squared_error_multioutput_raw_value_squared():
# non-regression test for
# https://github.com/scikit-learn/scikit-learn/pull/16323
mse1 = mean_squared_error(
[[1]], [[10]], multioutput="raw_values", squared=True
)
mse2 = mean_squared_error(
[[1]], [[10]], multioutput="raw_values", squared=False
)
assert np.sqrt(mse1) == pytest.approx(mse2)
def test_multioutput_regression():
y_true = np.array([[1, 0, 0, 1], [0, 1, 1, 1], [1, 1, 0, 1]])
y_pred = np.array([[0, 0, 0, 1], [1, 0, 1, 1], [0, 0, 0, 1]])
error = mean_squared_error(y_true, y_pred)
assert_almost_equal(error, (1. / 3 + 2. / 3 + 2. / 3) / 4.)
error = mean_squared_error(y_true, y_pred, squared=False)
assert_almost_equal(error, 0.454, decimal=2)
error = mean_squared_log_error(y_true, y_pred)
assert_almost_equal(error, 0.200, decimal=2)
    # mean_absolute_error and mean_squared_error are equal because
    # every per-output error is either 0 or 1 (binary problem).
error = mean_absolute_error(y_true, y_pred)
assert_almost_equal(error, (1. + 2. / 3) / 4.)
error = median_absolute_error(y_true, y_pred)
assert_almost_equal(error, (1. + 1.) / 4.)
error = r2_score(y_true, y_pred, multioutput='variance_weighted')
assert_almost_equal(error, 1. - 5. / 2)
error = r2_score(y_true, y_pred, multioutput='uniform_average')
assert_almost_equal(error, -.875)
def test_regression_metrics_at_limits():
assert_almost_equal(mean_squared_error([0.], [0.]), 0.00, 2)
assert_almost_equal(mean_squared_error([0.], [0.], squared=False), 0.00, 2)
assert_almost_equal(mean_squared_log_error([0.], [0.]), 0.00, 2)
assert_almost_equal(mean_absolute_error([0.], [0.]), 0.00, 2)
assert_almost_equal(median_absolute_error([0.], [0.]), 0.00, 2)
assert_almost_equal(max_error([0.], [0.]), 0.00, 2)
assert_almost_equal(explained_variance_score([0.], [0.]), 1.00, 2)
assert_almost_equal(r2_score([0., 1], [0., 1]), 1.00, 2)
err_msg = ("Mean Squared Logarithmic Error cannot be used when targets "
"contain negative values.")
with pytest.raises(ValueError, match=err_msg):
mean_squared_log_error([-1.], [-1.])
err_msg = ("Mean Squared Logarithmic Error cannot be used when targets "
"contain negative values.")
with pytest.raises(ValueError, match=err_msg):
mean_squared_log_error([1., 2., 3.], [1., -2., 3.])
err_msg = ("Mean Squared Logarithmic Error cannot be used when targets "
"contain negative values.")
with pytest.raises(ValueError, match=err_msg):
mean_squared_log_error([1., -2., 3.], [1., 2., 3.])
# Tweedie deviance error
power = -1.2
assert_allclose(mean_tweedie_deviance([0], [1.], power=power),
2 / (2 - power), rtol=1e-3)
with pytest.raises(ValueError,
match="can only be used on strictly positive y_pred."):
mean_tweedie_deviance([0.], [0.], power=power)
assert_almost_equal(mean_tweedie_deviance([0.], [0.], power=0), 0.00, 2)
msg = "only be used on non-negative y and strictly positive y_pred."
with pytest.raises(ValueError, match=msg):
mean_tweedie_deviance([0.], [0.], power=1.0)
power = 1.5
assert_allclose(mean_tweedie_deviance([0.], [1.], power=power),
2 / (2 - power))
msg = "only be used on non-negative y and strictly positive y_pred."
with pytest.raises(ValueError, match=msg):
mean_tweedie_deviance([0.], [0.], power=power)
power = 2.
assert_allclose(mean_tweedie_deviance([1.], [1.], power=power), 0.00,
atol=1e-8)
msg = "can only be used on strictly positive y and y_pred."
with pytest.raises(ValueError, match=msg):
mean_tweedie_deviance([0.], [0.], power=power)
power = 3.
assert_allclose(mean_tweedie_deviance([1.], [1.], power=power),
0.00, atol=1e-8)
msg = "can only be used on strictly positive y and y_pred."
with pytest.raises(ValueError, match=msg):
mean_tweedie_deviance([0.], [0.], power=power)
with pytest.raises(ValueError,
match="is only defined for power<=0 and power>=1"):
mean_tweedie_deviance([0.], [0.], power=0.5)
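# Editor sketch (illustration only): for the tested powers -1.2 and 1.5 with
# y == 0 and y_pred == 1, the y-terms of the unit deviance vanish and only
# the mu-term survives, so the deviance reduces to
# 2 * 1 ** (2 - power) / (2 - power), i.e. the 2 / (2 - power) asserted above.
def _tweedie_zero_y_limit_sketch(power):
    return 2 * 1 ** (2 - power) / (2 - power)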
def test__check_reg_targets():
# All of length 3
EXAMPLES = [
("continuous", [1, 2, 3], 1),
("continuous", [[1], [2], [3]], 1),
("continuous-multioutput", [[1, 1], [2, 2], [3, 1]], 2),
("continuous-multioutput", [[5, 1], [4, 2], [3, 1]], 2),
("continuous-multioutput", [[1, 3, 4], [2, 2, 2], [3, 1, 1]], 3),
]
for (type1, y1, n_out1), (type2, y2, n_out2) in product(EXAMPLES,
repeat=2):
if type1 == type2 and n_out1 == n_out2:
y_type, y_check1, y_check2, multioutput = _check_reg_targets(
y1, y2, None)
assert type1 == y_type
if type1 == 'continuous':
assert_array_equal(y_check1, np.reshape(y1, (-1, 1)))
assert_array_equal(y_check2, np.reshape(y2, (-1, 1)))
else:
assert_array_equal(y_check1, y1)
assert_array_equal(y_check2, y2)
else:
with pytest.raises(ValueError):
_check_reg_targets(y1, y2, None)
def test__check_reg_targets_exception():
invalid_multioutput = 'this_value_is_not_valid'
expected_message = ("Allowed 'multioutput' string values are.+"
"You provided multioutput={!r}".format(
invalid_multioutput))
with pytest.raises(ValueError, match=expected_message):
_check_reg_targets([1, 2, 3], [[1], [2], [3]], invalid_multioutput)
def test_regression_multioutput_array():
y_true = [[1, 2], [2.5, -1], [4.5, 3], [5, 7]]
y_pred = [[1, 1], [2, -1], [5, 4], [5, 6.5]]
mse = mean_squared_error(y_true, y_pred, multioutput='raw_values')
mae = mean_absolute_error(y_true, y_pred, multioutput='raw_values')
r = r2_score(y_true, y_pred, multioutput='raw_values')
evs = explained_variance_score(y_true, y_pred, multioutput='raw_values')
assert_array_almost_equal(mse, [0.125, 0.5625], decimal=2)
assert_array_almost_equal(mae, [0.25, 0.625], decimal=2)
assert_array_almost_equal(r, [0.95, 0.93], decimal=2)
assert_array_almost_equal(evs, [0.95, 0.93], decimal=2)
    # mean_absolute_error and mean_squared_error are equal because
    # every per-output error is either 0 or 1 (binary problem).
y_true = [[0, 0]]*4
y_pred = [[1, 1]]*4
mse = mean_squared_error(y_true, y_pred, multioutput='raw_values')
mae = mean_absolute_error(y_true, y_pred, multioutput='raw_values')
r = r2_score(y_true, y_pred, multioutput='raw_values')
assert_array_almost_equal(mse, [1., 1.], decimal=2)
assert_array_almost_equal(mae, [1., 1.], decimal=2)
assert_array_almost_equal(r, [0., 0.], decimal=2)
r = r2_score([[0, -1], [0, 1]], [[2, 2], [1, 1]], multioutput='raw_values')
assert_array_almost_equal(r, [0, -3.5], decimal=2)
assert np.mean(r) == r2_score([[0, -1], [0, 1]], [[2, 2], [1, 1]],
multioutput='uniform_average')
evs = explained_variance_score([[0, -1], [0, 1]], [[2, 2], [1, 1]],
multioutput='raw_values')
assert_array_almost_equal(evs, [0, -1.25], decimal=2)
    # Checking for the condition in which both the numerator and the
    # denominator are zero.
y_true = [[1, 3], [-1, 2]]
y_pred = [[1, 4], [-1, 1]]
r2 = r2_score(y_true, y_pred, multioutput='raw_values')
assert_array_almost_equal(r2, [1., -3.], decimal=2)
assert np.mean(r2) == r2_score(y_true, y_pred,
multioutput='uniform_average')
evs = explained_variance_score(y_true, y_pred, multioutput='raw_values')
assert_array_almost_equal(evs, [1., -3.], decimal=2)
assert np.mean(evs) == explained_variance_score(y_true, y_pred)
# Handling msle separately as it does not accept negative inputs.
y_true = np.array([[0.5, 1], [1, 2], [7, 6]])
y_pred = np.array([[0.5, 2], [1, 2.5], [8, 8]])
msle = mean_squared_log_error(y_true, y_pred, multioutput='raw_values')
msle2 = mean_squared_error(np.log(1 + y_true), np.log(1 + y_pred),
multioutput='raw_values')
assert_array_almost_equal(msle, msle2, decimal=2)
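# Editor sketch (illustration only): the [0., -3.5] expectation above, column
# by column. For column 0, ss_tot == 0 while ss_res > 0, which r2_score maps
# to 0. by convention; for column 1, R^2 = 1 - ss_res / ss_tot = 1 - 9 / 2.
def _r2_raw_values_sketch():
    y_true = np.array([[0, -1], [0, 1]], dtype=float)
    y_pred = np.array([[2, 2], [1, 1]], dtype=float)
    ss_res = ((y_true - y_pred) ** 2).sum(axis=0)                # [5., 9.]
    ss_tot = ((y_true - y_true.mean(axis=0)) ** 2).sum(axis=0)   # [0., 2.]
    return ss_res, ss_tot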
def test_regression_custom_weights():
y_true = [[1, 2], [2.5, -1], [4.5, 3], [5, 7]]
y_pred = [[1, 1], [2, -1], [5, 4], [5, 6.5]]
msew = mean_squared_error(y_true, y_pred, multioutput=[0.4, 0.6])
rmsew = mean_squared_error(y_true, y_pred, multioutput=[0.4, 0.6],
squared=False)
maew = mean_absolute_error(y_true, y_pred, multioutput=[0.4, 0.6])
rw = r2_score(y_true, y_pred, multioutput=[0.4, 0.6])
evsw = explained_variance_score(y_true, y_pred, multioutput=[0.4, 0.6])
assert_almost_equal(msew, 0.39, decimal=2)
assert_almost_equal(rmsew, 0.59, decimal=2)
assert_almost_equal(maew, 0.475, decimal=3)
assert_almost_equal(rw, 0.94, decimal=2)
assert_almost_equal(evsw, 0.94, decimal=2)
# Handling msle separately as it does not accept negative inputs.
y_true = np.array([[0.5, 1], [1, 2], [7, 6]])
y_pred = np.array([[0.5, 2], [1, 2.5], [8, 8]])
msle = mean_squared_log_error(y_true, y_pred, multioutput=[0.3, 0.7])
msle2 = mean_squared_error(np.log(1 + y_true), np.log(1 + y_pred),
multioutput=[0.3, 0.7])
assert_almost_equal(msle, msle2, decimal=2)
@pytest.mark.parametrize('metric', [r2_score])
def test_regression_single_sample(metric):
y_true = [0]
y_pred = [1]
warning_msg = 'not well-defined with less than two samples.'
# Trigger the warning
with pytest.warns(UndefinedMetricWarning, match=warning_msg):
score = metric(y_true, y_pred)
assert np.isnan(score)
def test_tweedie_deviance_continuity():
n_samples = 100
y_true = np.random.RandomState(0).rand(n_samples) + 0.1
y_pred = np.random.RandomState(1).rand(n_samples) + 0.1
assert_allclose(mean_tweedie_deviance(y_true, y_pred, power=0 - 1e-10),
mean_tweedie_deviance(y_true, y_pred, power=0))
    # As we get closer to the limit, e.g. with a 1e-12 difference in power,
    # the absolute tolerance needed to pass the checks below increases. There
    # are likely numerical precision issues on the edges of the different
    # definition regions.
assert_allclose(mean_tweedie_deviance(y_true, y_pred, power=1 + 1e-10),
mean_tweedie_deviance(y_true, y_pred, power=1),
atol=1e-6)
assert_allclose(mean_tweedie_deviance(y_true, y_pred, power=2 - 1e-10),
mean_tweedie_deviance(y_true, y_pred, power=2),
atol=1e-6)
assert_allclose(mean_tweedie_deviance(y_true, y_pred, power=2 + 1e-10),
mean_tweedie_deviance(y_true, y_pred, power=2),
atol=1e-6)

View file

@ -0,0 +1,721 @@
import pickle
import tempfile
import shutil
import os
import numbers
from unittest.mock import Mock
from functools import partial
import numpy as np
import pytest
import joblib
from numpy.testing import assert_allclose
from sklearn.utils._testing import assert_almost_equal
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import ignore_warnings
from sklearn.base import BaseEstimator
from sklearn.metrics import (f1_score, r2_score, roc_auc_score, fbeta_score,
log_loss, precision_score, recall_score,
jaccard_score)
from sklearn.metrics import cluster as cluster_module
from sklearn.metrics import check_scoring
from sklearn.metrics._scorer import (_PredictScorer, _passthrough_scorer,
_MultimetricScorer,
_check_multimetric_scoring)
from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer, get_scorer, SCORERS
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.cluster import KMeans
from sklearn.linear_model import Ridge, LogisticRegression, Perceptron
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.datasets import make_blobs
from sklearn.datasets import make_classification
from sklearn.datasets import make_multilabel_classification
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
REGRESSION_SCORERS = ['explained_variance', 'r2',
'neg_mean_absolute_error', 'neg_mean_squared_error',
'neg_mean_squared_log_error',
'neg_median_absolute_error',
'neg_root_mean_squared_error',
'mean_absolute_error',
'mean_squared_error', 'median_absolute_error',
'max_error', 'neg_mean_poisson_deviance',
'neg_mean_gamma_deviance']
CLF_SCORERS = ['accuracy', 'balanced_accuracy',
'f1', 'f1_weighted', 'f1_macro', 'f1_micro',
'roc_auc', 'average_precision', 'precision',
'precision_weighted', 'precision_macro', 'precision_micro',
'recall', 'recall_weighted', 'recall_macro', 'recall_micro',
'neg_log_loss', 'log_loss', 'neg_brier_score',
'jaccard', 'jaccard_weighted', 'jaccard_macro',
'jaccard_micro', 'roc_auc_ovr', 'roc_auc_ovo',
'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted']
# All supervised cluster scorers (they behave like classification metrics)
CLUSTER_SCORERS = ["adjusted_rand_score",
"homogeneity_score",
"completeness_score",
"v_measure_score",
"mutual_info_score",
"adjusted_mutual_info_score",
"normalized_mutual_info_score",
"fowlkes_mallows_score"]
MULTILABEL_ONLY_SCORERS = ['precision_samples', 'recall_samples', 'f1_samples',
'jaccard_samples']
REQUIRE_POSITIVE_Y_SCORERS = ['neg_mean_poisson_deviance',
'neg_mean_gamma_deviance']
def _require_positive_y(y):
"""Make targets strictly positive"""
offset = abs(y.min()) + 1
y = y + offset
return y
def _make_estimators(X_train, y_train, y_ml_train):
# Make estimators that make sense to test various scoring methods
sensible_regr = DecisionTreeRegressor(random_state=0)
    # some of the regression scorers require strictly positive input.
sensible_regr.fit(X_train, y_train + 1)
sensible_clf = DecisionTreeClassifier(random_state=0)
sensible_clf.fit(X_train, y_train)
sensible_ml_clf = DecisionTreeClassifier(random_state=0)
sensible_ml_clf.fit(X_train, y_ml_train)
return dict(
[(name, sensible_regr) for name in REGRESSION_SCORERS] +
[(name, sensible_clf) for name in CLF_SCORERS] +
[(name, sensible_clf) for name in CLUSTER_SCORERS] +
[(name, sensible_ml_clf) for name in MULTILABEL_ONLY_SCORERS]
)
X_mm, y_mm, y_ml_mm = None, None, None
ESTIMATORS = None
TEMP_FOLDER = None
def setup_module():
# Create some memory mapped data
global X_mm, y_mm, y_ml_mm, TEMP_FOLDER, ESTIMATORS
TEMP_FOLDER = tempfile.mkdtemp(prefix='sklearn_test_score_objects_')
X, y = make_classification(n_samples=30, n_features=5, random_state=0)
_, y_ml = make_multilabel_classification(n_samples=X.shape[0],
random_state=0)
filename = os.path.join(TEMP_FOLDER, 'test_data.pkl')
joblib.dump((X, y, y_ml), filename)
X_mm, y_mm, y_ml_mm = joblib.load(filename, mmap_mode='r')
ESTIMATORS = _make_estimators(X_mm, y_mm, y_ml_mm)
def teardown_module():
global X_mm, y_mm, y_ml_mm, TEMP_FOLDER, ESTIMATORS
# GC closes the mmap file descriptors
X_mm, y_mm, y_ml_mm, ESTIMATORS = None, None, None, None
shutil.rmtree(TEMP_FOLDER)
class EstimatorWithoutFit:
"""Dummy estimator to test scoring validators"""
pass
class EstimatorWithFit(BaseEstimator):
"""Dummy estimator to test scoring validators"""
def fit(self, X, y):
return self
class EstimatorWithFitAndScore:
"""Dummy estimator to test scoring validators"""
def fit(self, X, y):
return self
def score(self, X, y):
return 1.0
class EstimatorWithFitAndPredict:
"""Dummy estimator to test scoring validators"""
def fit(self, X, y):
self.y = y
return self
def predict(self, X):
return self.y
class DummyScorer:
"""Dummy scorer that always returns 1."""
def __call__(self, est, X, y):
return 1
def test_all_scorers_repr():
# Test that all scorers have a working repr
for name, scorer in SCORERS.items():
repr(scorer)
def check_scoring_validator_for_single_metric_usecases(scoring_validator):
    # Test all branches of single-metric use cases
estimator = EstimatorWithoutFit()
pattern = (r"estimator should be an estimator implementing 'fit' method,"
r" .* was passed")
with pytest.raises(TypeError, match=pattern):
scoring_validator(estimator)
estimator = EstimatorWithFitAndScore()
estimator.fit([[1]], [1])
scorer = scoring_validator(estimator)
assert scorer is _passthrough_scorer
assert_almost_equal(scorer(estimator, [[1]], [1]), 1.0)
estimator = EstimatorWithFitAndPredict()
estimator.fit([[1]], [1])
pattern = (r"If no scoring is specified, the estimator passed should have"
r" a 'score' method\. The estimator .* does not\.")
with pytest.raises(TypeError, match=pattern):
scoring_validator(estimator)
scorer = scoring_validator(estimator, scoring="accuracy")
assert_almost_equal(scorer(estimator, [[1]], [1]), 1.0)
estimator = EstimatorWithFit()
scorer = scoring_validator(estimator, scoring="accuracy")
assert isinstance(scorer, _PredictScorer)
# Test the allow_none parameter for check_scoring alone
if scoring_validator is check_scoring:
estimator = EstimatorWithFit()
scorer = scoring_validator(estimator, allow_none=True)
assert scorer is None
def check_multimetric_scoring_single_metric_wrapper(*args, **kwargs):
    # This wraps _check_multimetric_scoring to accept a single-metric
    # scoring parameter, so the tests written for check_scoring can also be
    # run against check_multimetric_scoring for single-metric use cases.
scorers, is_multi = _check_multimetric_scoring(*args, **kwargs)
# For all single metric use cases, it should register as not multimetric
assert not is_multi
if args[0] is not None:
assert scorers is not None
names, scorers = zip(*scorers.items())
assert len(scorers) == 1
assert names[0] == 'score'
scorers = scorers[0]
return scorers
def test_check_scoring_and_check_multimetric_scoring():
check_scoring_validator_for_single_metric_usecases(check_scoring)
    # Make sure check_scoring is correctly applied to the constituent
    # scorers
check_scoring_validator_for_single_metric_usecases(
check_multimetric_scoring_single_metric_wrapper)
# For multiple metric use cases
# Make sure it works for the valid cases
for scoring in (('accuracy',), ['precision'],
{'acc': 'accuracy', 'precision': 'precision'},
('accuracy', 'precision'), ['precision', 'accuracy'],
{'accuracy': make_scorer(accuracy_score),
'precision': make_scorer(precision_score)}):
estimator = LinearSVC(random_state=0)
estimator.fit([[1], [2], [3]], [1, 1, 0])
scorers, is_multi = _check_multimetric_scoring(estimator, scoring)
assert is_multi
assert isinstance(scorers, dict)
assert sorted(scorers.keys()) == sorted(list(scoring))
assert all([isinstance(scorer, _PredictScorer)
for scorer in list(scorers.values())])
if 'acc' in scoring:
assert_almost_equal(scorers['acc'](
estimator, [[1], [2], [3]], [1, 0, 0]), 2. / 3.)
if 'accuracy' in scoring:
assert_almost_equal(scorers['accuracy'](
estimator, [[1], [2], [3]], [1, 0, 0]), 2. / 3.)
if 'precision' in scoring:
assert_almost_equal(scorers['precision'](
estimator, [[1], [2], [3]], [1, 0, 0]), 0.5)
estimator = EstimatorWithFitAndPredict()
estimator.fit([[1]], [1])
# Make sure it raises errors when scoring parameter is not valid.
# More weird corner cases are tested at test_validation.py
error_message_regexp = ".*must be unique strings.*"
for scoring in ((make_scorer(precision_score), # Tuple of callables
make_scorer(accuracy_score)), [5],
(make_scorer(precision_score),), (), ('f1', 'f1')):
with pytest.raises(ValueError, match=error_message_regexp):
_check_multimetric_scoring(estimator, scoring=scoring)
def test_check_scoring_gridsearchcv():
# test that check_scoring works on GridSearchCV and pipeline.
# slightly redundant non-regression test.
grid = GridSearchCV(LinearSVC(), param_grid={'C': [.1, 1]}, cv=3)
scorer = check_scoring(grid, scoring="f1")
assert isinstance(scorer, _PredictScorer)
pipe = make_pipeline(LinearSVC())
scorer = check_scoring(pipe, scoring="f1")
assert isinstance(scorer, _PredictScorer)
# check that cross_val_score definitely calls the scorer
# and doesn't make any assumptions about the estimator apart from having a
# fit.
scores = cross_val_score(EstimatorWithFit(), [[1], [2], [3]], [1, 0, 1],
scoring=DummyScorer(), cv=3)
assert_array_equal(scores, 1)
def test_make_scorer():
# Sanity check on the make_scorer factory function.
f = lambda *args: 0
with pytest.raises(ValueError):
make_scorer(f, needs_threshold=True, needs_proba=True)
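# Editor sketch (illustration only): needs_proba and needs_threshold are
# mutually exclusive above; a valid custom scorer picks at most one, e.g. a
# sign-flipped log-loss scorer consuming predict_proba, which mirrors the
# built-in 'neg_log_loss'.
def _make_scorer_usage_sketch():
    return make_scorer(log_loss, greater_is_better=False, needs_proba=True)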
def test_classification_scores():
# Test classification scorers.
X, y = make_blobs(random_state=0, centers=2)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf = LinearSVC(random_state=0)
clf.fit(X_train, y_train)
for prefix, metric in [('f1', f1_score), ('precision', precision_score),
('recall', recall_score),
('jaccard', jaccard_score)]:
score1 = get_scorer('%s_weighted' % prefix)(clf, X_test, y_test)
score2 = metric(y_test, clf.predict(X_test), pos_label=None,
average='weighted')
assert_almost_equal(score1, score2)
score1 = get_scorer('%s_macro' % prefix)(clf, X_test, y_test)
score2 = metric(y_test, clf.predict(X_test), pos_label=None,
average='macro')
assert_almost_equal(score1, score2)
score1 = get_scorer('%s_micro' % prefix)(clf, X_test, y_test)
score2 = metric(y_test, clf.predict(X_test), pos_label=None,
average='micro')
assert_almost_equal(score1, score2)
score1 = get_scorer('%s' % prefix)(clf, X_test, y_test)
score2 = metric(y_test, clf.predict(X_test), pos_label=1)
assert_almost_equal(score1, score2)
# test fbeta score that takes an argument
scorer = make_scorer(fbeta_score, beta=2)
score1 = scorer(clf, X_test, y_test)
score2 = fbeta_score(y_test, clf.predict(X_test), beta=2)
assert_almost_equal(score1, score2)
# test that custom scorer can be pickled
unpickled_scorer = pickle.loads(pickle.dumps(scorer))
score3 = unpickled_scorer(clf, X_test, y_test)
assert_almost_equal(score1, score3)
# smoke test the repr:
repr(fbeta_score)
def test_regression_scorers():
# Test regression scorers.
diabetes = load_diabetes()
X, y = diabetes.data, diabetes.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf = Ridge()
clf.fit(X_train, y_train)
score1 = get_scorer('r2')(clf, X_test, y_test)
score2 = r2_score(y_test, clf.predict(X_test))
assert_almost_equal(score1, score2)
def test_thresholded_scorers():
# Test scorers that take thresholds.
X, y = make_blobs(random_state=0, centers=2)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf = LogisticRegression(random_state=0)
clf.fit(X_train, y_train)
score1 = get_scorer('roc_auc')(clf, X_test, y_test)
score2 = roc_auc_score(y_test, clf.decision_function(X_test))
score3 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
assert_almost_equal(score1, score2)
assert_almost_equal(score1, score3)
logscore = get_scorer('neg_log_loss')(clf, X_test, y_test)
logloss = log_loss(y_test, clf.predict_proba(X_test))
assert_almost_equal(-logscore, logloss)
# same for an estimator without decision_function
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
score1 = get_scorer('roc_auc')(clf, X_test, y_test)
score2 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
assert_almost_equal(score1, score2)
# test with a regressor (no decision_function)
reg = DecisionTreeRegressor()
reg.fit(X_train, y_train)
score1 = get_scorer('roc_auc')(reg, X_test, y_test)
score2 = roc_auc_score(y_test, reg.predict(X_test))
assert_almost_equal(score1, score2)
# Test that an exception is raised on more than two classes
X, y = make_blobs(random_state=0, centers=3)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf.fit(X_train, y_train)
with pytest.raises(ValueError, match="multiclass format is not supported"):
get_scorer('roc_auc')(clf, X_test, y_test)
# test error is raised with a single class present in model
# (predict_proba shape is not suitable for binary auc)
X, y = make_blobs(random_state=0, centers=2)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf = DecisionTreeClassifier()
clf.fit(X_train, np.zeros_like(y_train))
with pytest.raises(ValueError, match="need classifier with two classes"):
get_scorer('roc_auc')(clf, X_test, y_test)
# for proba scorers
with pytest.raises(ValueError, match="need classifier with two classes"):
get_scorer('neg_log_loss')(clf, X_test, y_test)
def test_thresholded_scorers_multilabel_indicator_data():
    # Test that the scorers work with the multilabel-indicator format
    # for multilabel and multi-output multi-class classifiers
X, y = make_multilabel_classification(allow_unlabeled=False,
random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
# Multi-output multi-class predict_proba
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_proba = clf.predict_proba(X_test)
score1 = get_scorer('roc_auc')(clf, X_test, y_test)
score2 = roc_auc_score(y_test, np.vstack([p[:, -1] for p in y_proba]).T)
assert_almost_equal(score1, score2)
# Multi-output multi-class decision_function
# TODO Is there any yet?
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
clf._predict_proba = clf.predict_proba
clf.predict_proba = None
clf.decision_function = lambda X: [p[:, 1] for p in clf._predict_proba(X)]
y_proba = clf.decision_function(X_test)
score1 = get_scorer('roc_auc')(clf, X_test, y_test)
score2 = roc_auc_score(y_test, np.vstack([p for p in y_proba]).T)
assert_almost_equal(score1, score2)
# Multilabel predict_proba
clf = OneVsRestClassifier(DecisionTreeClassifier())
clf.fit(X_train, y_train)
score1 = get_scorer('roc_auc')(clf, X_test, y_test)
score2 = roc_auc_score(y_test, clf.predict_proba(X_test))
assert_almost_equal(score1, score2)
# Multilabel decision function
clf = OneVsRestClassifier(LinearSVC(random_state=0))
clf.fit(X_train, y_train)
score1 = get_scorer('roc_auc')(clf, X_test, y_test)
score2 = roc_auc_score(y_test, clf.decision_function(X_test))
assert_almost_equal(score1, score2)
def test_supervised_cluster_scorers():
# Test clustering scorers against gold standard labeling.
X, y = make_blobs(random_state=0, centers=2)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
km = KMeans(n_clusters=3)
km.fit(X_train)
for name in CLUSTER_SCORERS:
score1 = get_scorer(name)(km, X_test, y_test)
score2 = getattr(cluster_module, name)(y_test, km.predict(X_test))
assert_almost_equal(score1, score2)
@ignore_warnings
def test_raises_on_score_list():
# Test that when a list of scores is returned, we raise proper errors.
X, y = make_blobs(random_state=0)
f1_scorer_no_average = make_scorer(f1_score, average=None)
clf = DecisionTreeClassifier()
with pytest.raises(ValueError):
cross_val_score(clf, X, y, scoring=f1_scorer_no_average)
grid_search = GridSearchCV(clf, scoring=f1_scorer_no_average,
param_grid={'max_depth': [1, 2]})
with pytest.raises(ValueError):
grid_search.fit(X, y)
@ignore_warnings
def test_scorer_sample_weight():
# Test that scorers support sample_weight or raise sensible errors
    # Unlike in the metrics invariance test, in the scorer case it is harder
    # to ensure that weighted and unweighted scores on the classifier output
    # are genuinely unequal.
X, y = make_classification(random_state=0)
_, y_ml = make_multilabel_classification(n_samples=X.shape[0],
random_state=0)
split = train_test_split(X, y, y_ml, random_state=0)
X_train, X_test, y_train, y_test, y_ml_train, y_ml_test = split
sample_weight = np.ones_like(y_test)
sample_weight[:10] = 0
# get sensible estimators for each metric
estimator = _make_estimators(X_train, y_train, y_ml_train)
for name, scorer in SCORERS.items():
if name in MULTILABEL_ONLY_SCORERS:
target = y_ml_test
else:
target = y_test
if name in REQUIRE_POSITIVE_Y_SCORERS:
target = _require_positive_y(target)
try:
weighted = scorer(estimator[name], X_test, target,
sample_weight=sample_weight)
ignored = scorer(estimator[name], X_test[10:], target[10:])
unweighted = scorer(estimator[name], X_test, target)
assert weighted != unweighted, (
"scorer {0} behaves identically when "
"called with sample weights: {1} vs "
"{2}".format(name, weighted, unweighted))
assert_almost_equal(weighted, ignored,
err_msg="scorer {0} behaves differently when "
"ignoring samples and setting sample_weight to"
" 0: {1} vs {2}".format(name, weighted,
ignored))
except TypeError as e:
assert "sample_weight" in str(e), (
"scorer {0} raises unhelpful exception when called "
"with sample weights: {1}".format(name, str(e)))
@pytest.mark.parametrize('name', SCORERS)
def test_scorer_memmap_input(name):
# Non-regression test for #6147: some score functions would
# return singleton memmap when computed on memmap data instead of scalar
# float values.
if name in REQUIRE_POSITIVE_Y_SCORERS:
y_mm_1 = _require_positive_y(y_mm)
y_ml_mm_1 = _require_positive_y(y_ml_mm)
else:
y_mm_1, y_ml_mm_1 = y_mm, y_ml_mm
# UndefinedMetricWarning for P / R scores
with ignore_warnings():
scorer, estimator = SCORERS[name], ESTIMATORS[name]
if name in MULTILABEL_ONLY_SCORERS:
score = scorer(estimator, X_mm, y_ml_mm_1)
else:
score = scorer(estimator, X_mm, y_mm_1)
assert isinstance(score, numbers.Number), name
def test_scoring_is_not_metric():
with pytest.raises(ValueError, match='make_scorer'):
check_scoring(LogisticRegression(), scoring=f1_score)
with pytest.raises(ValueError, match='make_scorer'):
check_scoring(LogisticRegression(), scoring=roc_auc_score)
with pytest.raises(ValueError, match='make_scorer'):
check_scoring(Ridge(), scoring=r2_score)
with pytest.raises(ValueError, match='make_scorer'):
check_scoring(KMeans(), scoring=cluster_module.adjusted_rand_score)
def test_deprecated_scorer():
X, y = make_blobs(random_state=0, centers=2)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
deprecated_scorer = get_scorer('brier_score_loss')
with pytest.warns(FutureWarning):
deprecated_scorer(clf, X_test, y_test)
@pytest.mark.parametrize(
("scorers,expected_predict_count,"
"expected_predict_proba_count,expected_decision_func_count"),
[({'a1': 'accuracy', 'a2': 'accuracy',
'll1': 'neg_log_loss', 'll2': 'neg_log_loss',
'ra1': 'roc_auc', 'ra2': 'roc_auc'}, 1, 1, 1),
(['roc_auc', 'accuracy'], 1, 0, 1),
(['neg_log_loss', 'accuracy'], 1, 1, 0)])
def test_multimetric_scorer_calls_method_once(scorers, expected_predict_count,
expected_predict_proba_count,
expected_decision_func_count):
X, y = np.array([[1], [1], [0], [0], [0]]), np.array([0, 1, 1, 1, 0])
mock_est = Mock()
fit_func = Mock(return_value=mock_est)
predict_func = Mock(return_value=y)
pos_proba = np.random.rand(X.shape[0])
proba = np.c_[1 - pos_proba, pos_proba]
predict_proba_func = Mock(return_value=proba)
decision_function_func = Mock(return_value=pos_proba)
mock_est.fit = fit_func
mock_est.predict = predict_func
mock_est.predict_proba = predict_proba_func
mock_est.decision_function = decision_function_func
scorer_dict, _ = _check_multimetric_scoring(LogisticRegression(), scorers)
multi_scorer = _MultimetricScorer(**scorer_dict)
results = multi_scorer(mock_est, X, y)
assert set(scorers) == set(results) # compare dict keys
assert predict_func.call_count == expected_predict_count
assert predict_proba_func.call_count == expected_predict_proba_count
assert decision_function_func.call_count == expected_decision_func_count
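# Editor sketch (illustration only, hypothetical helper): the call counts
# above rely on _MultimetricScorer memoizing each prediction method per
# scoring call, roughly like the closure below, so that e.g. two 'accuracy'
# entries share a single predict() invocation.
def _cached_call_sketch(est, X):
    cache = {}
    def call(method_name):
        if method_name not in cache:
            cache[method_name] = getattr(est, method_name)(X)
        return cache[method_name]
    return call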
def test_multimetric_scorer_calls_method_once_classifier_no_decision():
predict_proba_call_cnt = 0
class MockKNeighborsClassifier(KNeighborsClassifier):
def predict_proba(self, X):
nonlocal predict_proba_call_cnt
predict_proba_call_cnt += 1
return super().predict_proba(X)
X, y = np.array([[1], [1], [0], [0], [0]]), np.array([0, 1, 1, 1, 0])
# no decision function
clf = MockKNeighborsClassifier(n_neighbors=1)
clf.fit(X, y)
scorers = ['roc_auc', 'neg_log_loss']
scorer_dict, _ = _check_multimetric_scoring(clf, scorers)
scorer = _MultimetricScorer(**scorer_dict)
scorer(clf, X, y)
assert predict_proba_call_cnt == 1
def test_multimetric_scorer_calls_method_once_regressor_threshold():
predict_called_cnt = 0
class MockDecisionTreeRegressor(DecisionTreeRegressor):
def predict(self, X):
nonlocal predict_called_cnt
predict_called_cnt += 1
return super().predict(X)
X, y = np.array([[1], [1], [0], [0], [0]]), np.array([0, 1, 1, 1, 0])
# no decision function
clf = MockDecisionTreeRegressor()
clf.fit(X, y)
scorers = {'neg_mse': 'neg_mean_squared_error', 'r2': 'roc_auc'}
scorer_dict, _ = _check_multimetric_scoring(clf, scorers)
scorer = _MultimetricScorer(**scorer_dict)
scorer(clf, X, y)
assert predict_called_cnt == 1
def test_multimetric_scorer_sanity_check():
    # The scoring dictionary returned should match calling each scorer
    # separately.
scorers = {'a1': 'accuracy', 'a2': 'accuracy',
'll1': 'neg_log_loss', 'll2': 'neg_log_loss',
'ra1': 'roc_auc', 'ra2': 'roc_auc'}
X, y = make_classification(random_state=0)
clf = DecisionTreeClassifier()
clf.fit(X, y)
scorer_dict, _ = _check_multimetric_scoring(clf, scorers)
multi_scorer = _MultimetricScorer(**scorer_dict)
result = multi_scorer(clf, X, y)
separate_scores = {
name: get_scorer(name)(clf, X, y)
for name in ['accuracy', 'neg_log_loss', 'roc_auc']}
for key, value in result.items():
score_name = scorers[key]
assert_allclose(value, separate_scores[score_name])
@pytest.mark.parametrize('scorer_name, metric', [
('roc_auc_ovr', partial(roc_auc_score, multi_class='ovr')),
('roc_auc_ovo', partial(roc_auc_score, multi_class='ovo')),
('roc_auc_ovr_weighted', partial(roc_auc_score, multi_class='ovr',
average='weighted')),
('roc_auc_ovo_weighted', partial(roc_auc_score, multi_class='ovo',
average='weighted'))])
def test_multiclass_roc_proba_scorer(scorer_name, metric):
scorer = get_scorer(scorer_name)
X, y = make_classification(n_classes=3, n_informative=3, n_samples=20,
random_state=0)
lr = LogisticRegression(multi_class="multinomial").fit(X, y)
y_proba = lr.predict_proba(X)
expected_score = metric(y, y_proba)
assert scorer(lr, X, y) == pytest.approx(expected_score)
def test_multiclass_roc_proba_scorer_label():
scorer = make_scorer(roc_auc_score, multi_class='ovo',
labels=[0, 1, 2], needs_proba=True)
X, y = make_classification(n_classes=3, n_informative=3, n_samples=20,
random_state=0)
lr = LogisticRegression(multi_class="multinomial").fit(X, y)
y_proba = lr.predict_proba(X)
y_binary = y == 0
expected_score = roc_auc_score(y_binary, y_proba,
multi_class='ovo',
labels=[0, 1, 2])
assert scorer(lr, X, y_binary) == pytest.approx(expected_score)
@pytest.mark.parametrize('scorer_name', [
'roc_auc_ovr', 'roc_auc_ovo',
'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted'])
def test_multiclass_roc_no_proba_scorer_errors(scorer_name):
# Perceptron has no predict_proba
scorer = get_scorer(scorer_name)
X, y = make_classification(n_classes=3, n_informative=3, n_samples=20,
random_state=0)
lr = Perceptron().fit(X, y)
msg = "'Perceptron' object has no attribute 'predict_proba'"
with pytest.raises(AttributeError, match=msg):
scorer(lr, X, y)