1966 lines
71 KiB
Python
1966 lines
71 KiB
Python
"""
|
|
Testing for the tree module (sklearn.tree).
|
|
"""
|
|
import copy
|
|
import pickle
|
|
from itertools import product
|
|
import struct
|
|
|
|
import pytest
|
|
import numpy as np
|
|
from scipy.sparse import csc_matrix
|
|
from scipy.sparse import csr_matrix
|
|
from scipy.sparse import coo_matrix
|
|
|
|
from sklearn.random_projection import _sparse_random_matrix
|
|
|
|
from sklearn.metrics import accuracy_score
|
|
from sklearn.metrics import mean_squared_error
|
|
|
|
from sklearn.utils._testing import assert_allclose
|
|
from sklearn.utils._testing import assert_array_equal
|
|
from sklearn.utils._testing import assert_array_almost_equal
|
|
from sklearn.utils._testing import assert_almost_equal
|
|
from sklearn.utils._testing import assert_warns
|
|
from sklearn.utils._testing import assert_warns_message
|
|
from sklearn.utils._testing import create_memmap_backed_data
|
|
from sklearn.utils._testing import ignore_warnings
|
|
|
|
from sklearn.utils.validation import check_random_state
|
|
|
|
from sklearn.exceptions import NotFittedError
|
|
|
|
from sklearn.tree import DecisionTreeClassifier
|
|
from sklearn.tree import DecisionTreeRegressor
|
|
from sklearn.tree import ExtraTreeClassifier
|
|
from sklearn.tree import ExtraTreeRegressor
|
|
|
|
from sklearn import tree
|
|
from sklearn.tree._tree import TREE_LEAF, TREE_UNDEFINED
|
|
from sklearn.tree._classes import CRITERIA_CLF
|
|
from sklearn.tree._classes import CRITERIA_REG
|
|
from sklearn import datasets
|
|
|
|
from sklearn.utils import compute_sample_weight
|
|
|
|
CLF_CRITERIONS = ("gini", "entropy")
|
|
REG_CRITERIONS = ("mse", "mae", "friedman_mse")
|
|
|
|
CLF_TREES = {
|
|
"DecisionTreeClassifier": DecisionTreeClassifier,
|
|
"ExtraTreeClassifier": ExtraTreeClassifier,
|
|
}
|
|
|
|
REG_TREES = {
|
|
"DecisionTreeRegressor": DecisionTreeRegressor,
|
|
"ExtraTreeRegressor": ExtraTreeRegressor,
|
|
}
|
|
|
|
ALL_TREES = dict()
|
|
ALL_TREES.update(CLF_TREES)
|
|
ALL_TREES.update(REG_TREES)
|
|
|
|
SPARSE_TREES = ["DecisionTreeClassifier", "DecisionTreeRegressor",
|
|
"ExtraTreeClassifier", "ExtraTreeRegressor"]
|
|
|
|
|
|
X_small = np.array([
|
|
[0, 0, 4, 0, 0, 0, 1, -14, 0, -4, 0, 0, 0, 0, ],
|
|
[0, 0, 5, 3, 0, -4, 0, 0, 1, -5, 0.2, 0, 4, 1, ],
|
|
[-1, -1, 0, 0, -4.5, 0, 0, 2.1, 1, 0, 0, -4.5, 0, 1, ],
|
|
[-1, -1, 0, -1.2, 0, 0, 0, 0, 0, 0, 0.2, 0, 0, 1, ],
|
|
[-1, -1, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 1, ],
|
|
[-1, -2, 0, 4, -3, 10, 4, 0, -3.2, 0, 4, 3, -4, 1, ],
|
|
[2.11, 0, -6, -0.5, 0, 11, 0, 0, -3.2, 6, 0.5, 0, -3, 1, ],
|
|
[2.11, 0, -6, -0.5, 0, 11, 0, 0, -3.2, 6, 0, 0, -2, 1, ],
|
|
[2.11, 8, -6, -0.5, 0, 11, 0, 0, -3.2, 6, 0, 0, -2, 1, ],
|
|
[2.11, 8, -6, -0.5, 0, 11, 0, 0, -3.2, 6, 0.5, 0, -1, 0, ],
|
|
[2, 8, 5, 1, 0.5, -4, 10, 0, 1, -5, 3, 0, 2, 0, ],
|
|
[2, 0, 1, 1, 1, -1, 1, 0, 0, -2, 3, 0, 1, 0, ],
|
|
[2, 0, 1, 2, 3, -1, 10, 2, 0, -1, 1, 2, 2, 0, ],
|
|
[1, 1, 0, 2, 2, -1, 1, 2, 0, -5, 1, 2, 3, 0, ],
|
|
[3, 1, 0, 3, 0, -4, 10, 0, 1, -5, 3, 0, 3, 1, ],
|
|
[2.11, 8, -6, -0.5, 0, 1, 0, 0, -3.2, 6, 0.5, 0, -3, 1, ],
|
|
[2.11, 8, -6, -0.5, 0, 1, 0, 0, -3.2, 6, 1.5, 1, -1, -1, ],
|
|
[2.11, 8, -6, -0.5, 0, 10, 0, 0, -3.2, 6, 0.5, 0, -1, -1, ],
|
|
[2, 0, 5, 1, 0.5, -2, 10, 0, 1, -5, 3, 1, 0, -1, ],
|
|
[2, 0, 1, 1, 1, -2, 1, 0, 0, -2, 0, 0, 0, 1, ],
|
|
[2, 1, 1, 1, 2, -1, 10, 2, 0, -1, 0, 2, 1, 1, ],
|
|
[1, 1, 0, 0, 1, -3, 1, 2, 0, -5, 1, 2, 1, 1, ],
|
|
[3, 1, 0, 1, 0, -4, 1, 0, 1, -2, 0, 0, 1, 0, ]])
|
|
|
|
y_small = [1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,
|
|
0, 0]
|
|
y_small_reg = [1.0, 2.1, 1.2, 0.05, 10, 2.4, 3.1, 1.01, 0.01, 2.98, 3.1, 1.1,
|
|
0.0, 1.2, 2, 11, 0, 0, 4.5, 0.201, 1.06, 0.9, 0]
|
|
|
|
# toy sample
|
|
X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
|
|
y = [-1, -1, -1, 1, 1, 1]
|
|
T = [[-1, -1], [2, 2], [3, 2]]
|
|
true_result = [-1, 1, 1]
|
|
|
|
# also load the iris dataset
|
|
# and randomly permute it
|
|
iris = datasets.load_iris()
|
|
rng = np.random.RandomState(1)
|
|
perm = rng.permutation(iris.target.size)
|
|
iris.data = iris.data[perm]
|
|
iris.target = iris.target[perm]
|
|
|
|
# also load the boston dataset
|
|
# and randomly permute it
|
|
boston = datasets.load_boston()
|
|
perm = rng.permutation(boston.target.size)
|
|
boston.data = boston.data[perm]
|
|
boston.target = boston.target[perm]
|
|
|
|
digits = datasets.load_digits()
|
|
perm = rng.permutation(digits.target.size)
|
|
digits.data = digits.data[perm]
|
|
digits.target = digits.target[perm]
|
|
|
|
random_state = check_random_state(0)
|
|
X_multilabel, y_multilabel = datasets.make_multilabel_classification(
|
|
random_state=0, n_samples=30, n_features=10)
|
|
|
|
# NB: despite their names X_sparse_* are numpy arrays (and not sparse matrices)
|
|
X_sparse_pos = random_state.uniform(size=(20, 5))
|
|
X_sparse_pos[X_sparse_pos <= 0.8] = 0.
|
|
y_random = random_state.randint(0, 4, size=(20, ))
|
|
X_sparse_mix = _sparse_random_matrix(20, 10, density=0.25,
|
|
random_state=0).toarray()
|
|
|
|
|
|
DATASETS = {
|
|
"iris": {"X": iris.data, "y": iris.target},
|
|
"boston": {"X": boston.data, "y": boston.target},
|
|
"digits": {"X": digits.data, "y": digits.target},
|
|
"toy": {"X": X, "y": y},
|
|
"clf_small": {"X": X_small, "y": y_small},
|
|
"reg_small": {"X": X_small, "y": y_small_reg},
|
|
"multilabel": {"X": X_multilabel, "y": y_multilabel},
|
|
"sparse-pos": {"X": X_sparse_pos, "y": y_random},
|
|
"sparse-neg": {"X": - X_sparse_pos, "y": y_random},
|
|
"sparse-mix": {"X": X_sparse_mix, "y": y_random},
|
|
"zeros": {"X": np.zeros((20, 3)), "y": y_random}
|
|
}
|
|
|
|
for name in DATASETS:
|
|
DATASETS[name]["X_sparse"] = csc_matrix(DATASETS[name]["X"])
|
|
|
|
|
|
def assert_tree_equal(d, s, message):
|
|
assert s.node_count == d.node_count, (
|
|
"{0}: inequal number of node ({1} != {2})"
|
|
"".format(message, s.node_count, d.node_count))
|
|
|
|
assert_array_equal(d.children_right, s.children_right,
|
|
message + ": inequal children_right")
|
|
assert_array_equal(d.children_left, s.children_left,
|
|
message + ": inequal children_left")
|
|
|
|
external = d.children_right == TREE_LEAF
|
|
internal = np.logical_not(external)
|
|
|
|
assert_array_equal(d.feature[internal], s.feature[internal],
|
|
message + ": inequal features")
|
|
assert_array_equal(d.threshold[internal], s.threshold[internal],
|
|
message + ": inequal threshold")
|
|
assert_array_equal(d.n_node_samples.sum(), s.n_node_samples.sum(),
|
|
message + ": inequal sum(n_node_samples)")
|
|
assert_array_equal(d.n_node_samples, s.n_node_samples,
|
|
message + ": inequal n_node_samples")
|
|
|
|
assert_almost_equal(d.impurity, s.impurity,
|
|
err_msg=message + ": inequal impurity")
|
|
|
|
assert_array_almost_equal(d.value[external], s.value[external],
|
|
err_msg=message + ": inequal value")
|
|
|
|
|
|
def test_classification_toy():
|
|
# Check classification on a toy dataset.
|
|
for name, Tree in CLF_TREES.items():
|
|
clf = Tree(random_state=0)
|
|
clf.fit(X, y)
|
|
assert_array_equal(clf.predict(T), true_result,
|
|
"Failed with {0}".format(name))
|
|
|
|
clf = Tree(max_features=1, random_state=1)
|
|
clf.fit(X, y)
|
|
assert_array_equal(clf.predict(T), true_result,
|
|
"Failed with {0}".format(name))
|
|
|
|
|
|
def test_weighted_classification_toy():
|
|
# Check classification on a weighted toy dataset.
|
|
for name, Tree in CLF_TREES.items():
|
|
clf = Tree(random_state=0)
|
|
|
|
clf.fit(X, y, sample_weight=np.ones(len(X)))
|
|
assert_array_equal(clf.predict(T), true_result,
|
|
"Failed with {0}".format(name))
|
|
|
|
clf.fit(X, y, sample_weight=np.full(len(X), 0.5))
|
|
assert_array_equal(clf.predict(T), true_result,
|
|
"Failed with {0}".format(name))
|
|
|
|
|
|
def test_regression_toy():
|
|
# Check regression on a toy dataset.
|
|
for name, Tree in REG_TREES.items():
|
|
reg = Tree(random_state=1)
|
|
reg.fit(X, y)
|
|
assert_almost_equal(reg.predict(T), true_result,
|
|
err_msg="Failed with {0}".format(name))
|
|
|
|
clf = Tree(max_features=1, random_state=1)
|
|
clf.fit(X, y)
|
|
assert_almost_equal(reg.predict(T), true_result,
|
|
err_msg="Failed with {0}".format(name))
|
|
|
|
|
|
def test_xor():
|
|
# Check on a XOR problem
|
|
y = np.zeros((10, 10))
|
|
y[:5, :5] = 1
|
|
y[5:, 5:] = 1
|
|
|
|
gridx, gridy = np.indices(y.shape)
|
|
|
|
X = np.vstack([gridx.ravel(), gridy.ravel()]).T
|
|
y = y.ravel()
|
|
|
|
for name, Tree in CLF_TREES.items():
|
|
clf = Tree(random_state=0)
|
|
clf.fit(X, y)
|
|
assert clf.score(X, y) == 1.0, "Failed with {0}".format(name)
|
|
|
|
clf = Tree(random_state=0, max_features=1)
|
|
clf.fit(X, y)
|
|
assert clf.score(X, y) == 1.0, "Failed with {0}".format(name)
|
|
|
|
|
|
def test_iris():
|
|
# Check consistency on dataset iris.
|
|
for (name, Tree), criterion in product(CLF_TREES.items(), CLF_CRITERIONS):
|
|
clf = Tree(criterion=criterion, random_state=0)
|
|
clf.fit(iris.data, iris.target)
|
|
score = accuracy_score(clf.predict(iris.data), iris.target)
|
|
assert score > 0.9, (
|
|
"Failed with {0}, criterion = {1} and score = {2}"
|
|
"".format(name, criterion, score))
|
|
|
|
clf = Tree(criterion=criterion, max_features=2, random_state=0)
|
|
clf.fit(iris.data, iris.target)
|
|
score = accuracy_score(clf.predict(iris.data), iris.target)
|
|
assert score > 0.5, (
|
|
"Failed with {0}, criterion = {1} and score = {2}"
|
|
"".format(name, criterion, score))
|
|
|
|
|
|
def test_boston():
|
|
# Check consistency on dataset boston house prices.
|
|
|
|
for (name, Tree), criterion in product(REG_TREES.items(), REG_CRITERIONS):
|
|
reg = Tree(criterion=criterion, random_state=0)
|
|
reg.fit(boston.data, boston.target)
|
|
score = mean_squared_error(boston.target, reg.predict(boston.data))
|
|
assert score < 1, (
|
|
"Failed with {0}, criterion = {1} and score = {2}"
|
|
"".format(name, criterion, score))
|
|
|
|
# using fewer features reduces the learning ability of this tree,
|
|
# but reduces training time.
|
|
reg = Tree(criterion=criterion, max_features=6, random_state=0)
|
|
reg.fit(boston.data, boston.target)
|
|
score = mean_squared_error(boston.target, reg.predict(boston.data))
|
|
assert score < 2, (
|
|
"Failed with {0}, criterion = {1} and score = {2}"
|
|
"".format(name, criterion, score))
|
|
|
|
|
|
def test_probability():
|
|
# Predict probabilities using DecisionTreeClassifier.
|
|
|
|
for name, Tree in CLF_TREES.items():
|
|
clf = Tree(max_depth=1, max_features=1, random_state=42)
|
|
clf.fit(iris.data, iris.target)
|
|
|
|
prob_predict = clf.predict_proba(iris.data)
|
|
assert_array_almost_equal(np.sum(prob_predict, 1),
|
|
np.ones(iris.data.shape[0]),
|
|
err_msg="Failed with {0}".format(name))
|
|
assert_array_equal(np.argmax(prob_predict, 1),
|
|
clf.predict(iris.data),
|
|
err_msg="Failed with {0}".format(name))
|
|
assert_almost_equal(clf.predict_proba(iris.data),
|
|
np.exp(clf.predict_log_proba(iris.data)), 8,
|
|
err_msg="Failed with {0}".format(name))
|
|
|
|
|
|
def test_arrayrepr():
|
|
# Check the array representation.
|
|
# Check resize
|
|
X = np.arange(10000)[:, np.newaxis]
|
|
y = np.arange(10000)
|
|
|
|
for name, Tree in REG_TREES.items():
|
|
reg = Tree(max_depth=None, random_state=0)
|
|
reg.fit(X, y)
|
|
|
|
|
|
def test_pure_set():
|
|
# Check when y is pure.
|
|
X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
|
|
y = [1, 1, 1, 1, 1, 1]
|
|
|
|
for name, TreeClassifier in CLF_TREES.items():
|
|
clf = TreeClassifier(random_state=0)
|
|
clf.fit(X, y)
|
|
assert_array_equal(clf.predict(X), y,
|
|
err_msg="Failed with {0}".format(name))
|
|
|
|
for name, TreeRegressor in REG_TREES.items():
|
|
reg = TreeRegressor(random_state=0)
|
|
reg.fit(X, y)
|
|
assert_almost_equal(reg.predict(X), y,
|
|
err_msg="Failed with {0}".format(name))
|
|
|
|
|
|
def test_numerical_stability():
|
|
# Check numerical stability.
|
|
X = np.array([
|
|
[152.08097839, 140.40744019, 129.75102234, 159.90493774],
|
|
[142.50700378, 135.81935120, 117.82884979, 162.75781250],
|
|
[127.28772736, 140.40744019, 129.75102234, 159.90493774],
|
|
[132.37025452, 143.71923828, 138.35694885, 157.84558105],
|
|
[103.10237122, 143.71928406, 138.35696411, 157.84559631],
|
|
[127.71276855, 143.71923828, 138.35694885, 157.84558105],
|
|
[120.91514587, 140.40744019, 129.75102234, 159.90493774]])
|
|
|
|
y = np.array(
|
|
[1., 0.70209277, 0.53896582, 0., 0.90914464, 0.48026916, 0.49622521])
|
|
|
|
with np.errstate(all="raise"):
|
|
for name, Tree in REG_TREES.items():
|
|
reg = Tree(random_state=0)
|
|
reg.fit(X, y)
|
|
reg.fit(X, -y)
|
|
reg.fit(-X, y)
|
|
reg.fit(-X, -y)
|
|
|
|
|
|
def test_importances():
|
|
# Check variable importances.
|
|
X, y = datasets.make_classification(n_samples=5000,
|
|
n_features=10,
|
|
n_informative=3,
|
|
n_redundant=0,
|
|
n_repeated=0,
|
|
shuffle=False,
|
|
random_state=0)
|
|
|
|
for name, Tree in CLF_TREES.items():
|
|
clf = Tree(random_state=0)
|
|
|
|
clf.fit(X, y)
|
|
importances = clf.feature_importances_
|
|
n_important = np.sum(importances > 0.1)
|
|
|
|
assert importances.shape[0] == 10, "Failed with {0}".format(name)
|
|
assert n_important == 3, "Failed with {0}".format(name)
|
|
|
|
# Check on iris that importances are the same for all builders
|
|
clf = DecisionTreeClassifier(random_state=0)
|
|
clf.fit(iris.data, iris.target)
|
|
clf2 = DecisionTreeClassifier(random_state=0,
|
|
max_leaf_nodes=len(iris.data))
|
|
clf2.fit(iris.data, iris.target)
|
|
|
|
assert_array_equal(clf.feature_importances_,
|
|
clf2.feature_importances_)
|
|
|
|
|
|
def test_importances_raises():
|
|
# Check if variable importance before fit raises ValueError.
|
|
clf = DecisionTreeClassifier()
|
|
with pytest.raises(ValueError):
|
|
getattr(clf, 'feature_importances_')
|
|
|
|
|
|
def test_importances_gini_equal_mse():
|
|
# Check that gini is equivalent to mse for binary output variable
|
|
|
|
X, y = datasets.make_classification(n_samples=2000,
|
|
n_features=10,
|
|
n_informative=3,
|
|
n_redundant=0,
|
|
n_repeated=0,
|
|
shuffle=False,
|
|
random_state=0)
|
|
|
|
# The gini index and the mean square error (variance) might differ due
|
|
# to numerical instability. Since those instabilities mainly occurs at
|
|
# high tree depth, we restrict this maximal depth.
|
|
clf = DecisionTreeClassifier(criterion="gini", max_depth=5,
|
|
random_state=0).fit(X, y)
|
|
reg = DecisionTreeRegressor(criterion="mse", max_depth=5,
|
|
random_state=0).fit(X, y)
|
|
|
|
assert_almost_equal(clf.feature_importances_, reg.feature_importances_)
|
|
assert_array_equal(clf.tree_.feature, reg.tree_.feature)
|
|
assert_array_equal(clf.tree_.children_left, reg.tree_.children_left)
|
|
assert_array_equal(clf.tree_.children_right, reg.tree_.children_right)
|
|
assert_array_equal(clf.tree_.n_node_samples, reg.tree_.n_node_samples)
|
|
|
|
|
|
def test_max_features():
|
|
# Check max_features.
|
|
for name, TreeRegressor in REG_TREES.items():
|
|
reg = TreeRegressor(max_features="auto")
|
|
reg.fit(boston.data, boston.target)
|
|
assert reg.max_features_ == boston.data.shape[1]
|
|
|
|
for name, TreeClassifier in CLF_TREES.items():
|
|
clf = TreeClassifier(max_features="auto")
|
|
clf.fit(iris.data, iris.target)
|
|
assert clf.max_features_ == 2
|
|
|
|
for name, TreeEstimator in ALL_TREES.items():
|
|
est = TreeEstimator(max_features="sqrt")
|
|
est.fit(iris.data, iris.target)
|
|
assert (est.max_features_ ==
|
|
int(np.sqrt(iris.data.shape[1])))
|
|
|
|
est = TreeEstimator(max_features="log2")
|
|
est.fit(iris.data, iris.target)
|
|
assert (est.max_features_ ==
|
|
int(np.log2(iris.data.shape[1])))
|
|
|
|
est = TreeEstimator(max_features=1)
|
|
est.fit(iris.data, iris.target)
|
|
assert est.max_features_ == 1
|
|
|
|
est = TreeEstimator(max_features=3)
|
|
est.fit(iris.data, iris.target)
|
|
assert est.max_features_ == 3
|
|
|
|
est = TreeEstimator(max_features=0.01)
|
|
est.fit(iris.data, iris.target)
|
|
assert est.max_features_ == 1
|
|
|
|
est = TreeEstimator(max_features=0.5)
|
|
est.fit(iris.data, iris.target)
|
|
assert (est.max_features_ ==
|
|
int(0.5 * iris.data.shape[1]))
|
|
|
|
est = TreeEstimator(max_features=1.0)
|
|
est.fit(iris.data, iris.target)
|
|
assert est.max_features_ == iris.data.shape[1]
|
|
|
|
est = TreeEstimator(max_features=None)
|
|
est.fit(iris.data, iris.target)
|
|
assert est.max_features_ == iris.data.shape[1]
|
|
|
|
# use values of max_features that are invalid
|
|
est = TreeEstimator(max_features=10)
|
|
with pytest.raises(ValueError):
|
|
est.fit(X, y)
|
|
|
|
est = TreeEstimator(max_features=-1)
|
|
with pytest.raises(ValueError):
|
|
est.fit(X, y)
|
|
|
|
est = TreeEstimator(max_features=0.0)
|
|
with pytest.raises(ValueError):
|
|
est.fit(X, y)
|
|
|
|
est = TreeEstimator(max_features=1.5)
|
|
with pytest.raises(ValueError):
|
|
est.fit(X, y)
|
|
|
|
est = TreeEstimator(max_features="foobar")
|
|
with pytest.raises(ValueError):
|
|
est.fit(X, y)
|
|
|
|
|
|
def test_error():
|
|
# Test that it gives proper exception on deficient input.
|
|
for name, TreeEstimator in CLF_TREES.items():
|
|
# predict before fit
|
|
est = TreeEstimator()
|
|
with pytest.raises(NotFittedError):
|
|
est.predict_proba(X)
|
|
|
|
est.fit(X, y)
|
|
X2 = [[-2, -1, 1]] # wrong feature shape for sample
|
|
with pytest.raises(ValueError):
|
|
est.predict_proba(X2)
|
|
|
|
for name, TreeEstimator in ALL_TREES.items():
|
|
with pytest.raises(ValueError):
|
|
TreeEstimator(min_samples_leaf=-1).fit(X, y)
|
|
with pytest.raises(ValueError):
|
|
TreeEstimator(min_samples_leaf=.6).fit(X, y)
|
|
with pytest.raises(ValueError):
|
|
TreeEstimator(min_samples_leaf=0.).fit(X, y)
|
|
with pytest.raises(ValueError):
|
|
TreeEstimator(min_samples_leaf=3.).fit(X, y)
|
|
with pytest.raises(ValueError):
|
|
TreeEstimator(min_weight_fraction_leaf=-1).fit(X, y)
|
|
with pytest.raises(ValueError):
|
|
TreeEstimator(min_weight_fraction_leaf=0.51).fit(X, y)
|
|
with pytest.raises(ValueError):
|
|
TreeEstimator(min_samples_split=-1).fit(X, y)
|
|
with pytest.raises(ValueError):
|
|
TreeEstimator(min_samples_split=0.0).fit(X, y)
|
|
with pytest.raises(ValueError):
|
|
TreeEstimator(min_samples_split=1.1).fit(X, y)
|
|
with pytest.raises(ValueError):
|
|
TreeEstimator(min_samples_split=2.5).fit(X, y)
|
|
with pytest.raises(ValueError):
|
|
TreeEstimator(max_depth=-1).fit(X, y)
|
|
with pytest.raises(ValueError):
|
|
TreeEstimator(max_features=42).fit(X, y)
|
|
# min_impurity_split warning
|
|
with ignore_warnings(category=FutureWarning):
|
|
with pytest.raises(ValueError):
|
|
TreeEstimator(min_impurity_split=-1.0).fit(X, y)
|
|
with pytest.raises(ValueError):
|
|
TreeEstimator(min_impurity_decrease=-1.0).fit(X, y)
|
|
|
|
# Wrong dimensions
|
|
est = TreeEstimator()
|
|
y2 = y[:-1]
|
|
with pytest.raises(ValueError):
|
|
est.fit(X, y2)
|
|
|
|
# Test with arrays that are non-contiguous.
|
|
Xf = np.asfortranarray(X)
|
|
est = TreeEstimator()
|
|
est.fit(Xf, y)
|
|
assert_almost_equal(est.predict(T), true_result)
|
|
|
|
# predict before fitting
|
|
est = TreeEstimator()
|
|
with pytest.raises(NotFittedError):
|
|
est.predict(T)
|
|
|
|
# predict on vector with different dims
|
|
est.fit(X, y)
|
|
t = np.asarray(T)
|
|
with pytest.raises(ValueError):
|
|
est.predict(t[:, 1:])
|
|
|
|
# wrong sample shape
|
|
Xt = np.array(X).T
|
|
|
|
est = TreeEstimator()
|
|
est.fit(np.dot(X, Xt), y)
|
|
with pytest.raises(ValueError):
|
|
est.predict(X)
|
|
with pytest.raises(ValueError):
|
|
est.apply(X)
|
|
|
|
clf = TreeEstimator()
|
|
clf.fit(X, y)
|
|
with pytest.raises(ValueError):
|
|
clf.predict(Xt)
|
|
with pytest.raises(ValueError):
|
|
clf.apply(Xt)
|
|
|
|
# apply before fitting
|
|
est = TreeEstimator()
|
|
with pytest.raises(NotFittedError):
|
|
est.apply(T)
|
|
|
|
|
|
def test_min_samples_split():
|
|
"""Test min_samples_split parameter"""
|
|
X = np.asfortranarray(iris.data, dtype=tree._tree.DTYPE)
|
|
y = iris.target
|
|
|
|
# test both DepthFirstTreeBuilder and BestFirstTreeBuilder
|
|
# by setting max_leaf_nodes
|
|
for max_leaf_nodes, name in product((None, 1000), ALL_TREES.keys()):
|
|
TreeEstimator = ALL_TREES[name]
|
|
|
|
# test for integer parameter
|
|
est = TreeEstimator(min_samples_split=10,
|
|
max_leaf_nodes=max_leaf_nodes,
|
|
random_state=0)
|
|
est.fit(X, y)
|
|
# count samples on nodes, -1 means it is a leaf
|
|
node_samples = est.tree_.n_node_samples[est.tree_.children_left != -1]
|
|
|
|
assert np.min(node_samples) > 9, "Failed with {0}".format(name)
|
|
|
|
# test for float parameter
|
|
est = TreeEstimator(min_samples_split=0.2,
|
|
max_leaf_nodes=max_leaf_nodes,
|
|
random_state=0)
|
|
est.fit(X, y)
|
|
# count samples on nodes, -1 means it is a leaf
|
|
node_samples = est.tree_.n_node_samples[est.tree_.children_left != -1]
|
|
|
|
assert np.min(node_samples) > 9, "Failed with {0}".format(name)
|
|
|
|
|
|
def test_min_samples_leaf():
|
|
# Test if leaves contain more than leaf_count training examples
|
|
X = np.asfortranarray(iris.data, dtype=tree._tree.DTYPE)
|
|
y = iris.target
|
|
|
|
# test both DepthFirstTreeBuilder and BestFirstTreeBuilder
|
|
# by setting max_leaf_nodes
|
|
for max_leaf_nodes, name in product((None, 1000), ALL_TREES.keys()):
|
|
TreeEstimator = ALL_TREES[name]
|
|
|
|
# test integer parameter
|
|
est = TreeEstimator(min_samples_leaf=5,
|
|
max_leaf_nodes=max_leaf_nodes,
|
|
random_state=0)
|
|
est.fit(X, y)
|
|
out = est.tree_.apply(X)
|
|
node_counts = np.bincount(out)
|
|
# drop inner nodes
|
|
leaf_count = node_counts[node_counts != 0]
|
|
assert np.min(leaf_count) > 4, "Failed with {0}".format(name)
|
|
|
|
# test float parameter
|
|
est = TreeEstimator(min_samples_leaf=0.1,
|
|
max_leaf_nodes=max_leaf_nodes,
|
|
random_state=0)
|
|
est.fit(X, y)
|
|
out = est.tree_.apply(X)
|
|
node_counts = np.bincount(out)
|
|
# drop inner nodes
|
|
leaf_count = node_counts[node_counts != 0]
|
|
assert np.min(leaf_count) > 4, "Failed with {0}".format(name)
|
|
|
|
|
|
def check_min_weight_fraction_leaf(name, datasets, sparse=False):
|
|
"""Test if leaves contain at least min_weight_fraction_leaf of the
|
|
training set"""
|
|
if sparse:
|
|
X = DATASETS[datasets]["X_sparse"].astype(np.float32)
|
|
else:
|
|
X = DATASETS[datasets]["X"].astype(np.float32)
|
|
y = DATASETS[datasets]["y"]
|
|
|
|
weights = rng.rand(X.shape[0])
|
|
total_weight = np.sum(weights)
|
|
|
|
TreeEstimator = ALL_TREES[name]
|
|
|
|
# test both DepthFirstTreeBuilder and BestFirstTreeBuilder
|
|
# by setting max_leaf_nodes
|
|
for max_leaf_nodes, frac in product((None, 1000), np.linspace(0, 0.5, 6)):
|
|
est = TreeEstimator(min_weight_fraction_leaf=frac,
|
|
max_leaf_nodes=max_leaf_nodes,
|
|
random_state=0)
|
|
est.fit(X, y, sample_weight=weights)
|
|
|
|
if sparse:
|
|
out = est.tree_.apply(X.tocsr())
|
|
|
|
else:
|
|
out = est.tree_.apply(X)
|
|
|
|
node_weights = np.bincount(out, weights=weights)
|
|
# drop inner nodes
|
|
leaf_weights = node_weights[node_weights != 0]
|
|
assert (
|
|
np.min(leaf_weights) >=
|
|
total_weight * est.min_weight_fraction_leaf), (
|
|
"Failed with {0} min_weight_fraction_leaf={1}".format(
|
|
name, est.min_weight_fraction_leaf))
|
|
|
|
# test case with no weights passed in
|
|
total_weight = X.shape[0]
|
|
|
|
for max_leaf_nodes, frac in product((None, 1000), np.linspace(0, 0.5, 6)):
|
|
est = TreeEstimator(min_weight_fraction_leaf=frac,
|
|
max_leaf_nodes=max_leaf_nodes,
|
|
random_state=0)
|
|
est.fit(X, y)
|
|
|
|
if sparse:
|
|
out = est.tree_.apply(X.tocsr())
|
|
else:
|
|
out = est.tree_.apply(X)
|
|
|
|
node_weights = np.bincount(out)
|
|
# drop inner nodes
|
|
leaf_weights = node_weights[node_weights != 0]
|
|
assert (
|
|
np.min(leaf_weights) >=
|
|
total_weight * est.min_weight_fraction_leaf), (
|
|
"Failed with {0} min_weight_fraction_leaf={1}".format(
|
|
name, est.min_weight_fraction_leaf))
|
|
|
|
|
|
@pytest.mark.parametrize("name", ALL_TREES)
|
|
def test_min_weight_fraction_leaf_on_dense_input(name):
|
|
check_min_weight_fraction_leaf(name, "iris")
|
|
|
|
|
|
@pytest.mark.parametrize("name", SPARSE_TREES)
|
|
def test_min_weight_fraction_leaf_on_sparse_input(name):
|
|
check_min_weight_fraction_leaf(name, "multilabel", True)
|
|
|
|
|
|
def check_min_weight_fraction_leaf_with_min_samples_leaf(name, datasets,
|
|
sparse=False):
|
|
"""Test the interaction between min_weight_fraction_leaf and min_samples_leaf
|
|
when sample_weights is not provided in fit."""
|
|
if sparse:
|
|
X = DATASETS[datasets]["X_sparse"].astype(np.float32)
|
|
else:
|
|
X = DATASETS[datasets]["X"].astype(np.float32)
|
|
y = DATASETS[datasets]["y"]
|
|
|
|
total_weight = X.shape[0]
|
|
TreeEstimator = ALL_TREES[name]
|
|
for max_leaf_nodes, frac in product((None, 1000), np.linspace(0, 0.5, 3)):
|
|
# test integer min_samples_leaf
|
|
est = TreeEstimator(min_weight_fraction_leaf=frac,
|
|
max_leaf_nodes=max_leaf_nodes,
|
|
min_samples_leaf=5,
|
|
random_state=0)
|
|
est.fit(X, y)
|
|
|
|
if sparse:
|
|
out = est.tree_.apply(X.tocsr())
|
|
else:
|
|
out = est.tree_.apply(X)
|
|
|
|
node_weights = np.bincount(out)
|
|
# drop inner nodes
|
|
leaf_weights = node_weights[node_weights != 0]
|
|
assert (
|
|
np.min(leaf_weights) >=
|
|
max((total_weight *
|
|
est.min_weight_fraction_leaf), 5)), (
|
|
"Failed with {0} min_weight_fraction_leaf={1}, "
|
|
"min_samples_leaf={2}".format(
|
|
name, est.min_weight_fraction_leaf,
|
|
est.min_samples_leaf))
|
|
for max_leaf_nodes, frac in product((None, 1000), np.linspace(0, 0.5, 3)):
|
|
# test float min_samples_leaf
|
|
est = TreeEstimator(min_weight_fraction_leaf=frac,
|
|
max_leaf_nodes=max_leaf_nodes,
|
|
min_samples_leaf=.1,
|
|
random_state=0)
|
|
est.fit(X, y)
|
|
|
|
if sparse:
|
|
out = est.tree_.apply(X.tocsr())
|
|
else:
|
|
out = est.tree_.apply(X)
|
|
|
|
node_weights = np.bincount(out)
|
|
# drop inner nodes
|
|
leaf_weights = node_weights[node_weights != 0]
|
|
assert (
|
|
np.min(leaf_weights) >=
|
|
max((total_weight * est.min_weight_fraction_leaf),
|
|
(total_weight * est.min_samples_leaf))), (
|
|
"Failed with {0} min_weight_fraction_leaf={1}, "
|
|
"min_samples_leaf={2}".format(name,
|
|
est.min_weight_fraction_leaf,
|
|
est.min_samples_leaf))
|
|
|
|
|
|
@pytest.mark.parametrize("name", ALL_TREES)
|
|
def test_min_weight_fraction_leaf_with_min_samples_leaf_on_dense_input(name):
|
|
check_min_weight_fraction_leaf_with_min_samples_leaf(name, "iris")
|
|
|
|
|
|
@pytest.mark.parametrize("name", SPARSE_TREES)
|
|
def test_min_weight_fraction_leaf_with_min_samples_leaf_on_sparse_input(name):
|
|
check_min_weight_fraction_leaf_with_min_samples_leaf(
|
|
name, "multilabel", True)
|
|
|
|
|
|
def test_min_impurity_split():
|
|
# test if min_impurity_split creates leaves with impurity
|
|
# [0, min_impurity_split) when min_samples_leaf = 1 and
|
|
# min_samples_split = 2.
|
|
X = np.asfortranarray(iris.data, dtype=tree._tree.DTYPE)
|
|
y = iris.target
|
|
|
|
# test both DepthFirstTreeBuilder and BestFirstTreeBuilder
|
|
# by setting max_leaf_nodes
|
|
for max_leaf_nodes, name in product((None, 1000), ALL_TREES.keys()):
|
|
TreeEstimator = ALL_TREES[name]
|
|
min_impurity_split = .5
|
|
|
|
# verify leaf nodes without min_impurity_split less than
|
|
# impurity 1e-7
|
|
est = TreeEstimator(max_leaf_nodes=max_leaf_nodes,
|
|
random_state=0)
|
|
assert est.min_impurity_split is None, (
|
|
"Failed, min_impurity_split = {0} != None".format(
|
|
est.min_impurity_split))
|
|
try:
|
|
assert_warns(FutureWarning, est.fit, X, y)
|
|
except AssertionError:
|
|
pass
|
|
for node in range(est.tree_.node_count):
|
|
if (est.tree_.children_left[node] == TREE_LEAF or
|
|
est.tree_.children_right[node] == TREE_LEAF):
|
|
assert est.tree_.impurity[node] == 0., (
|
|
"Failed with {0} min_impurity_split={1}".format(
|
|
est.tree_.impurity[node],
|
|
est.min_impurity_split))
|
|
|
|
# verify leaf nodes have impurity [0,min_impurity_split] when using
|
|
# min_impurity_split
|
|
est = TreeEstimator(max_leaf_nodes=max_leaf_nodes,
|
|
min_impurity_split=min_impurity_split,
|
|
random_state=0)
|
|
assert_warns_message(FutureWarning,
|
|
"Use the min_impurity_decrease",
|
|
est.fit, X, y)
|
|
for node in range(est.tree_.node_count):
|
|
if (est.tree_.children_left[node] == TREE_LEAF or
|
|
est.tree_.children_right[node] == TREE_LEAF):
|
|
assert est.tree_.impurity[node] >= 0, (
|
|
"Failed with {0}, min_impurity_split={1}".format(
|
|
est.tree_.impurity[node],
|
|
est.min_impurity_split))
|
|
assert est.tree_.impurity[node] <= min_impurity_split, (
|
|
"Failed with {0}, min_impurity_split={1}".format(
|
|
est.tree_.impurity[node],
|
|
est.min_impurity_split))
|
|
|
|
|
|
def test_min_impurity_decrease():
|
|
# test if min_impurity_decrease ensure that a split is made only if
|
|
# if the impurity decrease is atleast that value
|
|
X, y = datasets.make_classification(n_samples=10000, random_state=42)
|
|
|
|
# test both DepthFirstTreeBuilder and BestFirstTreeBuilder
|
|
# by setting max_leaf_nodes
|
|
for max_leaf_nodes, name in product((None, 1000), ALL_TREES.keys()):
|
|
TreeEstimator = ALL_TREES[name]
|
|
|
|
# Check default value of min_impurity_decrease, 1e-7
|
|
est1 = TreeEstimator(max_leaf_nodes=max_leaf_nodes, random_state=0)
|
|
# Check with explicit value of 0.05
|
|
est2 = TreeEstimator(max_leaf_nodes=max_leaf_nodes,
|
|
min_impurity_decrease=0.05, random_state=0)
|
|
# Check with a much lower value of 0.0001
|
|
est3 = TreeEstimator(max_leaf_nodes=max_leaf_nodes,
|
|
min_impurity_decrease=0.0001, random_state=0)
|
|
# Check with a much lower value of 0.1
|
|
est4 = TreeEstimator(max_leaf_nodes=max_leaf_nodes,
|
|
min_impurity_decrease=0.1, random_state=0)
|
|
|
|
for est, expected_decrease in ((est1, 1e-7), (est2, 0.05),
|
|
(est3, 0.0001), (est4, 0.1)):
|
|
assert est.min_impurity_decrease <= expected_decrease, (
|
|
"Failed, min_impurity_decrease = {0} > {1}".format(
|
|
est.min_impurity_decrease,
|
|
expected_decrease))
|
|
est.fit(X, y)
|
|
for node in range(est.tree_.node_count):
|
|
# If current node is a not leaf node, check if the split was
|
|
# justified w.r.t the min_impurity_decrease
|
|
if est.tree_.children_left[node] != TREE_LEAF:
|
|
imp_parent = est.tree_.impurity[node]
|
|
wtd_n_node = est.tree_.weighted_n_node_samples[node]
|
|
|
|
left = est.tree_.children_left[node]
|
|
wtd_n_left = est.tree_.weighted_n_node_samples[left]
|
|
imp_left = est.tree_.impurity[left]
|
|
wtd_imp_left = wtd_n_left * imp_left
|
|
|
|
right = est.tree_.children_right[node]
|
|
wtd_n_right = est.tree_.weighted_n_node_samples[right]
|
|
imp_right = est.tree_.impurity[right]
|
|
wtd_imp_right = wtd_n_right * imp_right
|
|
|
|
wtd_avg_left_right_imp = wtd_imp_right + wtd_imp_left
|
|
wtd_avg_left_right_imp /= wtd_n_node
|
|
|
|
fractional_node_weight = (
|
|
est.tree_.weighted_n_node_samples[node] / X.shape[0])
|
|
|
|
actual_decrease = fractional_node_weight * (
|
|
imp_parent - wtd_avg_left_right_imp)
|
|
|
|
assert actual_decrease >= expected_decrease, (
|
|
"Failed with {0} expected min_impurity_decrease={1}"
|
|
.format(actual_decrease,
|
|
expected_decrease))
|
|
|
|
for name, TreeEstimator in ALL_TREES.items():
|
|
if "Classifier" in name:
|
|
X, y = iris.data, iris.target
|
|
else:
|
|
X, y = boston.data, boston.target
|
|
|
|
est = TreeEstimator(random_state=0)
|
|
est.fit(X, y)
|
|
score = est.score(X, y)
|
|
fitted_attribute = dict()
|
|
for attribute in ["max_depth", "node_count", "capacity"]:
|
|
fitted_attribute[attribute] = getattr(est.tree_, attribute)
|
|
|
|
serialized_object = pickle.dumps(est)
|
|
est2 = pickle.loads(serialized_object)
|
|
assert type(est2) == est.__class__
|
|
score2 = est2.score(X, y)
|
|
assert score == score2, (
|
|
"Failed to generate same score after pickling "
|
|
"with {0}".format(name))
|
|
|
|
for attribute in fitted_attribute:
|
|
assert (getattr(est2.tree_, attribute) ==
|
|
fitted_attribute[attribute]), (
|
|
"Failed to generate same attribute {0} after "
|
|
"pickling with {1}".format(attribute, name))
|
|
|
|
|
|
def test_multioutput():
|
|
# Check estimators on multi-output problems.
|
|
X = [[-2, -1],
|
|
[-1, -1],
|
|
[-1, -2],
|
|
[1, 1],
|
|
[1, 2],
|
|
[2, 1],
|
|
[-2, 1],
|
|
[-1, 1],
|
|
[-1, 2],
|
|
[2, -1],
|
|
[1, -1],
|
|
[1, -2]]
|
|
|
|
y = [[-1, 0],
|
|
[-1, 0],
|
|
[-1, 0],
|
|
[1, 1],
|
|
[1, 1],
|
|
[1, 1],
|
|
[-1, 2],
|
|
[-1, 2],
|
|
[-1, 2],
|
|
[1, 3],
|
|
[1, 3],
|
|
[1, 3]]
|
|
|
|
T = [[-1, -1], [1, 1], [-1, 1], [1, -1]]
|
|
y_true = [[-1, 0], [1, 1], [-1, 2], [1, 3]]
|
|
|
|
# toy classification problem
|
|
for name, TreeClassifier in CLF_TREES.items():
|
|
clf = TreeClassifier(random_state=0)
|
|
y_hat = clf.fit(X, y).predict(T)
|
|
assert_array_equal(y_hat, y_true)
|
|
assert y_hat.shape == (4, 2)
|
|
|
|
proba = clf.predict_proba(T)
|
|
assert len(proba) == 2
|
|
assert proba[0].shape == (4, 2)
|
|
assert proba[1].shape == (4, 4)
|
|
|
|
log_proba = clf.predict_log_proba(T)
|
|
assert len(log_proba) == 2
|
|
assert log_proba[0].shape == (4, 2)
|
|
assert log_proba[1].shape == (4, 4)
|
|
|
|
# toy regression problem
|
|
for name, TreeRegressor in REG_TREES.items():
|
|
reg = TreeRegressor(random_state=0)
|
|
y_hat = reg.fit(X, y).predict(T)
|
|
assert_almost_equal(y_hat, y_true)
|
|
assert y_hat.shape == (4, 2)
|
|
|
|
|
|
def test_classes_shape():
|
|
# Test that n_classes_ and classes_ have proper shape.
|
|
for name, TreeClassifier in CLF_TREES.items():
|
|
# Classification, single output
|
|
clf = TreeClassifier(random_state=0)
|
|
clf.fit(X, y)
|
|
|
|
assert clf.n_classes_ == 2
|
|
assert_array_equal(clf.classes_, [-1, 1])
|
|
|
|
# Classification, multi-output
|
|
_y = np.vstack((y, np.array(y) * 2)).T
|
|
clf = TreeClassifier(random_state=0)
|
|
clf.fit(X, _y)
|
|
assert len(clf.n_classes_) == 2
|
|
assert len(clf.classes_) == 2
|
|
assert_array_equal(clf.n_classes_, [2, 2])
|
|
assert_array_equal(clf.classes_, [[-1, 1], [-2, 2]])
|
|
|
|
|
|
def test_unbalanced_iris():
|
|
# Check class rebalancing.
|
|
unbalanced_X = iris.data[:125]
|
|
unbalanced_y = iris.target[:125]
|
|
sample_weight = compute_sample_weight("balanced", unbalanced_y)
|
|
|
|
for name, TreeClassifier in CLF_TREES.items():
|
|
clf = TreeClassifier(random_state=0)
|
|
clf.fit(unbalanced_X, unbalanced_y, sample_weight=sample_weight)
|
|
assert_almost_equal(clf.predict(unbalanced_X), unbalanced_y)
|
|
|
|
|
|
def test_memory_layout():
|
|
# Check that it works no matter the memory layout
|
|
for (name, TreeEstimator), dtype in product(ALL_TREES.items(),
|
|
[np.float64, np.float32]):
|
|
est = TreeEstimator(random_state=0)
|
|
|
|
# Nothing
|
|
X = np.asarray(iris.data, dtype=dtype)
|
|
y = iris.target
|
|
assert_array_equal(est.fit(X, y).predict(X), y)
|
|
|
|
# C-order
|
|
X = np.asarray(iris.data, order="C", dtype=dtype)
|
|
y = iris.target
|
|
assert_array_equal(est.fit(X, y).predict(X), y)
|
|
|
|
# F-order
|
|
X = np.asarray(iris.data, order="F", dtype=dtype)
|
|
y = iris.target
|
|
assert_array_equal(est.fit(X, y).predict(X), y)
|
|
|
|
# Contiguous
|
|
X = np.ascontiguousarray(iris.data, dtype=dtype)
|
|
y = iris.target
|
|
assert_array_equal(est.fit(X, y).predict(X), y)
|
|
|
|
# csr matrix
|
|
X = csr_matrix(iris.data, dtype=dtype)
|
|
y = iris.target
|
|
assert_array_equal(est.fit(X, y).predict(X), y)
|
|
|
|
# csc_matrix
|
|
X = csc_matrix(iris.data, dtype=dtype)
|
|
y = iris.target
|
|
assert_array_equal(est.fit(X, y).predict(X), y)
|
|
|
|
# Strided
|
|
X = np.asarray(iris.data[::3], dtype=dtype)
|
|
y = iris.target[::3]
|
|
assert_array_equal(est.fit(X, y).predict(X), y)
|
|
|
|
|
|
def test_sample_weight():
|
|
# Check sample weighting.
|
|
# Test that zero-weighted samples are not taken into account
|
|
X = np.arange(100)[:, np.newaxis]
|
|
y = np.ones(100)
|
|
y[:50] = 0.0
|
|
|
|
sample_weight = np.ones(100)
|
|
sample_weight[y == 0] = 0.0
|
|
|
|
clf = DecisionTreeClassifier(random_state=0)
|
|
clf.fit(X, y, sample_weight=sample_weight)
|
|
assert_array_equal(clf.predict(X), np.ones(100))
|
|
|
|
# Test that low weighted samples are not taken into account at low depth
|
|
X = np.arange(200)[:, np.newaxis]
|
|
y = np.zeros(200)
|
|
y[50:100] = 1
|
|
y[100:200] = 2
|
|
X[100:200, 0] = 200
|
|
|
|
sample_weight = np.ones(200)
|
|
|
|
sample_weight[y == 2] = .51 # Samples of class '2' are still weightier
|
|
clf = DecisionTreeClassifier(max_depth=1, random_state=0)
|
|
clf.fit(X, y, sample_weight=sample_weight)
|
|
assert clf.tree_.threshold[0] == 149.5
|
|
|
|
sample_weight[y == 2] = .5 # Samples of class '2' are no longer weightier
|
|
clf = DecisionTreeClassifier(max_depth=1, random_state=0)
|
|
clf.fit(X, y, sample_weight=sample_weight)
|
|
assert clf.tree_.threshold[0] == 49.5 # Threshold should have moved
|
|
|
|
# Test that sample weighting is the same as having duplicates
|
|
X = iris.data
|
|
y = iris.target
|
|
|
|
duplicates = rng.randint(0, X.shape[0], 100)
|
|
|
|
clf = DecisionTreeClassifier(random_state=1)
|
|
clf.fit(X[duplicates], y[duplicates])
|
|
|
|
sample_weight = np.bincount(duplicates, minlength=X.shape[0])
|
|
clf2 = DecisionTreeClassifier(random_state=1)
|
|
clf2.fit(X, y, sample_weight=sample_weight)
|
|
|
|
internal = clf.tree_.children_left != tree._tree.TREE_LEAF
|
|
assert_array_almost_equal(clf.tree_.threshold[internal],
|
|
clf2.tree_.threshold[internal])
|
|
|
|
|
|
def test_sample_weight_invalid():
|
|
# Check sample weighting raises errors.
|
|
X = np.arange(100)[:, np.newaxis]
|
|
y = np.ones(100)
|
|
y[:50] = 0.0
|
|
|
|
clf = DecisionTreeClassifier(random_state=0)
|
|
|
|
sample_weight = np.random.rand(100, 1)
|
|
with pytest.raises(ValueError):
|
|
clf.fit(X, y, sample_weight=sample_weight)
|
|
|
|
sample_weight = np.array(0)
|
|
expected_err = r"Singleton.* cannot be considered a valid collection"
|
|
with pytest.raises(TypeError, match=expected_err):
|
|
clf.fit(X, y, sample_weight=sample_weight)
|
|
|
|
|
|
def check_class_weights(name):
|
|
"""Check class_weights resemble sample_weights behavior."""
|
|
TreeClassifier = CLF_TREES[name]
|
|
|
|
# Iris is balanced, so no effect expected for using 'balanced' weights
|
|
clf1 = TreeClassifier(random_state=0)
|
|
clf1.fit(iris.data, iris.target)
|
|
clf2 = TreeClassifier(class_weight='balanced', random_state=0)
|
|
clf2.fit(iris.data, iris.target)
|
|
assert_almost_equal(clf1.feature_importances_, clf2.feature_importances_)
|
|
|
|
# Make a multi-output problem with three copies of Iris
|
|
iris_multi = np.vstack((iris.target, iris.target, iris.target)).T
|
|
# Create user-defined weights that should balance over the outputs
|
|
clf3 = TreeClassifier(class_weight=[{0: 2., 1: 2., 2: 1.},
|
|
{0: 2., 1: 1., 2: 2.},
|
|
{0: 1., 1: 2., 2: 2.}],
|
|
random_state=0)
|
|
clf3.fit(iris.data, iris_multi)
|
|
assert_almost_equal(clf2.feature_importances_, clf3.feature_importances_)
|
|
# Check against multi-output "auto" which should also have no effect
|
|
clf4 = TreeClassifier(class_weight='balanced', random_state=0)
|
|
clf4.fit(iris.data, iris_multi)
|
|
assert_almost_equal(clf3.feature_importances_, clf4.feature_importances_)
|
|
|
|
# Inflate importance of class 1, check against user-defined weights
|
|
sample_weight = np.ones(iris.target.shape)
|
|
sample_weight[iris.target == 1] *= 100
|
|
class_weight = {0: 1., 1: 100., 2: 1.}
|
|
clf1 = TreeClassifier(random_state=0)
|
|
clf1.fit(iris.data, iris.target, sample_weight)
|
|
clf2 = TreeClassifier(class_weight=class_weight, random_state=0)
|
|
clf2.fit(iris.data, iris.target)
|
|
assert_almost_equal(clf1.feature_importances_, clf2.feature_importances_)
|
|
|
|
# Check that sample_weight and class_weight are multiplicative
|
|
clf1 = TreeClassifier(random_state=0)
|
|
clf1.fit(iris.data, iris.target, sample_weight ** 2)
|
|
clf2 = TreeClassifier(class_weight=class_weight, random_state=0)
|
|
clf2.fit(iris.data, iris.target, sample_weight)
|
|
assert_almost_equal(clf1.feature_importances_, clf2.feature_importances_)
|
|
|
|
|
|
@pytest.mark.parametrize("name", CLF_TREES)
|
|
def test_class_weights(name):
|
|
check_class_weights(name)
|
|
|
|
|
|
def check_class_weight_errors(name):
|
|
# Test if class_weight raises errors and warnings when expected.
|
|
TreeClassifier = CLF_TREES[name]
|
|
_y = np.vstack((y, np.array(y) * 2)).T
|
|
|
|
# Invalid preset string
|
|
clf = TreeClassifier(class_weight='the larch', random_state=0)
|
|
with pytest.raises(ValueError):
|
|
clf.fit(X, y)
|
|
with pytest.raises(ValueError):
|
|
clf.fit(X, _y)
|
|
|
|
# Not a list or preset for multi-output
|
|
clf = TreeClassifier(class_weight=1, random_state=0)
|
|
with pytest.raises(ValueError):
|
|
clf.fit(X, _y)
|
|
|
|
# Incorrect length list for multi-output
|
|
clf = TreeClassifier(class_weight=[{-1: 0.5, 1: 1.}], random_state=0)
|
|
with pytest.raises(ValueError):
|
|
clf.fit(X, _y)
|
|
|
|
|
|
@pytest.mark.parametrize("name", CLF_TREES)
|
|
def test_class_weight_errors(name):
|
|
check_class_weight_errors(name)
|
|
|
|
|
|
def test_max_leaf_nodes():
|
|
# Test greedy trees with max_depth + 1 leafs.
|
|
X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
|
|
k = 4
|
|
for name, TreeEstimator in ALL_TREES.items():
|
|
est = TreeEstimator(max_depth=None, max_leaf_nodes=k + 1).fit(X, y)
|
|
assert est.get_n_leaves() == k + 1
|
|
|
|
# max_leaf_nodes in (0, 1) should raise ValueError
|
|
est = TreeEstimator(max_depth=None, max_leaf_nodes=0)
|
|
with pytest.raises(ValueError):
|
|
est.fit(X, y)
|
|
est = TreeEstimator(max_depth=None, max_leaf_nodes=1)
|
|
with pytest.raises(ValueError):
|
|
est.fit(X, y)
|
|
est = TreeEstimator(max_depth=None, max_leaf_nodes=0.1)
|
|
with pytest.raises(ValueError):
|
|
est.fit(X, y)
|
|
|
|
|
|
def test_max_leaf_nodes_max_depth():
|
|
# Test precedence of max_leaf_nodes over max_depth.
|
|
X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
|
|
k = 4
|
|
for name, TreeEstimator in ALL_TREES.items():
|
|
est = TreeEstimator(max_depth=1, max_leaf_nodes=k).fit(X, y)
|
|
assert est.get_depth() == 1
|
|
|
|
|
|
def test_arrays_persist():
|
|
# Ensure property arrays' memory stays alive when tree disappears
|
|
# non-regression for #2726
|
|
for attr in ['n_classes', 'value', 'children_left', 'children_right',
|
|
'threshold', 'impurity', 'feature', 'n_node_samples']:
|
|
value = getattr(DecisionTreeClassifier().fit([[0], [1]],
|
|
[0, 1]).tree_, attr)
|
|
# if pointing to freed memory, contents may be arbitrary
|
|
assert -3 <= value.flat[0] < 3, \
|
|
'Array points to arbitrary memory'
|
|
|
|
|
|
def test_only_constant_features():
|
|
random_state = check_random_state(0)
|
|
X = np.zeros((10, 20))
|
|
y = random_state.randint(0, 2, (10, ))
|
|
for name, TreeEstimator in ALL_TREES.items():
|
|
est = TreeEstimator(random_state=0)
|
|
est.fit(X, y)
|
|
assert est.tree_.max_depth == 0
|
|
|
|
|
|
def test_behaviour_constant_feature_after_splits():
|
|
X = np.transpose(np.vstack(([[0, 0, 0, 0, 0, 1, 2, 4, 5, 6, 7]],
|
|
np.zeros((4, 11)))))
|
|
y = [0, 0, 0, 1, 1, 2, 2, 2, 3, 3, 3]
|
|
for name, TreeEstimator in ALL_TREES.items():
|
|
# do not check extra random trees
|
|
if "ExtraTree" not in name:
|
|
est = TreeEstimator(random_state=0, max_features=1)
|
|
est.fit(X, y)
|
|
assert est.tree_.max_depth == 2
|
|
assert est.tree_.node_count == 5
|
|
|
|
|
|
def test_with_only_one_non_constant_features():
|
|
X = np.hstack([np.array([[1.], [1.], [0.], [0.]]),
|
|
np.zeros((4, 1000))])
|
|
|
|
y = np.array([0., 1., 0., 1.0])
|
|
for name, TreeEstimator in CLF_TREES.items():
|
|
est = TreeEstimator(random_state=0, max_features=1)
|
|
est.fit(X, y)
|
|
assert est.tree_.max_depth == 1
|
|
assert_array_equal(est.predict_proba(X), np.full((4, 2), 0.5))
|
|
|
|
for name, TreeEstimator in REG_TREES.items():
|
|
est = TreeEstimator(random_state=0, max_features=1)
|
|
est.fit(X, y)
|
|
assert est.tree_.max_depth == 1
|
|
assert_array_equal(est.predict(X), np.full((4, ), 0.5))
|
|
|
|
|
|
def test_big_input():
|
|
# Test if the warning for too large inputs is appropriate.
|
|
X = np.repeat(10 ** 40., 4).astype(np.float64).reshape(-1, 1)
|
|
clf = DecisionTreeClassifier()
|
|
try:
|
|
clf.fit(X, [0, 1, 0, 1])
|
|
except ValueError as e:
|
|
assert "float32" in str(e)
|
|
|
|
|
|
def test_realloc():
|
|
from sklearn.tree._utils import _realloc_test
|
|
with pytest.raises(MemoryError):
|
|
_realloc_test()
|
|
|
|
|
|
def test_huge_allocations():
|
|
n_bits = 8 * struct.calcsize("P")
|
|
|
|
X = np.random.randn(10, 2)
|
|
y = np.random.randint(0, 2, 10)
|
|
|
|
# Sanity check: we cannot request more memory than the size of the address
|
|
# space. Currently raises OverflowError.
|
|
huge = 2 ** (n_bits + 1)
|
|
clf = DecisionTreeClassifier(splitter='best', max_leaf_nodes=huge)
|
|
with pytest.raises(Exception):
|
|
clf.fit(X, y)
|
|
|
|
# Non-regression test: MemoryError used to be dropped by Cython
|
|
# because of missing "except *".
|
|
huge = 2 ** (n_bits - 1) - 1
|
|
clf = DecisionTreeClassifier(splitter='best', max_leaf_nodes=huge)
|
|
with pytest.raises(MemoryError):
|
|
clf.fit(X, y)
|
|
|
|
|
|
def check_sparse_input(tree, dataset, max_depth=None):
|
|
TreeEstimator = ALL_TREES[tree]
|
|
X = DATASETS[dataset]["X"]
|
|
X_sparse = DATASETS[dataset]["X_sparse"]
|
|
y = DATASETS[dataset]["y"]
|
|
|
|
# Gain testing time
|
|
if dataset in ["digits", "boston"]:
|
|
n_samples = X.shape[0] // 5
|
|
X = X[:n_samples]
|
|
X_sparse = X_sparse[:n_samples]
|
|
y = y[:n_samples]
|
|
|
|
for sparse_format in (csr_matrix, csc_matrix, coo_matrix):
|
|
X_sparse = sparse_format(X_sparse)
|
|
|
|
# Check the default (depth first search)
|
|
d = TreeEstimator(random_state=0, max_depth=max_depth).fit(X, y)
|
|
s = TreeEstimator(random_state=0, max_depth=max_depth).fit(X_sparse, y)
|
|
|
|
assert_tree_equal(d.tree_, s.tree_,
|
|
"{0} with dense and sparse format gave different "
|
|
"trees".format(tree))
|
|
|
|
y_pred = d.predict(X)
|
|
if tree in CLF_TREES:
|
|
y_proba = d.predict_proba(X)
|
|
y_log_proba = d.predict_log_proba(X)
|
|
|
|
for sparse_matrix in (csr_matrix, csc_matrix, coo_matrix):
|
|
X_sparse_test = sparse_matrix(X_sparse, dtype=np.float32)
|
|
|
|
assert_array_almost_equal(s.predict(X_sparse_test), y_pred)
|
|
|
|
if tree in CLF_TREES:
|
|
assert_array_almost_equal(s.predict_proba(X_sparse_test),
|
|
y_proba)
|
|
assert_array_almost_equal(s.predict_log_proba(X_sparse_test),
|
|
y_log_proba)
|
|
|
|
|
|
@pytest.mark.parametrize("tree_type", SPARSE_TREES)
|
|
@pytest.mark.parametrize(
|
|
"dataset",
|
|
("clf_small", "toy", "digits", "multilabel",
|
|
"sparse-pos", "sparse-neg", "sparse-mix",
|
|
"zeros")
|
|
)
|
|
def test_sparse_input(tree_type, dataset):
|
|
max_depth = 3 if dataset == "digits" else None
|
|
check_sparse_input(tree_type, dataset, max_depth)
|
|
|
|
|
|
@pytest.mark.parametrize("tree_type",
|
|
sorted(set(SPARSE_TREES).intersection(REG_TREES)))
|
|
@pytest.mark.parametrize("dataset", ["boston", "reg_small"])
|
|
def test_sparse_input_reg_trees(tree_type, dataset):
|
|
# Due to numerical instability of MSE and too strict test, we limit the
|
|
# maximal depth
|
|
check_sparse_input(tree_type, dataset, 2)
|
|
|
|
|
|
def check_sparse_parameters(tree, dataset):
|
|
TreeEstimator = ALL_TREES[tree]
|
|
X = DATASETS[dataset]["X"]
|
|
X_sparse = DATASETS[dataset]["X_sparse"]
|
|
y = DATASETS[dataset]["y"]
|
|
|
|
# Check max_features
|
|
d = TreeEstimator(random_state=0, max_features=1, max_depth=2).fit(X, y)
|
|
s = TreeEstimator(random_state=0, max_features=1,
|
|
max_depth=2).fit(X_sparse, y)
|
|
assert_tree_equal(d.tree_, s.tree_,
|
|
"{0} with dense and sparse format gave different "
|
|
"trees".format(tree))
|
|
assert_array_almost_equal(s.predict(X), d.predict(X))
|
|
|
|
# Check min_samples_split
|
|
d = TreeEstimator(random_state=0, max_features=1,
|
|
min_samples_split=10).fit(X, y)
|
|
s = TreeEstimator(random_state=0, max_features=1,
|
|
min_samples_split=10).fit(X_sparse, y)
|
|
assert_tree_equal(d.tree_, s.tree_,
|
|
"{0} with dense and sparse format gave different "
|
|
"trees".format(tree))
|
|
assert_array_almost_equal(s.predict(X), d.predict(X))
|
|
|
|
# Check min_samples_leaf
|
|
d = TreeEstimator(random_state=0,
|
|
min_samples_leaf=X_sparse.shape[0] // 2).fit(X, y)
|
|
s = TreeEstimator(random_state=0,
|
|
min_samples_leaf=X_sparse.shape[0] // 2).fit(X_sparse, y)
|
|
assert_tree_equal(d.tree_, s.tree_,
|
|
"{0} with dense and sparse format gave different "
|
|
"trees".format(tree))
|
|
assert_array_almost_equal(s.predict(X), d.predict(X))
|
|
|
|
# Check best-first search
|
|
d = TreeEstimator(random_state=0, max_leaf_nodes=3).fit(X, y)
|
|
s = TreeEstimator(random_state=0, max_leaf_nodes=3).fit(X_sparse, y)
|
|
assert_tree_equal(d.tree_, s.tree_,
|
|
"{0} with dense and sparse format gave different "
|
|
"trees".format(tree))
|
|
assert_array_almost_equal(s.predict(X), d.predict(X))
|
|
|
|
|
|
def check_sparse_criterion(tree, dataset):
|
|
TreeEstimator = ALL_TREES[tree]
|
|
X = DATASETS[dataset]["X"]
|
|
X_sparse = DATASETS[dataset]["X_sparse"]
|
|
y = DATASETS[dataset]["y"]
|
|
|
|
# Check various criterion
|
|
CRITERIONS = REG_CRITERIONS if tree in REG_TREES else CLF_CRITERIONS
|
|
for criterion in CRITERIONS:
|
|
d = TreeEstimator(random_state=0, max_depth=3,
|
|
criterion=criterion).fit(X, y)
|
|
s = TreeEstimator(random_state=0, max_depth=3,
|
|
criterion=criterion).fit(X_sparse, y)
|
|
|
|
assert_tree_equal(d.tree_, s.tree_,
|
|
"{0} with dense and sparse format gave different "
|
|
"trees".format(tree))
|
|
assert_array_almost_equal(s.predict(X), d.predict(X))
|
|
|
|
|
|
@pytest.mark.parametrize("tree_type", SPARSE_TREES)
|
|
@pytest.mark.parametrize("dataset",
|
|
["sparse-pos", "sparse-neg", "sparse-mix", "zeros"])
|
|
@pytest.mark.parametrize("check",
|
|
[check_sparse_parameters, check_sparse_criterion])
|
|
def test_sparse(tree_type, dataset, check):
|
|
check(tree_type, dataset)
|
|
|
|
|
|
def check_explicit_sparse_zeros(tree, max_depth=3,
|
|
n_features=10):
|
|
TreeEstimator = ALL_TREES[tree]
|
|
|
|
# n_samples set n_feature to ease construction of a simultaneous
|
|
# construction of a csr and csc matrix
|
|
n_samples = n_features
|
|
samples = np.arange(n_samples)
|
|
|
|
# Generate X, y
|
|
random_state = check_random_state(0)
|
|
indices = []
|
|
data = []
|
|
offset = 0
|
|
indptr = [offset]
|
|
for i in range(n_features):
|
|
n_nonzero_i = random_state.binomial(n_samples, 0.5)
|
|
indices_i = random_state.permutation(samples)[:n_nonzero_i]
|
|
indices.append(indices_i)
|
|
data_i = random_state.binomial(3, 0.5, size=(n_nonzero_i, )) - 1
|
|
data.append(data_i)
|
|
offset += n_nonzero_i
|
|
indptr.append(offset)
|
|
|
|
indices = np.concatenate(indices)
|
|
data = np.array(np.concatenate(data), dtype=np.float32)
|
|
X_sparse = csc_matrix((data, indices, indptr),
|
|
shape=(n_samples, n_features))
|
|
X = X_sparse.toarray()
|
|
X_sparse_test = csr_matrix((data, indices, indptr),
|
|
shape=(n_samples, n_features))
|
|
X_test = X_sparse_test.toarray()
|
|
y = random_state.randint(0, 3, size=(n_samples, ))
|
|
|
|
# Ensure that X_sparse_test owns its data, indices and indptr array
|
|
X_sparse_test = X_sparse_test.copy()
|
|
|
|
# Ensure that we have explicit zeros
|
|
assert (X_sparse.data == 0.).sum() > 0
|
|
assert (X_sparse_test.data == 0.).sum() > 0
|
|
|
|
# Perform the comparison
|
|
d = TreeEstimator(random_state=0, max_depth=max_depth).fit(X, y)
|
|
s = TreeEstimator(random_state=0, max_depth=max_depth).fit(X_sparse, y)
|
|
|
|
assert_tree_equal(d.tree_, s.tree_,
|
|
"{0} with dense and sparse format gave different "
|
|
"trees".format(tree))
|
|
|
|
Xs = (X_test, X_sparse_test)
|
|
for X1, X2 in product(Xs, Xs):
|
|
assert_array_almost_equal(s.tree_.apply(X1), d.tree_.apply(X2))
|
|
assert_array_almost_equal(s.apply(X1), d.apply(X2))
|
|
assert_array_almost_equal(s.apply(X1), s.tree_.apply(X1))
|
|
|
|
assert_array_almost_equal(s.tree_.decision_path(X1).toarray(),
|
|
d.tree_.decision_path(X2).toarray())
|
|
assert_array_almost_equal(s.decision_path(X1).toarray(),
|
|
d.decision_path(X2).toarray())
|
|
assert_array_almost_equal(s.decision_path(X1).toarray(),
|
|
s.tree_.decision_path(X1).toarray())
|
|
|
|
assert_array_almost_equal(s.predict(X1), d.predict(X2))
|
|
|
|
if tree in CLF_TREES:
|
|
assert_array_almost_equal(s.predict_proba(X1),
|
|
d.predict_proba(X2))
|
|
|
|
|
|
@pytest.mark.parametrize("tree_type", SPARSE_TREES)
|
|
def test_explicit_sparse_zeros(tree_type):
|
|
check_explicit_sparse_zeros(tree_type)
|
|
|
|
|
|
@ignore_warnings
|
|
def check_raise_error_on_1d_input(name):
|
|
TreeEstimator = ALL_TREES[name]
|
|
|
|
X = iris.data[:, 0].ravel()
|
|
X_2d = iris.data[:, 0].reshape((-1, 1))
|
|
y = iris.target
|
|
|
|
with pytest.raises(ValueError):
|
|
TreeEstimator(random_state=0).fit(X, y)
|
|
|
|
est = TreeEstimator(random_state=0)
|
|
est.fit(X_2d, y)
|
|
with pytest.raises(ValueError):
|
|
est.predict([X])
|
|
|
|
|
|
@pytest.mark.parametrize("name", ALL_TREES)
|
|
def test_1d_input(name):
|
|
with ignore_warnings():
|
|
check_raise_error_on_1d_input(name)
|
|
|
|
|
|
def _check_min_weight_leaf_split_level(TreeEstimator, X, y, sample_weight):
|
|
est = TreeEstimator(random_state=0)
|
|
est.fit(X, y, sample_weight=sample_weight)
|
|
assert est.tree_.max_depth == 1
|
|
|
|
est = TreeEstimator(random_state=0, min_weight_fraction_leaf=0.4)
|
|
est.fit(X, y, sample_weight=sample_weight)
|
|
assert est.tree_.max_depth == 0
|
|
|
|
|
|
def check_min_weight_leaf_split_level(name):
|
|
TreeEstimator = ALL_TREES[name]
|
|
|
|
X = np.array([[0], [0], [0], [0], [1]])
|
|
y = [0, 0, 0, 0, 1]
|
|
sample_weight = [0.2, 0.2, 0.2, 0.2, 0.2]
|
|
_check_min_weight_leaf_split_level(TreeEstimator, X, y, sample_weight)
|
|
|
|
_check_min_weight_leaf_split_level(TreeEstimator, csc_matrix(X), y,
|
|
sample_weight)
|
|
|
|
|
|
@pytest.mark.parametrize("name", ALL_TREES)
|
|
def test_min_weight_leaf_split_level(name):
|
|
check_min_weight_leaf_split_level(name)
|
|
|
|
|
|
def check_public_apply(name):
|
|
X_small32 = X_small.astype(tree._tree.DTYPE, copy=False)
|
|
|
|
est = ALL_TREES[name]()
|
|
est.fit(X_small, y_small)
|
|
assert_array_equal(est.apply(X_small),
|
|
est.tree_.apply(X_small32))
|
|
|
|
|
|
def check_public_apply_sparse(name):
|
|
X_small32 = csr_matrix(X_small.astype(tree._tree.DTYPE, copy=False))
|
|
|
|
est = ALL_TREES[name]()
|
|
est.fit(X_small, y_small)
|
|
assert_array_equal(est.apply(X_small),
|
|
est.tree_.apply(X_small32))
|
|
|
|
|
|
@pytest.mark.parametrize("name", ALL_TREES)
|
|
def test_public_apply_all_trees(name):
|
|
check_public_apply(name)
|
|
|
|
|
|
@pytest.mark.parametrize("name", SPARSE_TREES)
|
|
def test_public_apply_sparse_trees(name):
|
|
check_public_apply_sparse(name)
|
|
|
|
|
|
@pytest.mark.parametrize('Cls',
|
|
(DecisionTreeRegressor, DecisionTreeClassifier))
|
|
@pytest.mark.parametrize('presort', ['auto', True, False])
|
|
def test_presort_deprecated(Cls, presort):
|
|
# TODO: remove in v0.24
|
|
X = np.zeros((10, 10))
|
|
y = np.r_[[0] * 5, [1] * 5]
|
|
tree = Cls(presort=presort)
|
|
with pytest.warns(FutureWarning,
|
|
match="The parameter 'presort' is deprecated "):
|
|
tree.fit(X, y)
|
|
|
|
|
|
def test_decision_path_hardcoded():
|
|
X = iris.data
|
|
y = iris.target
|
|
est = DecisionTreeClassifier(random_state=0, max_depth=1).fit(X, y)
|
|
node_indicator = est.decision_path(X[:2]).toarray()
|
|
assert_array_equal(node_indicator, [[1, 1, 0], [1, 0, 1]])
|
|
|
|
|
|
def check_decision_path(name):
|
|
X = iris.data
|
|
y = iris.target
|
|
n_samples = X.shape[0]
|
|
|
|
TreeEstimator = ALL_TREES[name]
|
|
est = TreeEstimator(random_state=0, max_depth=2)
|
|
est.fit(X, y)
|
|
|
|
node_indicator_csr = est.decision_path(X)
|
|
node_indicator = node_indicator_csr.toarray()
|
|
assert node_indicator.shape == (n_samples, est.tree_.node_count)
|
|
|
|
# Assert that leaves index are correct
|
|
leaves = est.apply(X)
|
|
leave_indicator = [node_indicator[i, j] for i, j in enumerate(leaves)]
|
|
assert_array_almost_equal(leave_indicator, np.ones(shape=n_samples))
|
|
|
|
# Ensure only one leave node per sample
|
|
all_leaves = est.tree_.children_left == TREE_LEAF
|
|
assert_array_almost_equal(np.dot(node_indicator, all_leaves),
|
|
np.ones(shape=n_samples))
|
|
|
|
# Ensure max depth is consistent with sum of indicator
|
|
max_depth = node_indicator.sum(axis=1).max()
|
|
assert est.tree_.max_depth <= max_depth
|
|
|
|
|
|
@pytest.mark.parametrize("name", ALL_TREES)
|
|
def test_decision_path(name):
|
|
check_decision_path(name)
|
|
|
|
|
|
def check_no_sparse_y_support(name):
|
|
X, y = X_multilabel, csr_matrix(y_multilabel)
|
|
TreeEstimator = ALL_TREES[name]
|
|
with pytest.raises(TypeError):
|
|
TreeEstimator(random_state=0).fit(X, y)
|
|
|
|
|
|
@pytest.mark.parametrize("name", ALL_TREES)
|
|
def test_no_sparse_y_support(name):
|
|
# Currently we don't support sparse y
|
|
check_no_sparse_y_support(name)
|
|
|
|
|
|
def test_mae():
|
|
"""Check MAE criterion produces correct results on small toy dataset:
|
|
|
|
------------------
|
|
| X | y | weight |
|
|
------------------
|
|
| 3 | 3 | 0.1 |
|
|
| 5 | 3 | 0.3 |
|
|
| 8 | 4 | 1.0 |
|
|
| 3 | 6 | 0.6 |
|
|
| 5 | 7 | 0.3 |
|
|
------------------
|
|
|sum wt:| 2.3 |
|
|
------------------
|
|
|
|
Because we are dealing with sample weights, we cannot find the median by
|
|
simply choosing/averaging the centre value(s), instead we consider the
|
|
median where 50% of the cumulative weight is found (in a y sorted data set)
|
|
. Therefore with regards to this test data, the cumulative weight is >= 50%
|
|
when y = 4. Therefore:
|
|
Median = 4
|
|
|
|
For all the samples, we can get the total error by summing:
|
|
Absolute(Median - y) * weight
|
|
|
|
I.e., total error = (Absolute(4 - 3) * 0.1)
|
|
+ (Absolute(4 - 3) * 0.3)
|
|
+ (Absolute(4 - 4) * 1.0)
|
|
+ (Absolute(4 - 6) * 0.6)
|
|
+ (Absolute(4 - 7) * 0.3)
|
|
= 2.5
|
|
|
|
Impurity = Total error / total weight
|
|
= 2.5 / 2.3
|
|
= 1.08695652173913
|
|
------------------
|
|
|
|
From this root node, the next best split is between X values of 3 and 5.
|
|
Thus, we have left and right child nodes:
|
|
|
|
LEFT RIGHT
|
|
------------------ ------------------
|
|
| X | y | weight | | X | y | weight |
|
|
------------------ ------------------
|
|
| 3 | 3 | 0.1 | | 5 | 3 | 0.3 |
|
|
| 3 | 6 | 0.6 | | 8 | 4 | 1.0 |
|
|
------------------ | 5 | 7 | 0.3 |
|
|
|sum wt:| 0.7 | ------------------
|
|
------------------ |sum wt:| 1.6 |
|
|
------------------
|
|
|
|
Impurity is found in the same way:
|
|
Left node Median = 6
|
|
Total error = (Absolute(6 - 3) * 0.1)
|
|
+ (Absolute(6 - 6) * 0.6)
|
|
= 0.3
|
|
|
|
Left Impurity = Total error / total weight
|
|
= 0.3 / 0.7
|
|
= 0.428571428571429
|
|
-------------------
|
|
|
|
Likewise for Right node:
|
|
Right node Median = 4
|
|
Total error = (Absolute(4 - 3) * 0.3)
|
|
+ (Absolute(4 - 4) * 1.0)
|
|
+ (Absolute(4 - 7) * 0.3)
|
|
= 1.2
|
|
|
|
Right Impurity = Total error / total weight
|
|
= 1.2 / 1.6
|
|
= 0.75
|
|
------
|
|
"""
|
|
dt_mae = DecisionTreeRegressor(random_state=0, criterion="mae",
|
|
max_leaf_nodes=2)
|
|
|
|
# Test MAE where sample weights are non-uniform (as illustrated above):
|
|
dt_mae.fit(X=[[3], [5], [3], [8], [5]], y=[6, 7, 3, 4, 3],
|
|
sample_weight=[0.6, 0.3, 0.1, 1.0, 0.3])
|
|
assert_allclose(dt_mae.tree_.impurity, [2.5 / 2.3, 0.3 / 0.7, 1.2 / 1.6])
|
|
assert_array_equal(dt_mae.tree_.value.flat, [4.0, 6.0, 4.0])
|
|
|
|
# Test MAE where all sample weights are uniform:
|
|
dt_mae.fit(X=[[3], [5], [3], [8], [5]], y=[6, 7, 3, 4, 3],
|
|
sample_weight=np.ones(5))
|
|
assert_array_equal(dt_mae.tree_.impurity, [1.4, 1.5, 4.0 / 3.0])
|
|
assert_array_equal(dt_mae.tree_.value.flat, [4, 4.5, 4.0])
|
|
|
|
# Test MAE where a `sample_weight` is not explicitly provided.
|
|
# This is equivalent to providing uniform sample weights, though
|
|
# the internal logic is different:
|
|
dt_mae.fit(X=[[3], [5], [3], [8], [5]], y=[6, 7, 3, 4, 3])
|
|
assert_array_equal(dt_mae.tree_.impurity, [1.4, 1.5, 4.0 / 3.0])
|
|
assert_array_equal(dt_mae.tree_.value.flat, [4, 4.5, 4.0])
|
|
|
|
|
|
def test_criterion_copy():
|
|
# Let's check whether copy of our criterion has the same type
|
|
# and properties as original
|
|
n_outputs = 3
|
|
n_classes = np.arange(3, dtype=np.intp)
|
|
n_samples = 100
|
|
|
|
def _pickle_copy(obj):
|
|
return pickle.loads(pickle.dumps(obj))
|
|
for copy_func in [copy.copy, copy.deepcopy, _pickle_copy]:
|
|
for _, typename in CRITERIA_CLF.items():
|
|
criteria = typename(n_outputs, n_classes)
|
|
result = copy_func(criteria).__reduce__()
|
|
typename_, (n_outputs_, n_classes_), _ = result
|
|
assert typename == typename_
|
|
assert n_outputs == n_outputs_
|
|
assert_array_equal(n_classes, n_classes_)
|
|
|
|
for _, typename in CRITERIA_REG.items():
|
|
criteria = typename(n_outputs, n_samples)
|
|
result = copy_func(criteria).__reduce__()
|
|
typename_, (n_outputs_, n_samples_), _ = result
|
|
assert typename == typename_
|
|
assert n_outputs == n_outputs_
|
|
assert n_samples == n_samples_
|
|
|
|
|
|
def test_empty_leaf_infinite_threshold():
|
|
# try to make empty leaf by using near infinite value.
|
|
data = np.random.RandomState(0).randn(100, 11) * 2e38
|
|
data = np.nan_to_num(data.astype('float32'))
|
|
X_full = data[:, :-1]
|
|
X_sparse = csc_matrix(X_full)
|
|
y = data[:, -1]
|
|
for X in [X_full, X_sparse]:
|
|
tree = DecisionTreeRegressor(random_state=0).fit(X, y)
|
|
terminal_regions = tree.apply(X)
|
|
left_leaf = set(np.where(tree.tree_.children_left == TREE_LEAF)[0])
|
|
empty_leaf = left_leaf.difference(terminal_regions)
|
|
infinite_threshold = np.where(~np.isfinite(tree.tree_.threshold))[0]
|
|
assert len(infinite_threshold) == 0
|
|
assert len(empty_leaf) == 0
|
|
|
|
|
|
@pytest.mark.parametrize("criterion", CLF_CRITERIONS)
|
|
@pytest.mark.parametrize(
|
|
"dataset", sorted(set(DATASETS.keys()) - {"reg_small", "boston"}))
|
|
@pytest.mark.parametrize(
|
|
"tree_cls", [DecisionTreeClassifier, ExtraTreeClassifier])
|
|
def test_prune_tree_classifier_are_subtrees(criterion, dataset, tree_cls):
|
|
dataset = DATASETS[dataset]
|
|
X, y = dataset["X"], dataset["y"]
|
|
est = tree_cls(max_leaf_nodes=20, random_state=0)
|
|
info = est.cost_complexity_pruning_path(X, y)
|
|
|
|
pruning_path = info.ccp_alphas
|
|
impurities = info.impurities
|
|
assert np.all(np.diff(pruning_path) >= 0)
|
|
assert np.all(np.diff(impurities) >= 0)
|
|
|
|
assert_pruning_creates_subtree(tree_cls, X, y, pruning_path)
|
|
|
|
|
|
@pytest.mark.parametrize("criterion", REG_CRITERIONS)
|
|
@pytest.mark.parametrize("dataset", DATASETS.keys())
|
|
@pytest.mark.parametrize(
|
|
"tree_cls", [DecisionTreeRegressor, ExtraTreeRegressor])
|
|
def test_prune_tree_regression_are_subtrees(criterion, dataset, tree_cls):
|
|
dataset = DATASETS[dataset]
|
|
X, y = dataset["X"], dataset["y"]
|
|
|
|
est = tree_cls(max_leaf_nodes=20, random_state=0)
|
|
info = est.cost_complexity_pruning_path(X, y)
|
|
|
|
pruning_path = info.ccp_alphas
|
|
impurities = info.impurities
|
|
assert np.all(np.diff(pruning_path) >= 0)
|
|
assert np.all(np.diff(impurities) >= 0)
|
|
|
|
assert_pruning_creates_subtree(tree_cls, X, y, pruning_path)
|
|
|
|
|
|
def test_prune_single_node_tree():
|
|
# single node tree
|
|
clf1 = DecisionTreeClassifier(random_state=0)
|
|
clf1.fit([[0], [1]], [0, 0])
|
|
|
|
# pruned single node tree
|
|
clf2 = DecisionTreeClassifier(random_state=0, ccp_alpha=10)
|
|
clf2.fit([[0], [1]], [0, 0])
|
|
|
|
assert_is_subtree(clf1.tree_, clf2.tree_)
|
|
|
|
|
|
def assert_pruning_creates_subtree(estimator_cls, X, y, pruning_path):
|
|
# generate trees with increasing alphas
|
|
estimators = []
|
|
for ccp_alpha in pruning_path:
|
|
est = estimator_cls(
|
|
max_leaf_nodes=20, ccp_alpha=ccp_alpha, random_state=0).fit(X, y)
|
|
estimators.append(est)
|
|
|
|
# A pruned tree must be a subtree of the previous tree (which had a
|
|
# smaller ccp_alpha)
|
|
for prev_est, next_est in zip(estimators, estimators[1:]):
|
|
assert_is_subtree(prev_est.tree_, next_est.tree_)
|
|
|
|
|
|
def assert_is_subtree(tree, subtree):
|
|
assert tree.node_count >= subtree.node_count
|
|
assert tree.max_depth >= subtree.max_depth
|
|
|
|
tree_c_left = tree.children_left
|
|
tree_c_right = tree.children_right
|
|
subtree_c_left = subtree.children_left
|
|
subtree_c_right = subtree.children_right
|
|
|
|
stack = [(0, 0)]
|
|
while stack:
|
|
tree_node_idx, subtree_node_idx = stack.pop()
|
|
assert_array_almost_equal(tree.value[tree_node_idx],
|
|
subtree.value[subtree_node_idx])
|
|
assert_almost_equal(tree.impurity[tree_node_idx],
|
|
subtree.impurity[subtree_node_idx])
|
|
assert_almost_equal(tree.n_node_samples[tree_node_idx],
|
|
subtree.n_node_samples[subtree_node_idx])
|
|
assert_almost_equal(tree.weighted_n_node_samples[tree_node_idx],
|
|
subtree.weighted_n_node_samples[subtree_node_idx])
|
|
|
|
if (subtree_c_left[subtree_node_idx] ==
|
|
subtree_c_right[subtree_node_idx]):
|
|
# is a leaf
|
|
assert_almost_equal(TREE_UNDEFINED,
|
|
subtree.threshold[subtree_node_idx])
|
|
else:
|
|
# not a leaf
|
|
assert_almost_equal(tree.threshold[tree_node_idx],
|
|
subtree.threshold[subtree_node_idx])
|
|
stack.append((tree_c_left[tree_node_idx],
|
|
subtree_c_left[subtree_node_idx]))
|
|
stack.append((tree_c_right[tree_node_idx],
|
|
subtree_c_right[subtree_node_idx]))
|
|
|
|
|
|
def test_prune_tree_raises_negative_ccp_alpha():
|
|
clf = DecisionTreeClassifier()
|
|
msg = "ccp_alpha must be greater than or equal to 0"
|
|
|
|
with pytest.raises(ValueError, match=msg):
|
|
clf.set_params(ccp_alpha=-1.0)
|
|
clf.fit(X, y)
|
|
|
|
clf.set_params(ccp_alpha=0.0)
|
|
clf.fit(X, y)
|
|
|
|
with pytest.raises(ValueError, match=msg):
|
|
clf.set_params(ccp_alpha=-1.0)
|
|
clf._prune_tree()
|
|
|
|
|
|
def test_classes_deprecated():
|
|
X = [[0, 0], [2, 2], [4, 6], [10, 11]]
|
|
y = [0.5, 2.5, 3.5, 5.5]
|
|
clf = DecisionTreeRegressor()
|
|
clf = clf.fit(X, y)
|
|
|
|
match = ("attribute is to be deprecated from version "
|
|
"0.22 and will be removed in 0.24.")
|
|
|
|
with pytest.warns(FutureWarning, match=match):
|
|
n = len(clf.classes_)
|
|
assert n == clf.n_outputs_
|
|
|
|
with pytest.warns(FutureWarning, match=match):
|
|
assert len(clf.n_classes_) == clf.n_outputs_
|
|
|
|
|
|
def check_apply_path_readonly(name):
|
|
X_readonly = create_memmap_backed_data(X_small.astype(tree._tree.DTYPE,
|
|
copy=False))
|
|
y_readonly = create_memmap_backed_data(np.array(y_small,
|
|
dtype=tree._tree.DTYPE))
|
|
est = ALL_TREES[name]()
|
|
est.fit(X_readonly, y_readonly)
|
|
assert_array_equal(est.predict(X_readonly),
|
|
est.predict(X_small))
|
|
assert_array_equal(est.decision_path(X_readonly).todense(),
|
|
est.decision_path(X_small).todense())
|
|
|
|
|
|
@pytest.mark.parametrize("name", ALL_TREES)
|
|
def test_apply_path_readonly_all_trees(name):
|
|
check_apply_path_readonly(name)
|