Using probly with scikit-learn

import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import fetch_covtype
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from probly.evaluation.tasks import selective_prediction
from probly.quantification.classification import total_entropy

Load the covertype dataset

# Fetch the covertype dataset and pull out the feature matrix and the labels.
data = fetch_covtype()
X = data.data
# Shift the labels down by one so that they are 0-indexed.
y = data.target - 1

Fit a random forest

# Hold out 30% of the instances for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# The tree depth is capped so that the trees do not produce only degenerate
# distributions.
rf = RandomForestClassifier(n_estimators=20, max_depth=10).fit(X_train, y_train)

# Collect one predicted distribution per tree. Stacking along axis 1 yields
# probly's convention of [n_instances, n_samples, n_classes] directly, with
# the individual trees playing the role of the samples.
probs = np.stack([tree.predict_proba(X_test) for tree in rf.estimators_], axis=1)

Make an accuracy-rejection curve using total uncertainty

# Total uncertainty of every test instance, computed from the per-tree
# distributions.
tu = total_entropy(probs)

# The ensemble prediction is the argmax of the distribution averaged over the
# trees (axis 1); each instance is then scored 1 if that prediction matches
# its label and 0 otherwise.
mean_probs = probs.mean(axis=1)
predicted = mean_probs.argmax(axis=1)
accuracies = (predicted == y_test).astype(int)

# NOTE(review): `auc` is used below as the area under the curve and `bins` as
# the curve values — confirm against probly's selective_prediction docs.
auc, bins = selective_prediction(tu, accuracies)

# Spread the returned values evenly over the [0, 1] range on the x-axis.
rejection_axis = np.linspace(0, 1, len(bins))
plt.plot(rejection_axis, bins)
plt.xlabel("Rejected Instances")
plt.ylabel("Accuracy")
plt.title(f"Accuracy-Rejection Curve, AUC: {auc:.3f}")
plt.show()

../../_images/a755a423570fdd4fb17a6e01c70f6ad8ab8f826af09ca6335c2ec20299cc8c2a.png