Reformat source code with Black; add pre-commit hooks and CI checks

2020-12-05 10:59:33 -06:00
parent 3823931382
commit d99600f101
49 changed files with 1291 additions and 972 deletions
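
The pre-commit and CI configuration files are among the 49 changed files but do not appear in this excerpt. For reference, a minimal Black hook in .pre-commit-config.yaml typically looks like the sketch below; the pinned revision is a placeholder, not necessarily what this commit used:

    # .pre-commit-config.yaml -- a minimal sketch, not necessarily the exact
    # configuration added in this commit
    repos:
      - repo: https://github.com/psf/black
        rev: 20.8b1  # placeholder; pin to the Black version used in CI
        hooks:
          - id: black

After `pre-commit install`, the hook reformats staged Python files on every commit, and a CI job running `pre-commit run --all-files` (or `black --check .`) keeps the tree formatted.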

View File

@@ -22,9 +22,11 @@ class AdaptiveClassifier(Classifier):
     based on its cross-validation score on a particular training data set.
     """
-    def __init__(self,
-                 candidates=None,
-                 evaluator=ClassifierEvaluator()):
+    def __init__(
+        self,
+        candidates=None,
+        evaluator=ClassifierEvaluator(),
+    ):
         """
         Initializes the meta-classifier.
         """
@@ -35,14 +37,13 @@ class AdaptiveClassifier(Classifier):
                     "min samples": 100,
                 },
                 "logistic": {
-                    "classifier": make_pipeline(StandardScaler(),
-                                                LogisticRegression()),
+                    "classifier": make_pipeline(StandardScaler(), LogisticRegression()),
                     "min samples": 30,
                 },
                 "counting": {
                     "classifier": CountingClassifier(),
                     "min samples": 0,
-                }
+                },
             }
         self.candidates = candidates
         self.evaluator = evaluator
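
The candidates dictionary above pairs each candidate classifier with a "min samples" value, presumably the minimum training-set size at which that candidate is considered, with the zero-threshold CountingClassifier as the fallback. A minimal usage sketch, assuming the module path miplearn.classifiers.adaptive and hypothetical training arrays:

    import numpy as np
    from miplearn.classifiers.adaptive import AdaptiveClassifier  # assumed path

    x_train = np.random.rand(200, 5)               # hypothetical features
    y_train = (x_train[:, 0] > 0.5).astype(float)  # hypothetical binary labels
    clf = AdaptiveClassifier()                     # uses the default candidates above
    clf.fit(x_train, y_train)                      # selects the best candidate by CV score
    proba = clf.predict_proba(x_train)             # shape (200, 2) array of probabilities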

View File

@@ -21,8 +21,7 @@ class CountingClassifier(Classifier):
         self.mean = np.mean(y_train)
 
     def predict_proba(self, x_test):
-        return np.array([[1 - self.mean, self.mean]
-                         for _ in range(x_test.shape[0])])
+        return np.array([[1 - self.mean, self.mean] for _ in range(x_test.shape[0])])
 
     def __repr__(self):
         return "CountingClassifier(mean=%s)" % self.mean

View File

@@ -11,6 +11,7 @@ from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import cross_val_score
 
 import logging
+
 logger = logging.getLogger(__name__)
@@ -28,12 +29,14 @@ class CrossValidatedClassifier(Classifier):
     acceptable. Other numbers are a linear interpolation of these two extremes.
     """
-    def __init__(self,
-                 classifier=LogisticRegression(),
-                 threshold=0.75,
-                 constant=0.0,
-                 cv=5,
-                 scoring='accuracy'):
+    def __init__(
+        self,
+        classifier=LogisticRegression(),
+        threshold=0.75,
+        constant=0.0,
+        cv=5,
+        scoring="accuracy",
+    ):
         self.classifier = None
         self.classifier_prototype = classifier
         self.constant = constant

@@ -45,24 +48,36 @@ class CrossValidatedClassifier(Classifier):
         # Calculate dummy score and absolute score threshold
         y_train_avg = np.average(y_train)
         dummy_score = max(y_train_avg, 1 - y_train_avg)
-        absolute_threshold = 1. * self.threshold + dummy_score * (1 - self.threshold)
+        absolute_threshold = 1.0 * self.threshold + dummy_score * (1 - self.threshold)
 
         # Calculate cross validation score and decide which classifier to use
         clf = deepcopy(self.classifier_prototype)
-        cv_score = float(np.mean(cross_val_score(clf,
-                                                 x_train,
-                                                 y_train,
-                                                 cv=self.cv,
-                                                 scoring=self.scoring)))
+        cv_score = float(
+            np.mean(
+                cross_val_score(
+                    clf,
+                    x_train,
+                    y_train,
+                    cv=self.cv,
+                    scoring=self.scoring,
+                )
+            )
+        )
         if cv_score >= absolute_threshold:
-            logger.debug("cv_score is above threshold (%.2f >= %.2f); keeping" %
-                         (cv_score, absolute_threshold))
+            logger.debug(
+                "cv_score is above threshold (%.2f >= %.2f); keeping"
+                % (cv_score, absolute_threshold)
+            )
             self.classifier = clf
         else:
-            logger.debug("cv_score is below threshold (%.2f < %.2f); discarding" %
-                         (cv_score, absolute_threshold))
-            self.classifier = DummyClassifier(strategy="constant",
-                                              constant=self.constant)
+            logger.debug(
+                "cv_score is below threshold (%.2f < %.2f); discarding"
+                % (cv_score, absolute_threshold)
+            )
+            self.classifier = DummyClassifier(
+                strategy="constant",
+                constant=self.constant,
+            )
 
         # Train chosen classifier
         self.classifier.fit(x_train, y_train)
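
To make the threshold arithmetic concrete: threshold=1.0 demands a perfect cross-validation score, threshold=0.0 only demands matching the majority-class dummy score, and values in between interpolate linearly. For example, if 60% of y_train is positive, then dummy_score = 0.6, and with the default threshold=0.75 the cutoff becomes 1.0 * 0.75 + 0.6 * (1 - 0.75) = 0.90, so a candidate classifier is kept only if its mean CV accuracy reaches 0.90.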

View File

@@ -12,7 +12,6 @@ E = 0.1
 def test_counting():
     clf = CountingClassifier()
     clf.fit(np.zeros((8, 25)), [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0])
-    expected_proba = np.array([[0.375, 0.625],
-                               [0.375, 0.625]])
+    expected_proba = np.array([[0.375, 0.625], [0.375, 0.625]])
     actual_proba = clf.predict_proba(np.zeros((2, 25)))
     assert norm(actual_proba - expected_proba) < E
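
The expected values follow directly from the classifier's definition: five of the eight training labels are 1.0, so the fitted mean is 0.625 and every predicted row is [1 - 0.625, 0.625] = [0.375, 0.625].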

View File

@@ -13,34 +13,36 @@ E = 0.1
 def test_cv():
     # Training set: label is true if point is inside a 2D circle
-    x_train = np.array([[x1, x2]
-                        for x1 in range(-10, 11)
-                        for x2 in range(-10, 11)])
+    x_train = np.array([[x1, x2] for x1 in range(-10, 11) for x2 in range(-10, 11)])
     x_train = StandardScaler().fit_transform(x_train)
     n_samples = x_train.shape[0]
-    y_train = np.array([1.0 if x1*x1 + x2*x2 <= 100 else 0.0
-                        for x1 in range(-10, 11)
-                        for x2 in range(-10, 11)])
+    y_train = np.array(
+        [
+            1.0 if x1 * x1 + x2 * x2 <= 100 else 0.0
+            for x1 in range(-10, 11)
+            for x2 in range(-10, 11)
+        ]
+    )
 
     # Support vector machines with linear kernels do not perform well on this
     # data set, so predictor should return the given constant.
-    clf = CrossValidatedClassifier(classifier=SVC(probability=True,
-                                                  random_state=42),
-                                   threshold=0.90,
-                                   constant=0.0,
-                                   cv=30)
+    clf = CrossValidatedClassifier(
+        classifier=SVC(probability=True, random_state=42),
+        threshold=0.90,
+        constant=0.0,
+        cv=30,
+    )
     clf.fit(x_train, y_train)
     assert norm(np.zeros(n_samples) - clf.predict(x_train)) < E
 
     # Support vector machines with quadratic kernels perform almost perfectly
     # on this data set, so predictor should return their prediction.
-    clf = CrossValidatedClassifier(classifier=SVC(probability=True,
-                                                  kernel='poly',
-                                                  degree=2,
-                                                  random_state=42),
-                                   threshold=0.90,
-                                   cv=30)
+    clf = CrossValidatedClassifier(
+        classifier=SVC(probability=True, kernel="poly", degree=2, random_state=42),
+        threshold=0.90,
+        cv=30,
+    )
     clf.fit(x_train, y_train)
     print(y_train - clf.predict(x_train))
     assert norm(y_train - clf.predict(x_train)) < E
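
For scale: the grid contains 21 * 21 = 441 points, roughly 72% of which lie inside the circle, so dummy_score ≈ 0.72 and the cutoff with threshold=0.90 is about 0.90 + 0.72 * 0.10 ≈ 0.97. The linear-kernel SVC falls short of this cutoff and is replaced by the constant classifier; the quadratic-kernel SVC clears it and is kept.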

View File

@@ -17,4 +17,3 @@ def test_evaluator():
     ev = ClassifierEvaluator()
     assert ev.evaluate(clf_a, x_train, y_train) == 1.0
     assert ev.evaluate(clf_b, x_train, y_train) == 0.5
-

View File

@@ -11,12 +11,16 @@ from miplearn.classifiers.threshold import MinPrecisionThreshold
 def test_threshold_dynamic():
     clf = Mock(spec=Classifier)
-    clf.predict_proba = Mock(return_value=np.array([
-        [0.10, 0.90],
-        [0.10, 0.90],
-        [0.20, 0.80],
-        [0.30, 0.70],
-    ]))
+    clf.predict_proba = Mock(
+        return_value=np.array(
+            [
+                [0.10, 0.90],
+                [0.10, 0.90],
+                [0.20, 0.80],
+                [0.30, 0.70],
+            ]
+        )
+    )
     x_train = np.array([0, 1, 2, 3])
     y_train = np.array([1, 1, 0, 0])

@@ -31,4 +35,3 @@ def test_threshold_dynamic():
     threshold = MinPrecisionThreshold(min_precision=0.00)
     assert threshold.find(clf, x_train, y_train) == 0.70
-

View File

@@ -30,11 +30,15 @@ class MinPrecisionThreshold(DynamicThreshold):
     def find(self, clf, x_train, y_train):
         proba = clf.predict_proba(x_train)
-        assert isinstance(proba, np.ndarray), \
-            "classifier should return numpy array"
-        assert proba.shape == (x_train.shape[0], 2), \
-            "classifier should return (%d,%d)-shaped array, not %s" % (
-                x_train.shape[0], 2, str(proba.shape))
+        assert isinstance(proba, np.ndarray), "classifier should return numpy array"
+        assert proba.shape == (
+            x_train.shape[0],
+            2,
+        ), "classifier should return (%d,%d)-shaped array, not %s" % (
+            x_train.shape[0],
+            2,
+            str(proba.shape),
+        )
         fps, tps, thresholds = _binary_clf_curve(y_train, proba[:, 1])
         precision = tps / (tps + fps)
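
Worked through with the mocked values from the test shown earlier (positive-class probabilities [0.90, 0.90, 0.80, 0.70], labels [1, 1, 0, 0]): lowering the threshold admits predictions in that order, giving precision 2/2 = 1.0 at 0.90, 2/3 ≈ 0.67 at 0.80, and 2/4 = 0.5 at 0.70. Returning the smallest threshold whose precision still meets min_precision is consistent with the visible assert: min_precision=1.00 would select 0.90, while min_precision=0.00 selects 0.70.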