Document and simplify Classifier and Regressor

2021-01-22 09:06:04 -06:00
parent f90d78f802
commit b87ef651e1
4 changed files with 123 additions and 18 deletions

View File

@@ -42,6 +42,7 @@ reformat:
 test:
 	$(MYPY) -p miplearn
+	$(MYPY) -p tests
 	$(PYTEST) $(PYTEST_ARGS)
 
 .PHONY: test test-watch docs install

View File

@@ -3,31 +3,128 @@
 # Released under the modified BSD license. See COPYING.md for more details.
 
 from abc import ABC, abstractmethod
+from typing import Optional
 
 import numpy as np
 
 
 class Classifier(ABC):
-    @abstractmethod
-    def fit(self, x_train, y_train):
-        pass
+    """
+    A Classifier decides which class each sample belongs to, based on historical
+    data.
+    """
+
+    def __init__(self):
+        self.n_features: Optional[int] = None
+        self.n_classes: Optional[int] = None
 
     @abstractmethod
-    def predict_proba(self, x_test):
-        pass
-
-    def predict(self, x_test):
-        proba = self.predict_proba(x_test)
-        assert isinstance(proba, np.ndarray)
-        assert proba.shape == (x_test.shape[0], 2)
-        return (proba[:, 1] > 0.5).astype(float)
+    def fit(self, x_train: np.ndarray, y_train: np.ndarray) -> None:
+        """
+        Trains the classifier.
+
+        Parameters
+        ----------
+        x_train: np.ndarray
+            An array of features with shape (`n_samples`, `n_features`). Each entry
+            must be a float.
+        y_train: np.ndarray
+            An array of labels with shape (`n_samples`, `n_classes`). Each entry must be
+            a bool, and there must be exactly one True element in each row.
+        """
+        assert isinstance(x_train, np.ndarray)
+        assert isinstance(y_train, np.ndarray)
+        assert x_train.dtype in [np.float16, np.float32, np.float64]
+        assert y_train.dtype == np.bool8
+        assert len(x_train.shape) == 2
+        assert len(y_train.shape) == 2
+        (n_samples_x, n_features) = x_train.shape
+        (n_samples_y, n_classes) = y_train.shape
+        assert n_samples_y == n_samples_x
+        self.n_features = n_features
+        self.n_classes = n_classes
+
+    @abstractmethod
+    def predict_proba(self, x_test: np.ndarray) -> np.ndarray:
+        """
+        Predicts the probability of each sample belonging to each class. Must be called
+        after fit.
+
+        Parameters
+        ----------
+        x_test: np.ndarray
+            An array of features with shape (`n_samples`, `n_features`). The number of
+            features in `x_test` must match the number of features in `x_train` provided
+            to `fit`.
+
+        Returns
+        -------
+        np.ndarray
+            An array of predicted probabilities with shape (`n_samples`, `n_classes`),
+            where `n_classes` is the number of columns in `y_train` provided to `fit`.
+        """
+        assert self.n_features is not None
+        assert isinstance(x_test, np.ndarray)
+        assert len(x_test.shape) == 2
+        (n_samples, n_features_x) = x_test.shape
+        assert n_features_x == self.n_features
+        return np.ndarray([])
 
 
 class Regressor(ABC):
-    @abstractmethod
-    def fit(self, x_train, y_train):
-        pass
+    """
+    A Regressor tries to predict the values of some continuous variables, given the
+    values of other variables.
+    """
+
+    def __init__(self):
+        self.n_inputs: Optional[int] = None
 
     @abstractmethod
-    def predict(self):
-        pass
+    def fit(self, x_train: np.ndarray, y_train: np.ndarray) -> None:
+        """
+        Trains the regressor.
+
+        Parameters
+        ----------
+        x_train: np.ndarray
+            An array of inputs with shape (`n_samples`, `n_inputs`). Each entry must be
+            a float.
+        y_train: np.ndarray
+            An array of outputs with shape (`n_samples`, `n_outputs`). Each entry must
+            be a float.
+        """
+        assert isinstance(x_train, np.ndarray)
+        assert isinstance(y_train, np.ndarray)
+        assert x_train.dtype in [np.float16, np.float32, np.float64]
+        assert y_train.dtype in [np.float16, np.float32, np.float64]
+        assert len(x_train.shape) == 2
+        assert len(y_train.shape) == 2
+        (n_samples_x, n_inputs) = x_train.shape
+        (n_samples_y, n_outputs) = y_train.shape
+        assert n_samples_y == n_samples_x
+        self.n_inputs = n_inputs
+
+    @abstractmethod
+    def predict(self, x_test: np.ndarray) -> np.ndarray:
+        """
+        Predicts the values of the output variables. Must be called after fit.
+
+        Parameters
+        ----------
+        x_test: np.ndarray
+            An array of inputs with shape (`n_samples`, `n_inputs`), where `n_inputs`
+            must match the number of columns in `x_train` provided to `fit`.
+
+        Returns
+        -------
+        np.ndarray
+            An array of outputs with shape (`n_samples`, `n_outputs`), where
+            `n_outputs` is the number of columns in `y_train` provided to `fit`.
+        """
+        assert self.n_inputs is not None
+        assert isinstance(x_test, np.ndarray)
+        assert len(x_test.shape) == 2
+        (n_samples, n_inputs_x) = x_test.shape
+        assert n_inputs_x == self.n_inputs
+        return np.ndarray([])
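
For orientation, here is a minimal sketch of how a concrete classifier could build on the new base class, delegating to super().fit and super().predict_proba for the shared validation. The LogisticClassifier name, the scikit-learn model, and the miplearn.classifiers import path are illustrative assumptions, not part of this commit:

import numpy as np
from sklearn.linear_model import LogisticRegression

from miplearn.classifiers import Classifier  # assumed import path


class LogisticClassifier(Classifier):
    """Illustrative two-class classifier built on the new Classifier base."""

    def __init__(self):
        super().__init__()  # initializes n_features and n_classes to None
        self.inner = LogisticRegression()

    def fit(self, x_train: np.ndarray, y_train: np.ndarray) -> None:
        # Shared validation: checks dtypes/shapes and records n_features, n_classes.
        super().fit(x_train, y_train)
        # y_train is a boolean one-hot matrix; for two classes, column 1 marks
        # the positive class.
        self.inner.fit(x_train, y_train[:, 1])

    def predict_proba(self, x_test: np.ndarray) -> np.ndarray:
        # Shared validation: checks that x_test matches the shape seen in fit.
        super().predict_proba(x_test)
        return self.inner.predict_proba(x_test)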

View File

@@ -37,6 +37,7 @@ class CrossValidatedClassifier(Classifier):
         cv=5,
         scoring="accuracy",
     ):
+        super().__init__()
         self.classifier = None
         self.classifier_prototype = classifier
         self.constant = constant
@@ -45,6 +46,8 @@ class CrossValidatedClassifier(Classifier):
         self.scoring = scoring
 
     def fit(self, x_train, y_train):
+        # super().fit(x_train, y_train)
+
         # Calculate dummy score and absolute score threshold
         y_train_avg = np.average(y_train)
         dummy_score = max(y_train_avg, 1 - y_train_avg)
@@ -83,4 +86,5 @@ class CrossValidatedClassifier(Classifier):
         self.classifier.fit(x_train, y_train)
 
     def predict_proba(self, x_test):
+        # super().predict_proba(x_test)
         return self.classifier.predict_proba(x_test)
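
The new super().__init__() call wires CrossValidatedClassifier into the base class, while the super().fit and super().predict_proba checks are left commented out, presumably because this classifier still receives a float 0/1 label vector rather than the boolean one-hot matrix that Classifier.fit now validates. A minimal conversion along these lines would allow the checks to be enabled later; the to_onehot helper is hypothetical, not part of this commit:

import numpy as np


def to_onehot(y: np.ndarray) -> np.ndarray:
    # Hypothetical helper: turn a float vector of 0/1 labels into the
    # boolean (n_samples, 2) matrix expected by Classifier.fit.
    y = np.asarray(y, dtype=float)
    return np.column_stack([y < 0.5, y >= 0.5])


# Example: to_onehot(np.array([0.0, 1.0, 1.0]))
# -> [[ True, False], [False, True], [False, True]]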

View File

@@ -35,7 +35,9 @@ def test_cv():
         cv=30,
     )
     clf.fit(x_train, y_train)
-    assert norm(np.zeros(n_samples) - clf.predict(x_train)) < E
+    proba = clf.predict_proba(x_train)
+    y_pred = (proba[:, 1] > 0.5).astype(float)
+    assert norm(np.zeros(n_samples) - y_pred) < E
 
     # Support vector machines with quadratic kernels perform almost perfectly
     # on this data set, so predictor should return their prediction.
@@ -45,5 +47,6 @@ def test_cv():
         cv=30,
     )
     clf.fit(x_train, y_train)
-    print(y_train - clf.predict(x_train))
-    assert norm(y_train - clf.predict(x_train)) < E
+    proba = clf.predict_proba(x_train)
+    y_pred = (proba[:, 1] > 0.5).astype(float)
+    assert norm(y_train - y_pred) < E
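
The thresholding added to these tests reproduces the Classifier.predict helper that this commit removes. Callers that still want hard labels can keep a small local helper along these lines (hypothetical, not part of the commit; it simply mirrors the removed method):

import numpy as np


def predict_labels(clf, x_test: np.ndarray) -> np.ndarray:
    # Mirrors the removed Classifier.predict: threshold the positive-class
    # probability at 0.5 and return float 0/1 labels.
    proba = clf.predict_proba(x_test)
    assert isinstance(proba, np.ndarray)
    assert proba.shape == (x_test.shape[0], 2)
    return (proba[:, 1] > 0.5).astype(float)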