Document and simplify Classifier and Regressor

master
Alinson S. Xavier 5 years ago
parent f90d78f802
commit b87ef651e1

@@ -42,6 +42,7 @@ reformat:
 test:
 	$(MYPY) -p miplearn
+	$(MYPY) -p tests
 	$(PYTEST) $(PYTEST_ARGS)
 .PHONY: test test-watch docs install

@@ -3,31 +3,128 @@
 # Released under the modified BSD license. See COPYING.md for more details.
 
 from abc import ABC, abstractmethod
+from typing import Optional
 
 import numpy as np
 
 
 class Classifier(ABC):
+    """
+    A Classifier decides which class each sample belongs to, based on historical
+    data.
+    """
+
+    def __init__(self):
+        self.n_features: Optional[int] = None
+        self.n_classes: Optional[int] = None
+
     @abstractmethod
-    def fit(self, x_train, y_train):
-        pass
+    def fit(self, x_train: np.ndarray, y_train: np.ndarray) -> None:
+        """
+        Trains the classifier.
+
+        Parameters
+        ----------
+        x_train: np.ndarray
+            An array of features with shape (`n_samples`, `n_features`). Each entry
+            must be a float.
+        y_train: np.ndarray
+            An array of labels with shape (`n_samples`, `n_classes`). Each entry must be
+            a bool, and there must be exactly one True element in each row.
+        """
+        assert isinstance(x_train, np.ndarray)
+        assert isinstance(y_train, np.ndarray)
+        assert x_train.dtype in [np.float16, np.float32, np.float64]
+        assert y_train.dtype == np.bool8
+        assert len(x_train.shape) == 2
+        assert len(y_train.shape) == 2
+        (n_samples_x, n_features) = x_train.shape
+        (n_samples_y, n_classes) = y_train.shape
+        assert n_samples_y == n_samples_x
+        self.n_features = n_features
+        self.n_classes = n_classes
 
     @abstractmethod
-    def predict_proba(self, x_test):
-        pass
-
-    def predict(self, x_test):
-        proba = self.predict_proba(x_test)
-        assert isinstance(proba, np.ndarray)
-        assert proba.shape == (x_test.shape[0], 2)
-        return (proba[:, 1] > 0.5).astype(float)
+    def predict_proba(self, x_test: np.ndarray) -> np.ndarray:
+        """
+        Predicts the probability of each sample belonging to each class. Must be called
+        after fit.
+
+        Parameters
+        ----------
+        x_test: np.ndarray
+            An array of features with shape (`n_samples`, `n_features`). The number of
+            features in `x_test` must match the number of features in `x_train` provided
+            to `fit`.
+
+        Returns
+        -------
+        np.ndarray
+            An array of predicted probabilities with shape (`n_samples`, `n_classes`),
+            where `n_classes` is the number of columns in `y_train` provided to `fit`.
+        """
+        assert self.n_features is not None
+        assert isinstance(x_test, np.ndarray)
+        assert len(x_test.shape) == 2
+        (n_samples, n_features_x) = x_test.shape
+        assert n_features_x == self.n_features
+        return np.ndarray([])
 
 
 class Regressor(ABC):
+    """
+    A Regressor tries to predict the values of some continuous variables, given the
+    values of other variables.
+    """
+
+    def __init__(self):
+        self.n_inputs: Optional[int] = None
+
     @abstractmethod
-    def fit(self, x_train, y_train):
-        pass
+    def fit(self, x_train: np.ndarray, y_train: np.ndarray) -> None:
+        """
+        Trains the regressor.
+
+        Parameters
+        ----------
+        x_train: np.ndarray
+            An array of inputs with shape (`n_samples`, `n_inputs`). Each entry must be
+            a float.
+        y_train: np.ndarray
+            An array of outputs with shape (`n_samples`, `n_outputs`). Each entry must
+            be a float.
+        """
+        assert isinstance(x_train, np.ndarray)
+        assert isinstance(y_train, np.ndarray)
+        assert x_train.dtype in [np.float16, np.float32, np.float64]
+        assert y_train.dtype in [np.float16, np.float32, np.float64]
+        assert len(x_train.shape) == 2
+        assert len(y_train.shape) == 2
+        (n_samples_x, n_inputs) = x_train.shape
+        (n_samples_y, n_outputs) = y_train.shape
+        assert n_samples_y == n_samples_x
+        self.n_inputs = n_inputs
 
     @abstractmethod
-    def predict(self):
-        pass
+    def predict(self, x_test: np.ndarray) -> np.ndarray:
+        """
+        Predicts the values of the output variables. Must be called after fit.
+
+        Parameters
+        ----------
+        x_test: np.ndarray
+            An array of inputs with shape (`n_samples`, `n_inputs`), where `n_inputs`
+            must match the number of columns in `x_train` provided to `fit`.
+
+        Returns
+        -------
+        np.ndarray
+            An array of outputs with shape (`n_samples`, `n_outputs`), where
+            `n_outputs` is the number of columns in `y_train` provided to `fit`.
+        """
+        assert self.n_inputs is not None
+        assert isinstance(x_test, np.ndarray)
+        assert len(x_test.shape) == 2
+        (n_samples, n_inputs_x) = x_test.shape
+        assert n_inputs_x == self.n_inputs
+        return np.ndarray([])
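
To make the new contract concrete, here is a minimal sketch of subclasses that satisfy it. This is illustrative only and not part of the commit: the class names MajorityClassifier and MeanRegressor are hypothetical, and the `from miplearn.classifiers import Classifier, Regressor` import path is an assumption.

```python
# Illustrative sketch only; class names and import path are assumptions.
from typing import Optional

import numpy as np

from miplearn.classifiers import Classifier, Regressor  # assumed module path


class MajorityClassifier(Classifier):
    """Predicts, for every sample, the class frequencies observed in y_train."""

    def __init__(self):
        super().__init__()
        self.class_freq: Optional[np.ndarray] = None

    def fit(self, x_train: np.ndarray, y_train: np.ndarray) -> None:
        # The base-class fit runs the dtype/shape assertions and stores
        # n_features and n_classes.
        super().fit(x_train, y_train)
        self.class_freq = y_train.mean(axis=0)

    def predict_proba(self, x_test: np.ndarray) -> np.ndarray:
        # The base-class predict_proba checks x_test against n_features.
        super().predict_proba(x_test)
        assert self.class_freq is not None
        # Repeat the same (n_classes,) probability vector for every sample.
        return np.tile(self.class_freq, (x_test.shape[0], 1))


class MeanRegressor(Regressor):
    """Predicts, for every sample, the column-wise mean of y_train."""

    def __init__(self):
        super().__init__()
        self.mean: Optional[np.ndarray] = None

    def fit(self, x_train: np.ndarray, y_train: np.ndarray) -> None:
        super().fit(x_train, y_train)  # validates inputs, stores n_inputs
        self.mean = y_train.mean(axis=0)

    def predict(self, x_test: np.ndarray) -> np.ndarray:
        super().predict(x_test)  # checks x_test against n_inputs
        assert self.mean is not None
        return np.tile(self.mean, (x_test.shape[0], 1))
```

Under this contract, `fit` expects a float feature array of shape (`n_samples`, `n_features`) and, for classifiers, a boolean one-hot label array of shape (`n_samples`, `n_classes`); passing a mismatched number of features to `predict_proba` or `predict` trips the new assertions.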

@@ -37,6 +37,7 @@ class CrossValidatedClassifier(Classifier):
         cv=5,
         scoring="accuracy",
     ):
+        super().__init__()
         self.classifier = None
         self.classifier_prototype = classifier
         self.constant = constant
@@ -45,6 +46,8 @@ class CrossValidatedClassifier(Classifier):
         self.scoring = scoring
 
     def fit(self, x_train, y_train):
+        # super().fit(x_train, y_train)
         # Calculate dummy score and absolute score threshold
         y_train_avg = np.average(y_train)
         dummy_score = max(y_train_avg, 1 - y_train_avg)
@@ -83,4 +86,5 @@ class CrossValidatedClassifier(Classifier):
         self.classifier.fit(x_train, y_train)
 
     def predict_proba(self, x_test):
+        # super().predict_proba(x_test)
         return self.classifier.predict_proba(x_test)

@@ -35,7 +35,9 @@ def test_cv():
         cv=30,
     )
     clf.fit(x_train, y_train)
-    assert norm(np.zeros(n_samples) - clf.predict(x_train)) < E
+    proba = clf.predict_proba(x_train)
+    y_pred = (proba[:, 1] > 0.5).astype(float)
+    assert norm(np.zeros(n_samples) - y_pred) < E
 
     # Support vector machines with quadratic kernels perform almost perfectly
     # on this data set, so predictor should return their prediction.
@@ -45,5 +47,6 @@ def test_cv():
         cv=30,
     )
     clf.fit(x_train, y_train)
-    print(y_train - clf.predict(x_train))
-    assert norm(y_train - clf.predict(x_train)) < E
+    proba = clf.predict_proba(x_train)
+    y_pred = (proba[:, 1] > 0.5).astype(float)
+    assert norm(y_train - y_pred) < E
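
The updated tests reproduce by hand the thresholding that the removed Classifier.predict used to perform. If that pattern is needed in more places, a small helper along these lines could keep it in one spot; this is a sketch, not part of the commit, and the name predict_binary is hypothetical:

```python
import numpy as np


def predict_binary(clf, x: np.ndarray, threshold: float = 0.5) -> np.ndarray:
    """Convert a two-class predict_proba result into 0.0/1.0 labels.

    Mirrors the removed Classifier.predict: column 1 is treated as the
    probability of the positive class.
    """
    proba = clf.predict_proba(x)
    assert proba.shape == (x.shape[0], 2)
    return (proba[:, 1] > threshold).astype(float)


# The two assertions above would then read, for example:
#   assert norm(np.zeros(n_samples) - predict_binary(clf, x_train)) < E
#   assert norm(y_train - predict_binary(clf, x_train)) < E
```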
