Document and simplify Classifier and Regressor

2025-12-06 09:28:51 -06:00 · 2021-01-22 09:06:04 -06:00
parent f90d78f802
commit b87ef651e1
4 changed files with 123 additions and 18 deletions
--- a/1
+++ b/1
@@ -42,6 +42,7 @@ reformat:

 test:
 	$(MYPY) -p miplearn
+	$(MYPY) -p tests
 	$(PYTEST) $(PYTEST_ARGS)

 .PHONY: test test-watch docs install
--- a/miplearn/classifiers/init.py
+++ b/miplearn/classifiers/init.py
@@ -3,31 +3,128 @@
 #  Released under the modified BSD license. See COPYING.md for more details.

 from abc import ABC, abstractmethod
+from typing import Optional

 import numpy as np


 class Classifier(ABC):
-    @abstractmethod
-    def fit(self, x_train, y_train):
-        pass
+    """
+    A Classifier decides which class each sample belongs to, based on historical
+    data.
+    """
+
+    def __init__(self):
+        self.n_features: Optional[int] = None
+        self.n_classes: Optional[int] = None

    @abstractmethod
-    def predict_proba(self, x_test):
-        pass
+    def fit(self, x_train: np.ndarray, y_train: np.ndarray) -> None:
+        """
+        Trains the classifier.

-    def predict(self, x_test):
-        proba = self.predict_proba(x_test)
-        assert isinstance(proba, np.ndarray)
-        assert proba.shape == (x_test.shape[0], 2)
-        return (proba[:, 1] > 0.5).astype(float)
+        Parameters
+        ----------
+        x_train: np.ndarray
+            An array of features with shape (`n_samples`, `n_features`). Each entry
+            must be a float.
+        y_train: np.ndarray
+            An array of labels with shape (`n_samples`, `n_classes`). Each entry must be
+            a bool, and there must be exactly one True element in each row.
+        """
+        assert isinstance(x_train, np.ndarray)
+        assert isinstance(y_train, np.ndarray)
+        assert x_train.dtype in [np.float16, np.float32, np.float64]
+        assert y_train.dtype == np.bool8
+        assert len(x_train.shape) == 2
+        assert len(y_train.shape) == 2
+        (n_samples_x, n_features) = x_train.shape
+        (n_samples_y, n_classes) = y_train.shape
+        assert n_samples_y == n_samples_x
+        self.n_features = n_features
+        self.n_classes = n_classes
+
+    @abstractmethod
+    def predict_proba(self, x_test: np.ndarray) -> np.ndarray:
+        """
+        Predicts the probability of each sample belonging to each class. Must be called
+        after fit.
+
+        Parameters
+        ----------
+        x_test: np.ndarray
+            An array of features with shape (`n_samples`, `n_features`). The number of
+            features in `x_test` must match the number of features in `x_train` provided
+            to `fit`.
+
+        Returns
+        -------
+        np.ndarray
+            An array of predicted probabilities with shape (`n_samples`, `n_classes`),
+            where `n_classes` is the number of columns in `y_train` provided to `fit`.
+        """
+        assert self.n_features is not None
+        assert isinstance(x_test, np.ndarray)
+        assert len(x_test.shape) == 2
+        (n_samples, n_features_x) = x_test.shape
+        assert n_features_x == self.n_features
+        return np.ndarray([])


 class Regressor(ABC):
-    @abstractmethod
-    def fit(self, x_train, y_train):
-        pass
+    """
+    A Regressor tries to predict the values of some continuous variables, given the
+    values of other variables.
+    """
+
+    def __init__(self):
+        self.n_inputs: Optional[int] = None

    @abstractmethod
-    def predict(self):
-        pass
+    def fit(self, x_train: np.ndarray, y_train: np.ndarray) -> None:
+        """
+        Trains the regressor.
+
+        Parameters
+        ----------
+        x_train: np.ndarray
+            An array of inputs with shape (`n_samples`, `n_inputs`). Each entry must be
+            a float.
+        y_train: np.ndarray
+            An array of outputs with shape (`n_samples`, `n_outputs`). Each entry must
+            be a float.
+        """
+        assert isinstance(x_train, np.ndarray)
+        assert isinstance(y_train, np.ndarray)
+        assert x_train.dtype in [np.float16, np.float32, np.float64]
+        assert y_train.dtype in [np.float16, np.float32, np.float64]
+        assert len(x_train.shape) == 2
+        assert len(y_train.shape) == 2
+        (n_samples_x, n_inputs) = x_train.shape
+        (n_samples_y, n_outputs) = y_train.shape
+        assert n_samples_y == n_samples_x
+        self.n_inputs = n_inputs
+
+    @abstractmethod
+    def predict(self, x_test: np.ndarray) -> np.ndarray:
+        """
+        Predicts the values of the output variables. Must be called after fit.
+
+        Parameters
+        ----------
+        x_test: np.ndarray
+            An array of inputs with shape (`n_samples`, `n_inputs`), where `n_inputs`
+            must match the number of columns in `x_train` provided to `fit`.
+
+        Returns
+        -------
+        np.ndarray
+            An array of outputs with shape (`n_samples`, `n_outputs`), where
+            `n_outputs` is the number of columns in `y_train` provided to `fit`.
+        """
+        assert self.n_inputs is not None
+        assert isinstance(x_test, np.ndarray)
+        assert len(x_test.shape) == 2
+        (n_samples, n_inputs_x) = x_test.shape
+        assert n_inputs_x == self.n_inputs
+        return np.ndarray([])
--- a/miplearn/classifiers/cv.py
+++ b/miplearn/classifiers/cv.py
@@ -37,6 +37,7 @@ class CrossValidatedClassifier(Classifier):
        cv=5,
        scoring="accuracy",
    ):
+        super().__init__()
        self.classifier = None
        self.classifier_prototype = classifier
        self.constant = constant
@@ -45,6 +46,8 @@ class CrossValidatedClassifier(Classifier):
        self.scoring = scoring

    def fit(self, x_train, y_train):
+        # super().fit(x_train, y_train)
+
        # Calculate dummy score and absolute score threshold
        y_train_avg = np.average(y_train)
        dummy_score = max(y_train_avg, 1 - y_train_avg)
@@ -83,4 +86,5 @@ class CrossValidatedClassifier(Classifier):
        self.classifier.fit(x_train, y_train)

    def predict_proba(self, x_test):
+        # super().predict_proba(x_test)
        return self.classifier.predict_proba(x_test)
--- a/tests/classifiers/test_cv.py
+++ b/tests/classifiers/test_cv.py
@@ -35,7 +35,9 @@ def test_cv():
        cv=30,
    )
    clf.fit(x_train, y_train)
-    assert norm(np.zeros(n_samples) - clf.predict(x_train)) < E
+    proba = clf.predict_proba(x_train)
+    y_pred = (proba[:, 1] > 0.5).astype(float)
+    assert norm(np.zeros(n_samples) - y_pred) < E

    # Support vector machines with quadratic kernels perform almost perfectly
    # on this data set, so predictor should return their prediction.
@@ -45,5 +47,6 @@ def test_cv():
        cv=30,
    )
    clf.fit(x_train, y_train)
-    print(y_train - clf.predict(x_train))
-    assert norm(y_train - clf.predict(x_train)) < E
+    proba = clf.predict_proba(x_train)
+    y_pred = (proba[:, 1] > 0.5).astype(float)
+    assert norm(y_train - y_pred) < E