Document and simplify Classifier and Regressor

2025-12-06 09:28:51 -06:00 · 2021-01-22 09:06:04 -06:00
parent f90d78f802
commit b87ef651e1
4 changed files with 123 additions and 18 deletions
--- a/1
+++ b/1
@@ -42,6 +42,7 @@ reformat:
 test:
 	$(MYPY) -p miplearn
 	$(MYPY) -p tests
 	$(PYTEST) $(PYTEST_ARGS)
 .PHONY: test test-watch docs install
--- a/miplearn/classifiers/init.py
+++ b/miplearn/classifiers/init.py
@@ -3,31 +3,128 @@
 #  Released under the modified BSD license. See COPYING.md for more details.
 from abc import ABC, abstractmethod
 from typing import Optional
 import numpy as np
 class Classifier(ABC):
-    @abstractmethod
+    """
-    def fit(self, x_train, y_train):
+    A Classifier decides which class each sample belongs to, based on historical
-        pass
+    data.
    """
    def __init__(self) -> None:
        # Number of feature columns; None here, assigned by the base `fit`.
        self.n_features: Optional[int] = None
        # Number of class columns; None here, assigned by the base `fit`.
        self.n_classes: Optional[int] = None
    @abstractmethod
-    def predict_proba(self, x_test):
+    def fit(self, x_train: np.ndarray, y_train: np.ndarray) -> None:
-        pass
+        """
        Trains the classifier.
-    def predict(self, x_test):
+        Parameters
-        proba = self.predict_proba(x_test)
+        ----------
-        assert isinstance(proba, np.ndarray)
+        x_train: np.ndarray
-        assert proba.shape == (x_test.shape[0], 2)
+            An array of features with shape (`n_samples`, `n_features`). Each entry
-        return (proba[:, 1] > 0.5).astype(float)
+            must be a float.
        y_train: np.ndarray
            An array of labels with shape (`n_samples`, `n_classes`). Each entry must be
            a bool, and there must be exactly one True element in each row.
        """
        assert isinstance(x_train, np.ndarray)
        assert isinstance(y_train, np.ndarray)
        assert x_train.dtype in [np.float16, np.float32, np.float64]
        assert y_train.dtype == np.bool8
        assert len(x_train.shape) == 2
        assert len(y_train.shape) == 2
        (n_samples_x, n_features) = x_train.shape
        (n_samples_y, n_classes) = y_train.shape
        assert n_samples_y == n_samples_x
        self.n_features = n_features
        self.n_classes = n_classes
    @abstractmethod
    def predict_proba(self, x_test: np.ndarray) -> np.ndarray:
        """
        Estimates, for every sample, the probability of membership in each class.

        Only valid once `fit` has been called. This base implementation merely
        checks the argument and hands back a placeholder array.

        Parameters
        ----------
        x_test: np.ndarray
            Feature matrix with shape (`n_samples`, `n_features`). Its column count
            has to equal the column count of the `x_train` that was given to `fit`.

        Returns
        -------
        np.ndarray
            Predicted probabilities with shape (`n_samples`, `n_classes`), with
            `n_classes` being the column count of the `y_train` given to `fit`.
        """
        expected_features = self.n_features
        # A missing feature count means `fit` has not recorded one yet.
        assert expected_features is not None
        assert isinstance(x_test, np.ndarray)
        assert x_test.ndim == 2
        assert x_test.shape[1] == expected_features
        return np.ndarray([])
 class Regressor(ABC):
-    @abstractmethod
+    """
-    def fit(self, x_train, y_train):
+    A Regressor tries to predict the values of some continuous variables, given the
-        pass
+    values of other variables.
    """
    def __init__(self) -> None:
        # Number of input columns; None here, assigned by the base `fit`.
        self.n_inputs: Optional[int] = None
    @abstractmethod
-    def predict(self):
+    def fit(self, x_train: np.ndarray, y_train: np.ndarray) -> None:
-        pass
+        """
        Trains the regressor.
        Parameters
        ----------
        x_train: np.ndarray
            An array of inputs with shape (`n_samples`, `n_inputs`). Each entry must be
            a float.
        y_train: np.ndarray
            An array of outputs with shape (`n_samples`, `n_outputs`). Each entry must
            be a float.
        """
        assert isinstance(x_train, np.ndarray)
        assert isinstance(y_train, np.ndarray)
        assert x_train.dtype in [np.float16, np.float32, np.float64]
        assert y_train.dtype in [np.float16, np.float32, np.float64]
        assert len(x_train.shape) == 2
        assert len(y_train.shape) == 2
        (n_samples_x, n_inputs) = x_train.shape
        (n_samples_y, n_outputs) = y_train.shape
        assert n_samples_y == n_samples_x
        self.n_inputs = n_inputs
    @abstractmethod
    def predict(self, x_test: np.ndarray) -> np.ndarray:
        """
        Estimates the output variables for every sample.

        Only valid once `fit` has been called. This base implementation merely
        checks the argument and hands back a placeholder array.

        Parameters
        ----------
        x_test: np.ndarray
            Input matrix with shape (`n_samples`, `n_inputs`). `n_inputs` has to
            equal the column count of the `x_train` that was given to `fit`.

        Returns
        -------
        np.ndarray
            Predicted outputs with shape (`n_samples`, `n_outputs`), with
            `n_outputs` being the column count of the `y_train` given to `fit`.
        """
        expected_inputs = self.n_inputs
        # A missing input count means `fit` has not recorded one yet.
        assert expected_inputs is not None
        assert isinstance(x_test, np.ndarray)
        assert x_test.ndim == 2
        assert x_test.shape[1] == expected_inputs
        return np.ndarray([])
--- a/miplearn/classifiers/cv.py
+++ b/miplearn/classifiers/cv.py
@@ -37,6 +37,7 @@ class CrossValidatedClassifier(Classifier):
        cv=5,
        scoring="accuracy",
    ):
        super().__init__()
        self.classifier = None
        self.classifier_prototype = classifier
        self.constant = constant
@@ -45,6 +46,8 @@ class CrossValidatedClassifier(Classifier):
        self.scoring = scoring
    def fit(self, x_train, y_train):
        # super().fit(x_train, y_train)
        # Calculate dummy score and absolute score threshold
        y_train_avg = np.average(y_train)
        dummy_score = max(y_train_avg, 1 - y_train_avg)
@@ -83,4 +86,5 @@ class CrossValidatedClassifier(Classifier):
        self.classifier.fit(x_train, y_train)
    def predict_proba(self, x_test):
        # super().predict_proba(x_test)
        return self.classifier.predict_proba(x_test)
--- a/tests/classifiers/test_cv.py
+++ b/tests/classifiers/test_cv.py
@@ -35,7 +35,9 @@ def test_cv():
        cv=30,
    )
    clf.fit(x_train, y_train)
-    assert norm(np.zeros(n_samples) - clf.predict(x_train)) < E
+    proba = clf.predict_proba(x_train)
    y_pred = (proba[:, 1] > 0.5).astype(float)
    assert norm(np.zeros(n_samples) - y_pred) < E
    # Support vector machines with quadratic kernels perform almost perfectly
    # on this data set, so predictor should return their prediction.
@@ -45,5 +47,6 @@ def test_cv():
        cv=30,
    )
    clf.fit(x_train, y_train)
-    print(y_train - clf.predict(x_train))
+    proba = clf.predict_proba(x_train)
-    assert norm(y_train - clf.predict(x_train)) < E
+    y_pred = (proba[:, 1] > 0.5).astype(float)
    assert norm(y_train - y_pred) < E