mirror of https://github.com/ANL-CEEESA/MIPLearn.git
Start refactoring of classifiers
@@ -3,7 +3,7 @@
 # Released under the modified BSD license. See COPYING.md for more details.
 
 from abc import ABC, abstractmethod
-from typing import Optional
+from typing import Optional, Any
 
 import numpy as np
 
@@ -14,7 +14,7 @@ class Classifier(ABC):
     data.
     """
 
-    def __init__(self):
+    def __init__(self) -> None:
         self.n_features: Optional[int] = None
         self.n_classes: Optional[int] = None
 
@@ -77,7 +77,7 @@ class Regressor(ABC):
     values of other variables.
     """
 
-    def __init__(self):
+    def __init__(self) -> None:
         self.n_inputs: Optional[int] = None
 
     @abstractmethod
@@ -128,3 +128,38 @@ class Regressor(ABC):
         (n_samples, n_inputs_x) = x_test.shape
         assert n_inputs_x == self.n_inputs
         return np.ndarray([])
+
+
+class ScikitLearnClassifier(Classifier):
+    """
+    Wrapper for ScikitLearn classifiers, which makes sure inputs and outputs have the
+    correct dimensions and types.
+    """
+
+    def __init__(self, clf: Any) -> None:
+        super().__init__()
+        self.inner_clf = clf
+
+    def fit(self, x_train: np.ndarray, y_train: np.ndarray) -> None:
+        super().fit(x_train, y_train)
+        (n_samples, n_classes) = y_train.shape
+        assert n_classes == 2, "scikit-learn classifiers must have exactly two classes"
+        self.inner_clf.fit(x_train, y_train[:, 1])
+
+    def predict_proba(self, x_test: np.ndarray) -> np.ndarray:
+        super().predict_proba(x_test)
+        n_samples = x_test.shape[0]
+        sklearn_proba = self.inner_clf.predict_proba(x_test)
+        if isinstance(sklearn_proba, list):
+            assert len(sklearn_proba) == self.n_classes
+            for pb in sklearn_proba:
+                assert isinstance(pb, np.ndarray)
+                assert pb.dtype in [np.float16, np.float32, np.float64]
+                assert pb.shape == (n_samples, 2)
+            proba = np.hstack([pb[:, [1]] for pb in sklearn_proba])
+            assert proba.shape == (n_samples, self.n_classes)
+            return proba
+        else:
+            assert isinstance(sklearn_proba, np.ndarray)
+            assert sklearn_proba.shape == (n_samples, 2)
+            return sklearn_proba
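The hunk above introduces the central wrapper of this refactoring. Below is a minimal usage sketch, not part of the commit: it assumes the abstract `Classifier.fit` only records input and label dimensions, and `KNeighborsClassifier` is an arbitrary scikit-learn model chosen for illustration. Labels arrive as a boolean one-hot array with exactly two columns; the wrapper trains the inner model on the second column and returns probabilities of shape `(n_samples, 2)`.

```python
# Usage sketch (not part of the commit); KNeighborsClassifier is an
# arbitrary scikit-learn model chosen for illustration.
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

from miplearn.classifiers import ScikitLearnClassifier

x_train = np.array([[0.0], [0.1], [0.9], [1.0]])
y_train = np.array([
    [True, False],
    [True, False],
    [False, True],
    [False, True],
])

clf = ScikitLearnClassifier(KNeighborsClassifier(n_neighbors=1))
clf.fit(x_train, y_train)           # inner model is fit on y_train[:, 1]
proba = clf.predict_proba(x_train)
assert proba.shape == (4, 2)        # one [P(False), P(True)] row per sample
```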
@@ -29,9 +29,6 @@ class AdaptiveClassifier(Classifier):
         candidates: Dict[str, Any] = None,
         evaluator: ClassifierEvaluator = ClassifierEvaluator(),
     ) -> None:
-        """
-        Initializes the meta-classifier.
-        """
         if candidates is None:
             candidates = {
                 "knn(100)": {
@@ -1,6 +1,7 @@
 # MIPLearn: Extensible Framework for Learning-Enhanced Mixed-Integer Optimization
 # Copyright (C) 2020, UChicago Argonne, LLC. All rights reserved.
 # Released under the modified BSD license. See COPYING.md for more details.
+from typing import Optional, cast
 
 import numpy as np
 
@@ -9,20 +10,33 @@ from miplearn.classifiers import Classifier
 
 class CountingClassifier(Classifier):
     """
-    A classifier that generates constant predictions, based only on the
-    frequency of the training labels. For example, if y_train is [1.0, 0.0, 0.0]
-    this classifier always returns [0.66 0.33] for any x_test. It essentially
-    counts how many times each label appeared, hence the name.
+    A classifier that generates constant predictions, based only on the frequency of
+    the training labels. For example, suppose `y_train` is given by:
+    ```python
+    y_train = np.array([
+        [True, False],
+        [False, True],
+        [False, True],
+    ])
+    ```
+    Then `predict_proba` always returns `[0.33 0.66]` for every sample, regardless of
+    `x_train`. It essentially counts how many times each label appeared, hence the name.
     """
 
     def __init__(self) -> None:
-        self.mean = None
+        super().__init__()
+        self.mean: Optional[np.ndarray] = None
 
-    def fit(self, x_train, y_train):
-        self.mean = np.mean(y_train)
+    def fit(self, x_train: np.ndarray, y_train: np.ndarray) -> None:
+        super().fit(x_train, y_train)
+        self.mean = cast(np.ndarray, np.mean(y_train, axis=0))
 
-    def predict_proba(self, x_test):
-        return np.array([[1 - self.mean, self.mean] for _ in range(x_test.shape[0])])
+    def predict_proba(self, x_test: np.ndarray) -> np.ndarray:
+        super().predict_proba(x_test)
+        n_samples = x_test.shape[0]
+        return np.array([self.mean for _ in range(n_samples)])
 
     def __repr__(self):
         return "CountingClassifier(mean=%s)" % self.mean
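The new docstring example can be checked numerically. A small sketch, not part of the commit; it assumes the module path `miplearn.classifiers.counting` and that the base `Classifier.fit` only validates shapes:

```python
# Worked example from the docstring above (module path assumed).
import numpy as np

from miplearn.classifiers.counting import CountingClassifier

x_train = np.zeros((3, 1))  # features are ignored by this classifier
y_train = np.array([
    [True, False],
    [False, True],
    [False, True],
])

clf = CountingClassifier()
clf.fit(x_train, y_train)
print(clf.mean)  # column means of y_train: [0.3333... 0.6666...]

proba = clf.predict_proba(np.zeros((5, 1)))
assert proba.shape == (5, 2)  # the same [0.33, 0.66] row, repeated per sample
```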
@@ -3,14 +3,14 @@
 # Released under the modified BSD license. See COPYING.md for more details.
 
 import logging
-from copy import deepcopy
+from typing import Optional, Callable, List
 
 import numpy as np
 from sklearn.dummy import DummyClassifier
 from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import cross_val_score
 
-from miplearn.classifiers import Classifier
+from miplearn.classifiers import Classifier, ScikitLearnClassifier
 
 logger = logging.getLogger(__name__)
 
@@ -18,35 +18,58 @@ logger = logging.getLogger(__name__)
 class CrossValidatedClassifier(Classifier):
     """
     A meta-classifier that, upon training, evaluates the performance of another
-    classifier on the training data set using k-fold cross validation, then
-    either adopts the other classifier it if the cv-score is high enough, or
-    returns a constant label for every x_test otherwise.
-
-    The threshold is specified in comparison to a dummy classifier trained
-    on the same dataset. For example, a threshold of 0.0 indicates that any
-    classifier as good as the dummy predictor is acceptable. A threshold of 1.0
-    indicates that only classifier with a perfect cross-validation score are
-    acceptable. Other numbers are a linear interpolation of these two extremes.
+    candidate classifier on the training data set, using k-fold cross validation,
+    then either adopts it, if its cv-score is high enough, or returns constant
+    predictions for every x_test, otherwise.
+
+    Parameters
+    ----------
+    classifier: Callable[[], ScikitLearnClassifier]
+        A callable that constructs the candidate classifier.
+    threshold: float
+        Number from zero to one indicating how well the candidate classifier must
+        perform to be adopted. The threshold is specified in comparison to a dummy
+        classifier trained on the same dataset. For example, a threshold of 0.0
+        indicates that any classifier as good as the dummy predictor is acceptable. A
+        threshold of 1.0 indicates that only classifiers with perfect
+        cross-validation scores are acceptable. Other numbers are a linear
+        interpolation of these two extremes.
+    constant: Optional[List[bool]]
+        If the candidate classifier fails to meet the threshold, use a dummy classifier
+        which always returns this prediction instead. The list should have exactly as
+        many elements as the number of columns of `y_train` provided to `fit`.
+    cv: int
+        Number of folds.
+    scoring: str
+        Scoring function.
     """
 
     def __init__(
         self,
-        classifier=LogisticRegression(),
-        threshold=0.75,
-        constant=0.0,
-        cv=5,
-        scoring="accuracy",
+        classifier: Callable[[], ScikitLearnClassifier] = (
+            lambda: ScikitLearnClassifier(LogisticRegression())
+        ),
+        threshold: float = 0.75,
+        constant: Optional[List[bool]] = None,
+        cv: int = 5,
+        scoring: str = "accuracy",
     ):
-        """"""
-        self.classifier = None
-        self.classifier_prototype = classifier
-        self.constant = constant
+        super().__init__()
+        if constant is None:
+            constant = [True, False]
+        self.n_classes = len(constant)
+        self.classifier: Optional[ScikitLearnClassifier] = None
+        self.classifier_factory = classifier
+        self.constant: List[bool] = constant
         self.threshold = threshold
         self.cv = cv
         self.scoring = scoring
 
-    def fit(self, x_train, y_train):
-        # super().fit(x_train, y_train)
+    def fit(self, x_train: np.ndarray, y_train: np.ndarray) -> None:
+        super().fit(x_train, y_train)
+        (n_samples, n_classes) = y_train.shape
+        assert n_classes == self.n_classes
 
         # Calculate dummy score and absolute score threshold
         y_train_avg = np.average(y_train)
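Two design points are worth noting in this hunk. First, the constructor now receives a factory (`Callable[[], ScikitLearnClassifier]`) instead of a prototype instance: each call to `fit` builds a fresh, unfitted candidate, and the mutable `LogisticRegression()` default is no longer created once at function-definition time and shared across instances. Second, `constant` defaults to `[True, False]`, which determines `self.n_classes` at construction time, before any training data is seen. A construction sketch, not from the commit; the module path `miplearn.classifiers.cv` is an assumption:

```python
# Construction sketch (not part of the commit; module path assumed).
from sklearn.linear_model import LogisticRegression

from miplearn.classifiers import ScikitLearnClassifier
from miplearn.classifiers.cv import CrossValidatedClassifier

# Each call of the lambda yields a fresh, unfitted candidate classifier.
clf = CrossValidatedClassifier(
    classifier=lambda: ScikitLearnClassifier(LogisticRegression(max_iter=1000)),
    threshold=0.90,
)
```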
@@ -54,13 +77,20 @@ class CrossValidatedClassifier(Classifier):
         absolute_threshold = 1.0 * self.threshold + dummy_score * (1 - self.threshold)
 
         # Calculate cross validation score and decide which classifier to use
-        clf = deepcopy(self.classifier_prototype)
+        clf = self.classifier_factory()
+        assert clf is not None
+        assert isinstance(clf, ScikitLearnClassifier), (
+            f"The provided classifier callable must return a ScikitLearnClassifier. "
+            f"Found {clf.__class__.__name__} instead. If this is a scikit-learn "
+            f"classifier, you must wrap it with ScikitLearnClassifier."
+        )
+
         cv_score = float(
             np.mean(
                 cross_val_score(
-                    clf,
+                    clf.inner_clf,
                     x_train,
-                    y_train,
+                    y_train[:, 1],
                     cv=self.cv,
                     scoring=self.scoring,
                 )
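To make the interpolation in `absolute_threshold` concrete: with the default `threshold=0.75` and a dummy classifier scoring 0.60, the candidate must reach a cross-validation score of at least `1.0 * 0.75 + 0.60 * (1 - 0.75) = 0.90` to be adopted. At `threshold=0.0` the bar is exactly the dummy score; at `threshold=1.0` only a perfect score passes.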
@@ -77,14 +107,19 @@ class CrossValidatedClassifier(Classifier):
                 "cv_score is below threshold (%.2f < %.2f); discarding"
                 % (cv_score, absolute_threshold)
             )
-            self.classifier = DummyClassifier(
-                strategy="constant",
-                constant=self.constant,
+            self.classifier = ScikitLearnClassifier(
+                DummyClassifier(
+                    strategy="constant",
+                    constant=self.constant[1],
+                )
             )
 
         # Train chosen classifier
+        assert self.classifier is not None
+        assert isinstance(self.classifier, ScikitLearnClassifier)
         self.classifier.fit(x_train, y_train)
 
-    def predict_proba(self, x_test):
-        # super().predict_proba(x_test)
+    def predict_proba(self, x_test: np.ndarray) -> np.ndarray:
+        super().predict_proba(x_test)
+        assert self.classifier is not None
         return self.classifier.predict_proba(x_test)
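Putting the pieces together, the sketch below exercises the fallback path. It is not part of the commit and leans on assumptions: the module path `miplearn.classifiers.cv`, and abstract base methods that only validate shapes. With labels independent of the features and `threshold=1.0`, the candidate cannot reach a perfect cv-score, so the constant dummy is adopted and every predicted row is `[1.0, 0.0]`, i.e. the one-hot label `[True, False]`.

```python
# Fallback-path sketch (not part of the commit; module path assumed).
import numpy as np

from miplearn.classifiers.cv import CrossValidatedClassifier

rng = np.random.default_rng(42)
x_train = rng.normal(size=(100, 5))
labels = rng.random(100) < 0.5                # labels independent of features
y_train = np.column_stack([~labels, labels])  # one-hot, two columns

clf = CrossValidatedClassifier(threshold=1.0, constant=[True, False])
clf.fit(x_train, y_train)                     # cv-score < 1.0, dummy adopted

proba = clf.predict_proba(x_train)
assert proba.shape == (100, 2)
assert (proba[:, 0] == 1.0).all()             # constant: always the first label
```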