Start refactoring of classifiers

2025-12-06 09:28:51 -06:00 · 2021-01-22 11:35:29 -06:00
parent b87ef651e1
commit 8dba65dd9c
9 changed files with 202 additions and 72 deletions
--- a/2
+++ b/2
@@ -43,6 +43,6 @@ reformat:
 test:
 	$(MYPY) -p miplearn
 	$(MYPY) -p tests
-	$(PYTEST) $(PYTEST_ARGS)
+	$(PYTEST) $(PYTEST_ARGS) tests/classifiers

 .PHONY: test test-watch docs install
--- a/docs/customization.md
+++ b/docs/customization.md
@@ -152,25 +152,18 @@ dtype: float64

 ### Using customized ML classifiers and regressors

-By default, given a training set of instantes, MIPLearn trains a fixed set of ML classifiers and regressors, then
-selects the best one based on cross-validation performance. Alternatively, the user may specify which ML model a component
-should use through the `classifier` or `regressor` contructor parameters. The provided classifiers and regressors must 
-follow the sklearn API. In particular, classifiers must provide the methods `fit`, `predict_proba` and `predict`,
-while regressors must provide the methods `fit` and `predict`
+By default, given a training set of instances, MIPLearn trains a fixed set of ML classifiers and regressors, then selects the best one based on cross-validation performance. Alternatively, the user may specify which ML model a component should use through the `classifier` or `regressor` constructor parameters. Scikit-learn classifiers and regressors are currently supported. A future version of the package will add compatibility with Keras models.

-!!! danger
-    MIPLearn must be able to generate a copy of any custom ML classifiers and regressors through 
-    the standard  `copy.deepcopy` method. This currently makes it incompatible with Keras and TensorFlow
-    predictors. This is a known limitation, which will be addressed in a future version.
-    
-The example below shows how to construct a `PrimalSolutionComponent` which internally uses
-sklearn's `KNeighborsClassifiers`. Any other sklearn classifier or pipeline can be used. 
+The example below shows how to construct a `PrimalSolutionComponent` which internally uses scikit-learn's `KNeighborsClassifier`. Any other scikit-learn classifier or pipeline can be used. The classifier needs to be provided as a lambda function because the component may need to create multiple copies of it. It needs to be wrapped in `ScikitLearnClassifier` to ensure that all the proper data transformations are applied.

 ```python
-from miplearn import PrimalSolutionComponent
+from miplearn import PrimalSolutionComponent, ScikitLearnClassifier
 from sklearn.neighbors import KNeighborsClassifier

-comp = PrimalSolutionComponent(classifier=KNeighborsClassifier(n_neighbors=5))
+comp = PrimalSolutionComponent(
+    classifier=lambda: ScikitLearnClassifier(
+        KNeighborsClassifier(n_neighbors=5),
+    ),
+)
 comp.fit(train_instances)
 ``` 
-  
--- a/miplearn/init.py
+++ b/miplearn/init.py
@@ -3,7 +3,11 @@
 #  Released under the modified BSD license. See COPYING.md for more details.

 from .benchmark import BenchmarkRunner
-from .classifiers import Classifier, Regressor
+from .classifiers import (
+    Classifier,
+    Regressor,
+    ScikitLearnClassifier,
+)
 from .classifiers.adaptive import AdaptiveClassifier
 from .classifiers.threshold import MinPrecisionThreshold
 from .components.component import Component
--- a/miplearn/classifiers/init.py
+++ b/miplearn/classifiers/init.py
@@ -3,7 +3,7 @@
 #  Released under the modified BSD license. See COPYING.md for more details.

 from abc import ABC, abstractmethod
-from typing import Optional
+from typing import Optional, Any

 import numpy as np

@@ -14,7 +14,7 @@ class Classifier(ABC):
    data.
    """

-    def __init__(self):
+    def __init__(self) -> None:
        self.n_features: Optional[int] = None
        self.n_classes: Optional[int] = None

@@ -77,7 +77,7 @@ class Regressor(ABC):
    values of other variables.
    """

-    def __init__(self):
+    def __init__(self) -> None:
        self.n_inputs: Optional[int] = None

    @abstractmethod
@@ -128,3 +128,38 @@ class Regressor(ABC):
        (n_samples, n_inputs_x) = x_test.shape
        assert n_inputs_x == self.n_inputs
        return np.ndarray([])
+
+
+class ScikitLearnClassifier(Classifier):
+    """
+    Wrapper for ScikitLearn classifiers, which makes sure inputs and outputs have the
+    correct dimensions and types.
+
+    The wrapped estimator is stored in `inner_clf`. It is trained on column 1 of the
+    two-column label matrix given to `fit`, and must provide `fit` and
+    `predict_proba` methods.
+    """
+
+    def __init__(self, clf: Any) -> None:
+        super().__init__()
+        self.inner_clf = clf
+
+    def fit(self, x_train: np.ndarray, y_train: np.ndarray) -> None:
+        """
+        Trains the wrapped classifier.
+
+        Parameters
+        ----------
+        x_train: np.ndarray
+            Training features, one row per sample.
+        y_train: np.ndarray
+            Two-dimensional label matrix with exactly two columns. Only column 1
+            is passed to the wrapped classifier as the training label.
+        """
+        # NOTE(review): presumably the base class validates the inputs and records
+        # n_features / n_classes -- confirm against Classifier.fit.
+        super().fit(x_train, y_train)
+        # The number of classes is the number of label columns (y_train), not the
+        # number of feature columns (x_train).
+        (_, n_classes) = y_train.shape
+        assert n_classes == 2, "scikit-learn classifiers must have exactly two classes"
+        self.inner_clf.fit(x_train, y_train[:, 1])
+
+    def predict_proba(self, x_test: np.ndarray) -> np.ndarray:
+        """
+        Returns the probabilities predicted by the wrapped classifier.
+
+        If the wrapped classifier returns a list of arrays, column 1 of each array
+        is taken and the columns are stacked side by side, giving an array of shape
+        (n_samples, n_classes). Otherwise, the wrapped classifier must return a
+        single array of shape (n_samples, 2), which is returned unchanged.
+        """
+        super().predict_proba(x_test)
+        n_samples = x_test.shape[0]
+        sklearn_proba = self.inner_clf.predict_proba(x_test)
+        if isinstance(sklearn_proba, list):
+            assert len(sklearn_proba) == self.n_classes
+            for pb in sklearn_proba:
+                assert isinstance(pb, np.ndarray)
+                assert pb.dtype in [np.float16, np.float32, np.float64]
+                assert pb.shape == (n_samples, 2)
+            # Keep only column 1 of each per-output array.
+            proba = np.hstack([pb[:, [1]] for pb in sklearn_proba])
+            assert proba.shape == (n_samples, self.n_classes)
+            return proba
+        else:
+            assert isinstance(sklearn_proba, np.ndarray)
+            assert sklearn_proba.shape == (n_samples, 2)
+            return sklearn_proba
--- a/miplearn/classifiers/adaptive.py
+++ b/miplearn/classifiers/adaptive.py
@@ -29,9 +29,6 @@ class AdaptiveClassifier(Classifier):
        candidates: Dict[str, Any] = None,
        evaluator: ClassifierEvaluator = ClassifierEvaluator(),
    ) -> None:
-        """
-        Initializes the meta-classifier.
-        """
        if candidates is None:
            candidates = {
                "knn(100)": {
--- a/miplearn/classifiers/counting.py
+++ b/miplearn/classifiers/counting.py
@@ -1,6 +1,7 @@
 #  MIPLearn: Extensible Framework for Learning-Enhanced Mixed-Integer Optimization
 #  Copyright (C) 2020, UChicago Argonne, LLC. All rights reserved.
 #  Released under the modified BSD license. See COPYING.md for more details.
+from typing import Optional, cast

 import numpy as np

@@ -9,20 +10,33 @@ from miplearn.classifiers import Classifier

 class CountingClassifier(Classifier):
    """
-    A classifier that generates constant predictions, based only on the
-    frequency of the training labels. For example, if y_train is [1.0, 0.0, 0.0]
-    this classifier always returns [0.66 0.33] for any x_test. It essentially
-    counts how many times each label appeared, hence the name.
+
+    A classifier that generates constant predictions, based only on the frequency of
+    the training labels. For example, suppose `y_train` is given by:
+    ```python
+    y_train = np.array([
+        [True, False],
+        [False, True],
+        [False, True],
+    ])
+    ```
+    Then `predict_proba` always returns `[0.33 0.66]` for every sample, regardless of
+    `x_train`. It essentially counts how many times each label appeared, hence the name.
+
    """

    def __init__(self) -> None:
-        self.mean = None
+        super().__init__()
+        self.mean: Optional[np.ndarray] = None

-    def fit(self, x_train, y_train):
-        self.mean = np.mean(y_train)
+    def fit(self, x_train: np.ndarray, y_train: np.ndarray) -> None:
+        super().fit(x_train, y_train)
+        self.mean = cast(np.ndarray, np.mean(y_train, axis=0))

-    def predict_proba(self, x_test):
-        return np.array([[1 - self.mean, self.mean] for _ in range(x_test.shape[0])])
+    def predict_proba(self, x_test: np.ndarray) -> np.ndarray:
+        super().predict_proba(x_test)
+        n_samples = x_test.shape[0]
+        return np.array([self.mean for _ in range(n_samples)])

    def __repr__(self):
        return "CountingClassifier(mean=%s)" % self.mean
--- a/miplearn/classifiers/cv.py
+++ b/miplearn/classifiers/cv.py
@@ -3,14 +3,14 @@
 #  Released under the modified BSD license. See COPYING.md for more details.

 import logging
-from copy import deepcopy
+from typing import Optional, Callable, List

 import numpy as np
 from sklearn.dummy import DummyClassifier
 from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import cross_val_score

-from miplearn.classifiers import Classifier
+from miplearn.classifiers import Classifier, ScikitLearnClassifier

 logger = logging.getLogger(__name__)

@@ -18,35 +18,58 @@ logger = logging.getLogger(__name__)
 class CrossValidatedClassifier(Classifier):
    """
    A meta-classifier that, upon training, evaluates the performance of another
-    classifier on the training data set using k-fold cross validation, then
-    either adopts the other classifier it if the cv-score is high enough, or
-    returns a constant label for every x_test otherwise.
+    candidate classifier on the training data set, using k-fold cross validation,
+    then either adopts it, if its cv-score is high enough, or returns constant
+    predictions for every x_test, otherwise.

-    The threshold is specified in comparison to a dummy classifier trained
-    on the same dataset. For example, a threshold of 0.0 indicates that any
-    classifier as good as the dummy predictor is acceptable. A threshold of 1.0
-    indicates that only classifier with a perfect cross-validation score are
-    acceptable. Other numbers are a linear interpolation of these two extremes.
+    Parameters
+    ----------
+    classifier: Callable[[], ScikitLearnClassifier]
+        A callable that constructs the candidate classifier.
+    threshold: float
+        Number from zero to one indicating how well must the candidate classifier
+        perform to be adopted. The threshold is specified in comparison to a dummy
+        classifier trained on the same dataset. For example, a threshold of 0.0
+        indicates that any classifier as good as the dummy predictor is acceptable. A
+        threshold of 1.0 indicates that only classifiers with perfect
+        cross-validation scores are acceptable. Other numbers are a linear
+        interpolation of these two extremes.
+    constant: Optional[List[bool]]
+        If the candidate classifier fails to meet the threshold, use a dummy classifier
+        which always returns this prediction instead. The list should have exactly as
+        many elements as the number of columns of `x_train` provided to `fit`.
+    cv: int
+        Number of folds.
+    scoring: str
+        Scoring function.
    """

    def __init__(
        self,
-        classifier=LogisticRegression(),
-        threshold=0.75,
-        constant=0.0,
-        cv=5,
-        scoring="accuracy",
+        classifier: Callable[[], ScikitLearnClassifier] = (
+            lambda: ScikitLearnClassifier(LogisticRegression())
+        ),
+        threshold: float = 0.75,
+        constant: Optional[List[bool]] = None,
+        cv: int = 5,
+        scoring: str = "accuracy",
    ):
+        """"""
        super().__init__()
-        self.classifier = None
-        self.classifier_prototype = classifier
-        self.constant = constant
+        if constant is None:
+            constant = [True, False]
+        self.n_classes = len(constant)
+        self.classifier: Optional[ScikitLearnClassifier] = None
+        self.classifier_factory = classifier
+        self.constant: List[bool] = constant
        self.threshold = threshold
        self.cv = cv
        self.scoring = scoring

-    def fit(self, x_train, y_train):
-        # super().fit(x_train, y_train)
+    def fit(self, x_train: np.ndarray, y_train: np.ndarray) -> None:
+        super().fit(x_train, y_train)
+        (n_samples, n_classes) = x_train.shape
+        assert n_classes == self.n_classes

        # Calculate dummy score and absolute score threshold
        y_train_avg = np.average(y_train)
@@ -54,13 +77,20 @@ class CrossValidatedClassifier(Classifier):
        absolute_threshold = 1.0 * self.threshold + dummy_score * (1 - self.threshold)

        # Calculate cross validation score and decide which classifier to use
-        clf = deepcopy(self.classifier_prototype)
+        clf = self.classifier_factory()
+        assert clf is not None
+        assert isinstance(clf, ScikitLearnClassifier), (
+            f"The provided classifier callable must return a ScikitLearnClassifier. "
+            f"Found {clf.__class__.__name__} instead. If this is a scikit-learn "
+            f"classifier, you must wrap it with ScikitLearnClassifier."
+        )
+
        cv_score = float(
            np.mean(
                cross_val_score(
-                    clf,
+                    clf.inner_clf,
                    x_train,
-                    y_train,
+                    y_train[:, 1],
                    cv=self.cv,
                    scoring=self.scoring,
                )
@@ -77,14 +107,19 @@ class CrossValidatedClassifier(Classifier):
                "cv_score is below threshold (%.2f < %.2f); discarding"
                % (cv_score, absolute_threshold)
            )
-            self.classifier = DummyClassifier(
+            self.classifier = ScikitLearnClassifier(
+                DummyClassifier(
                    strategy="constant",
-                constant=self.constant,
+                    constant=self.constant[1],
+                )
            )

        # Train chosen classifier
+        assert self.classifier is not None
+        assert isinstance(self.classifier, ScikitLearnClassifier)
        self.classifier.fit(x_train, y_train)

-    def predict_proba(self, x_test):
-        # super().predict_proba(x_test)
+    def predict_proba(self, x_test: np.ndarray) -> np.ndarray:
+        super().predict_proba(x_test)
+        assert self.classifier is not None
        return self.classifier.predict_proba(x_test)
--- a/tests/classifiers/test_counting.py
+++ b/tests/classifiers/test_counting.py
@@ -12,7 +12,27 @@ E = 0.1

 def test_counting():
    clf = CountingClassifier()
-    clf.fit(np.zeros((8, 25)), [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0])
-    expected_proba = np.array([[0.375, 0.625], [0.375, 0.625]])
-    actual_proba = clf.predict_proba(np.zeros((2, 25)))
-    assert norm(actual_proba - expected_proba) < E
+    n_features = 25
+    x_train = np.zeros((8, n_features))
+    y_train = np.array(
+        [
+            [True, False, False],
+            [True, False, False],
+            [False, True, False],
+            [True, False, False],
+            [False, True, False],
+            [False, True, False],
+            [False, True, False],
+            [False, False, True],
+        ]
+    )
+    x_test = np.zeros((2, n_features))
+    y_expected = np.array(
+        [
+            [3 / 8.0, 4 / 8.0, 1 / 8.0],
+            [3 / 8.0, 4 / 8.0, 1 / 8.0],
+        ]
+    )
+    clf.fit(x_train, y_train)
+    y_actual = clf.predict_proba(x_test)
+    assert norm(y_actual - y_expected) < E
--- a/tests/classifiers/test_cv.py
+++ b/tests/classifiers/test_cv.py
@@ -7,20 +7,37 @@ from numpy.linalg import norm
 from sklearn.preprocessing import StandardScaler
 from sklearn.svm import SVC

+from miplearn.classifiers import ScikitLearnClassifier
 from miplearn.classifiers.cv import CrossValidatedClassifier

 E = 0.1


-def test_cv():
+def test_cv() -> None:
    # Training set: label is true if point is inside a 2D circle
-    x_train = np.array([[x1, x2] for x1 in range(-10, 11) for x2 in range(-10, 11)])
+    x_train = np.array(
+        [
+            [
+                x1,
+                x2,
+            ]
+            for x1 in range(-10, 11)
+            for x2 in range(-10, 11)
+        ]
+    )
    x_train = StandardScaler().fit_transform(x_train)
    n_samples = x_train.shape[0]
-
    y_train = np.array(
        [
-            1.0 if x1 * x1 + x2 * x2 <= 100 else 0.0
+            [
+                False,
+                True,
+            ]
+            if x1 * x1 + x2 * x2 <= 100
+            else [
+                True,
+                False,
+            ]
            for x1 in range(-10, 11)
            for x2 in range(-10, 11)
        ]
@@ -29,24 +46,39 @@ def test_cv():
    # Support vector machines with linear kernels do not perform well on this
    # data set, so predictor should return the given constant.
    clf = CrossValidatedClassifier(
-        classifier=SVC(probability=True, random_state=42),
+        classifier=lambda: ScikitLearnClassifier(
+            SVC(
+                probability=True,
+                random_state=42,
+            )
+        ),
        threshold=0.90,
-        constant=0.0,
+        constant=[True, False],
        cv=30,
    )
    clf.fit(x_train, y_train)
    proba = clf.predict_proba(x_train)
+    assert isinstance(proba, np.ndarray)
+    assert proba.shape == (n_samples, 2)
+
    y_pred = (proba[:, 1] > 0.5).astype(float)
    assert norm(np.zeros(n_samples) - y_pred) < E

    # Support vector machines with quadratic kernels perform almost perfectly
    # on this data set, so predictor should return their prediction.
    clf = CrossValidatedClassifier(
-        classifier=SVC(probability=True, kernel="poly", degree=2, random_state=42),
+        classifier=lambda: ScikitLearnClassifier(
+            SVC(
+                probability=True,
+                kernel="poly",
+                degree=2,
+                random_state=42,
+            )
+        ),
        threshold=0.90,
        cv=30,
    )
    clf.fit(x_train, y_train)
    proba = clf.predict_proba(x_train)
    y_pred = (proba[:, 1] > 0.5).astype(float)
-    assert norm(y_train - y_pred) < E
+    assert norm(y_train[:, 1] - y_pred) < E