Start refactoring of classifiers

master
Alinson S. Xavier 5 years ago
parent b87ef651e1
commit 8dba65dd9c

Makefile
@@ -43,6 +43,6 @@ reformat:
 test:
 	$(MYPY) -p miplearn
 	$(MYPY) -p tests
-	$(PYTEST) $(PYTEST_ARGS)
+	$(PYTEST) $(PYTEST_ARGS) tests/classifiers
 .PHONY: test test-watch docs install

docs/customization.md
@@ -152,25 +152,18 @@ dtype: float64

 ### Using customized ML classifiers and regressors

-By default, given a training set of instances, MIPLearn trains a fixed set of ML classifiers and regressors, then
-selects the best one based on cross-validation performance. Alternatively, the user may specify which ML model a component
-should use through the `classifier` or `regressor` constructor parameters. The provided classifiers and regressors must
-follow the sklearn API. In particular, classifiers must provide the methods `fit`, `predict_proba` and `predict`,
-while regressors must provide the methods `fit` and `predict`.
-
-!!! danger
-    MIPLearn must be able to generate a copy of any custom ML classifiers and regressors through
-    the standard `copy.deepcopy` method. This currently makes it incompatible with Keras and TensorFlow
-    predictors. This is a known limitation, which will be addressed in a future version.
-
-The example below shows how to construct a `PrimalSolutionComponent` which internally uses
-sklearn's `KNeighborsClassifier`. Any other sklearn classifier or pipeline can be used.
+By default, given a training set of instances, MIPLearn trains a fixed set of ML classifiers and regressors, then selects the best one based on cross-validation performance. Alternatively, the user may specify which ML model a component should use through the `classifier` or `regressor` constructor parameters. Scikit-learn classifiers and regressors are currently supported. A future version of the package will add compatibility with Keras models.
+
+The example below shows how to construct a `PrimalSolutionComponent` which internally uses scikit-learn's `KNeighborsClassifier`. Any other scikit-learn classifier or pipeline can be used. The classifier must be provided as a lambda function, because the component may need to create multiple copies of it, and it must be wrapped in `ScikitLearnClassifier`, which ensures that the proper data transformations are applied.

 ```python
-from miplearn import PrimalSolutionComponent
+from miplearn import PrimalSolutionComponent, ScikitLearnClassifier
 from sklearn.neighbors import KNeighborsClassifier

-comp = PrimalSolutionComponent(classifier=KNeighborsClassifier(n_neighbors=5))
+comp = PrimalSolutionComponent(
+    classifier=lambda: ScikitLearnClassifier(
+        KNeighborsClassifier(n_neighbors=5),
+    ),
+)
 comp.fit(train_instances)
 ```

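Since the documentation states that any scikit-learn classifier or pipeline can be used, a minimal sketch of the pipeline case may be helpful. This variant is not part of the commit; the `make_pipeline`/`SVC` choices are illustrative only, and `SVC` needs `probability=True` for `predict_proba` to work:

```python
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

from miplearn import PrimalSolutionComponent, ScikitLearnClassifier

# Illustrative only: an entire scikit-learn pipeline is wrapped exactly
# like a plain classifier. The factory lambda lets the component create
# as many independent copies as it needs.
comp = PrimalSolutionComponent(
    classifier=lambda: ScikitLearnClassifier(
        make_pipeline(
            StandardScaler(),
            SVC(probability=True),  # probability=True enables predict_proba
        ),
    ),
)
comp.fit(train_instances)  # train_instances as in the documented example above
```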
miplearn/__init__.py
@@ -3,7 +3,11 @@
 # Released under the modified BSD license. See COPYING.md for more details.

 from .benchmark import BenchmarkRunner
-from .classifiers import Classifier, Regressor
+from .classifiers import (
+    Classifier,
+    Regressor,
+    ScikitLearnClassifier,
+)
 from .classifiers.adaptive import AdaptiveClassifier
 from .classifiers.threshold import MinPrecisionThreshold
 from .components.component import Component

miplearn/classifiers/__init__.py
@@ -3,7 +3,7 @@
 # Released under the modified BSD license. See COPYING.md for more details.

 from abc import ABC, abstractmethod
-from typing import Optional
+from typing import Optional, Any

 import numpy as np
@@ -14,7 +14,7 @@ class Classifier(ABC):
     data.
     """

-    def __init__(self):
+    def __init__(self) -> None:
         self.n_features: Optional[int] = None
         self.n_classes: Optional[int] = None
@@ -77,7 +77,7 @@ class Regressor(ABC):
     values of other variables.
     """

-    def __init__(self):
+    def __init__(self) -> None:
         self.n_inputs: Optional[int] = None

     @abstractmethod
@@ -128,3 +128,38 @@ class Regressor(ABC):
         (n_samples, n_inputs_x) = x_test.shape
         assert n_inputs_x == self.n_inputs
         return np.ndarray([])
+
+
+class ScikitLearnClassifier(Classifier):
+    """
+    Wrapper for scikit-learn classifiers, which makes sure inputs and outputs have
+    the correct dimensions and types.
+    """
+
+    def __init__(self, clf: Any) -> None:
+        super().__init__()
+        self.inner_clf = clf
+
+    def fit(self, x_train: np.ndarray, y_train: np.ndarray) -> None:
+        super().fit(x_train, y_train)
+        (n_samples, n_classes) = y_train.shape
+        assert n_classes == 2, "scikit-learn classifiers must have exactly two classes"
+        self.inner_clf.fit(x_train, y_train[:, 1])
+
+    def predict_proba(self, x_test: np.ndarray) -> np.ndarray:
+        super().predict_proba(x_test)
+        n_samples = x_test.shape[0]
+        sklearn_proba = self.inner_clf.predict_proba(x_test)
+        if isinstance(sklearn_proba, list):
+            assert len(sklearn_proba) == self.n_classes
+            for pb in sklearn_proba:
+                assert isinstance(pb, np.ndarray)
+                assert pb.dtype in [np.float16, np.float32, np.float64]
+                assert pb.shape == (n_samples, 2)
+            proba = np.hstack([pb[:, [1]] for pb in sklearn_proba])
+            assert proba.shape == (n_samples, self.n_classes)
+            return proba
+        else:
+            assert isinstance(sklearn_proba, np.ndarray)
+            assert sklearn_proba.shape == (n_samples, 2)
+            return sklearn_proba

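A short usage sketch of the wrapper introduced above, assuming the two-column one-hot label convention that its `fit` method asserts:

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

from miplearn.classifiers import ScikitLearnClassifier

# Labels must be one-hot encoded with exactly two columns; the wrapper
# trains the inner classifier on the second column only.
x_train = np.array([[0.0], [0.1], [0.9], [1.0]])
y_train = np.array(
    [
        [True, False],
        [True, False],
        [False, True],
        [False, True],
    ]
)
clf = ScikitLearnClassifier(LogisticRegression())
clf.fit(x_train, y_train)
proba = clf.predict_proba(np.array([[0.05], [0.95]]))
assert proba.shape == (2, 2)  # one [P(False), P(True)] row per sample
```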
miplearn/classifiers/adaptive.py
@@ -29,9 +29,6 @@ class AdaptiveClassifier(Classifier):
         candidates: Dict[str, Any] = None,
         evaluator: ClassifierEvaluator = ClassifierEvaluator(),
     ) -> None:
-        """
-        Initializes the meta-classifier.
-        """
         if candidates is None:
             candidates = {
                 "knn(100)": {

miplearn/classifiers/counting.py
@@ -1,6 +1,7 @@
 # MIPLearn: Extensible Framework for Learning-Enhanced Mixed-Integer Optimization
 # Copyright (C) 2020, UChicago Argonne, LLC. All rights reserved.
 # Released under the modified BSD license. See COPYING.md for more details.
+from typing import Optional, cast

 import numpy as np
@@ -9,20 +10,33 @@ from miplearn.classifiers import Classifier

 class CountingClassifier(Classifier):
     """
-    A classifier that generates constant predictions, based only on the
-    frequency of the training labels. For example, if y_train is [1.0, 0.0, 0.0]
-    this classifier always returns [0.66 0.33] for any x_test. It essentially
-    counts how many times each label appeared, hence the name.
+    A classifier that generates constant predictions, based only on the frequency of
+    the training labels. For example, suppose `y_train` is given by:
+
+    ```python
+    y_train = np.array([
+        [True, False],
+        [False, True],
+        [False, True],
+    ])
+    ```
+
+    Then `predict_proba` always returns `[0.33 0.66]` for every sample, regardless of
+    `x_train`. It essentially counts how many times each label appeared, hence the name.
     """

     def __init__(self) -> None:
-        self.mean = None
+        super().__init__()
+        self.mean: Optional[np.ndarray] = None

-    def fit(self, x_train, y_train):
-        self.mean = np.mean(y_train)
+    def fit(self, x_train: np.ndarray, y_train: np.ndarray) -> None:
+        super().fit(x_train, y_train)
+        self.mean = cast(np.ndarray, np.mean(y_train, axis=0))

-    def predict_proba(self, x_test):
-        return np.array([[1 - self.mean, self.mean] for _ in range(x_test.shape[0])])
+    def predict_proba(self, x_test: np.ndarray) -> np.ndarray:
+        super().predict_proba(x_test)
+        n_samples = x_test.shape[0]
+        return np.array([self.mean for _ in range(n_samples)])

     def __repr__(self):
         return "CountingClassifier(mean=%s)" % self.mean

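A sketch of the behavior the new docstring describes, assuming the module lives at `miplearn.classifiers.counting` (consistent with the test file further below):

```python
import numpy as np

from miplearn.classifiers.counting import CountingClassifier  # path assumed

x_train = np.zeros((3, 10))  # features are ignored entirely
y_train = np.array(
    [
        [True, False],
        [False, True],
        [False, True],
    ]
)
clf = CountingClassifier()
clf.fit(x_train, y_train)  # stores the column means, roughly [0.33, 0.67]
print(clf.predict_proba(np.zeros((2, 10))))
# [[0.333... 0.666...]
#  [0.333... 0.666...]]
```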
miplearn/classifiers/cv.py
@@ -3,14 +3,14 @@
 # Released under the modified BSD license. See COPYING.md for more details.

 import logging
-from copy import deepcopy
+from typing import Optional, Callable, List

 import numpy as np
 from sklearn.dummy import DummyClassifier
 from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import cross_val_score

-from miplearn.classifiers import Classifier
+from miplearn.classifiers import Classifier, ScikitLearnClassifier

 logger = logging.getLogger(__name__)
@@ -18,35 +18,58 @@ logger = logging.getLogger(__name__)

 class CrossValidatedClassifier(Classifier):
     """
     A meta-classifier that, upon training, evaluates the performance of another
-    classifier on the training data set using k-fold cross validation, then
-    either adopts the other classifier if the cv-score is high enough, or
-    returns a constant label for every x_test otherwise.
-
-    The threshold is specified in comparison to a dummy classifier trained
-    on the same dataset. For example, a threshold of 0.0 indicates that any
-    classifier as good as the dummy predictor is acceptable. A threshold of 1.0
-    indicates that only classifiers with a perfect cross-validation score are
-    acceptable. Other numbers are a linear interpolation of these two extremes.
+    candidate classifier on the training data set, using k-fold cross validation,
+    then either adopts it, if its cv-score is high enough, or returns constant
+    predictions for every x_test otherwise.
+
+    Parameters
+    ----------
+    classifier: Callable[[], ScikitLearnClassifier]
+        A callable that constructs the candidate classifier.
+    threshold: float
+        Number from zero to one indicating how well the candidate classifier must
+        perform to be adopted. The threshold is specified in comparison to a dummy
+        classifier trained on the same dataset. For example, a threshold of 0.0
+        indicates that any classifier as good as the dummy predictor is acceptable. A
+        threshold of 1.0 indicates that only classifiers with perfect
+        cross-validation scores are acceptable. Other numbers are a linear
+        interpolation of these two extremes.
+    constant: Optional[List[bool]]
+        If the candidate classifier fails to meet the threshold, use a dummy classifier
+        which always returns this prediction instead. The list should have exactly as
+        many elements as the number of columns of `y_train` provided to `fit`.
+    cv: int
+        Number of folds.
+    scoring: str
+        Scoring function.
     """

     def __init__(
         self,
-        classifier=LogisticRegression(),
-        threshold=0.75,
-        constant=0.0,
-        cv=5,
-        scoring="accuracy",
+        classifier: Callable[[], ScikitLearnClassifier] = (
+            lambda: ScikitLearnClassifier(LogisticRegression())
+        ),
+        threshold: float = 0.75,
+        constant: Optional[List[bool]] = None,
+        cv: int = 5,
+        scoring: str = "accuracy",
     ):
-        """"""
         super().__init__()
-        self.classifier = None
-        self.classifier_prototype = classifier
-        self.constant = constant
+        if constant is None:
+            constant = [True, False]
+        self.n_classes = len(constant)
+        self.classifier: Optional[ScikitLearnClassifier] = None
+        self.classifier_factory = classifier
+        self.constant: List[bool] = constant
         self.threshold = threshold
         self.cv = cv
         self.scoring = scoring

-    def fit(self, x_train, y_train):
-        # super().fit(x_train, y_train)
+    def fit(self, x_train: np.ndarray, y_train: np.ndarray) -> None:
+        super().fit(x_train, y_train)
+        (n_samples, n_classes) = y_train.shape
+        assert n_classes == self.n_classes

         # Calculate dummy score and absolute score threshold
         y_train_avg = np.average(y_train)
@@ -54,13 +77,20 @@ class CrossValidatedClassifier(Classifier):
         absolute_threshold = 1.0 * self.threshold + dummy_score * (1 - self.threshold)

         # Calculate cross validation score and decide which classifier to use
-        clf = deepcopy(self.classifier_prototype)
+        clf = self.classifier_factory()
+        assert clf is not None
+        assert isinstance(clf, ScikitLearnClassifier), (
+            f"The provided classifier callable must return a ScikitLearnClassifier. "
+            f"Found {clf.__class__.__name__} instead. If this is a scikit-learn "
+            f"classifier, you must wrap it with ScikitLearnClassifier."
+        )
         cv_score = float(
             np.mean(
                 cross_val_score(
-                    clf,
+                    clf.inner_clf,
                     x_train,
-                    y_train,
+                    y_train[:, 1],
                     cv=self.cv,
                     scoring=self.scoring,
                 )
@@ -77,14 +107,19 @@ class CrossValidatedClassifier(Classifier):
                 "cv_score is below threshold (%.2f < %.2f); discarding"
                 % (cv_score, absolute_threshold)
             )
-            self.classifier = DummyClassifier(
-                strategy="constant",
-                constant=self.constant,
+            self.classifier = ScikitLearnClassifier(
+                DummyClassifier(
+                    strategy="constant",
+                    constant=self.constant[1],
+                )
             )

         # Train chosen classifier
+        assert self.classifier is not None
+        assert isinstance(self.classifier, ScikitLearnClassifier)
         self.classifier.fit(x_train, y_train)

-    def predict_proba(self, x_test):
-        # super().predict_proba(x_test)
+    def predict_proba(self, x_test: np.ndarray) -> np.ndarray:
+        super().predict_proba(x_test)
+        assert self.classifier is not None
         return self.classifier.predict_proba(x_test)

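To make the threshold semantics concrete, here is the interpolation `fit` performs, worked through with an illustrative dummy score (the 0.60 is made up; in `fit` it is computed from `y_train`):

```python
# absolute_threshold interpolates between the dummy score (threshold=0.0)
# and a perfect score of 1.0 (threshold=1.0):
threshold = 0.75
dummy_score = 0.60  # illustrative value only
absolute_threshold = 1.0 * threshold + dummy_score * (1 - threshold)
print(absolute_threshold)  # 0.9: the candidate needs cv_score >= 0.9 to be adopted
```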
tests/classifiers/test_counting.py
@@ -12,7 +12,27 @@ E = 0.1

 def test_counting():
     clf = CountingClassifier()
-    clf.fit(np.zeros((8, 25)), [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0])
-    expected_proba = np.array([[0.375, 0.625], [0.375, 0.625]])
-    actual_proba = clf.predict_proba(np.zeros((2, 25)))
-    assert norm(actual_proba - expected_proba) < E
+    n_features = 25
+    x_train = np.zeros((8, n_features))
+    y_train = np.array(
+        [
+            [True, False, False],
+            [True, False, False],
+            [False, True, False],
+            [True, False, False],
+            [False, True, False],
+            [False, True, False],
+            [False, True, False],
+            [False, False, True],
+        ]
+    )
+    x_test = np.zeros((2, n_features))
+    y_expected = np.array(
+        [
+            [3 / 8.0, 4 / 8.0, 1 / 8.0],
+            [3 / 8.0, 4 / 8.0, 1 / 8.0],
+        ]
+    )
+    clf.fit(x_train, y_train)
+    y_actual = clf.predict_proba(x_test)
+    assert norm(y_actual - y_expected) < E

tests/classifiers/test_cv.py
@@ -7,20 +7,37 @@ from numpy.linalg import norm
 from sklearn.preprocessing import StandardScaler
 from sklearn.svm import SVC

+from miplearn.classifiers import ScikitLearnClassifier
 from miplearn.classifiers.cv import CrossValidatedClassifier

 E = 0.1


-def test_cv():
+def test_cv() -> None:
     # Training set: label is true if point is inside a 2D circle
-    x_train = np.array([[x1, x2] for x1 in range(-10, 11) for x2 in range(-10, 11)])
+    x_train = np.array(
+        [
+            [
+                x1,
+                x2,
+            ]
+            for x1 in range(-10, 11)
+            for x2 in range(-10, 11)
+        ]
+    )
     x_train = StandardScaler().fit_transform(x_train)
     n_samples = x_train.shape[0]
     y_train = np.array(
         [
-            1.0 if x1 * x1 + x2 * x2 <= 100 else 0.0
+            [
+                False,
+                True,
+            ]
+            if x1 * x1 + x2 * x2 <= 100
+            else [
+                True,
+                False,
+            ]
             for x1 in range(-10, 11)
             for x2 in range(-10, 11)
         ]
@@ -29,24 +46,39 @@ def test_cv():

     # Support vector machines with linear kernels do not perform well on this
     # data set, so predictor should return the given constant.
     clf = CrossValidatedClassifier(
-        classifier=SVC(probability=True, random_state=42),
+        classifier=lambda: ScikitLearnClassifier(
+            SVC(
+                probability=True,
+                random_state=42,
+            )
+        ),
         threshold=0.90,
-        constant=0.0,
+        constant=[True, False],
         cv=30,
     )
     clf.fit(x_train, y_train)
     proba = clf.predict_proba(x_train)
+    assert isinstance(proba, np.ndarray)
+    assert proba.shape == (n_samples, 2)
     y_pred = (proba[:, 1] > 0.5).astype(float)
     assert norm(np.zeros(n_samples) - y_pred) < E

     # Support vector machines with quadratic kernels perform almost perfectly
     # on this data set, so predictor should return their prediction.
     clf = CrossValidatedClassifier(
-        classifier=SVC(probability=True, kernel="poly", degree=2, random_state=42),
+        classifier=lambda: ScikitLearnClassifier(
+            SVC(
+                probability=True,
+                kernel="poly",
+                degree=2,
+                random_state=42,
+            )
+        ),
         threshold=0.90,
         cv=30,
     )
     clf.fit(x_train, y_train)
     proba = clf.predict_proba(x_train)
     y_pred = (proba[:, 1] > 0.5).astype(float)
-    assert norm(y_train - y_pred) < E
+    assert norm(y_train[:, 1] - y_pred) < E
