From 677c3540f10a038277699030dff9249a42f0bb58 Mon Sep 17 00:00:00 2001
From: Alinson S Xavier <axavier@anl.gov>
Date: Tue, 14 Apr 2020 19:33:33 -0500
Subject: [PATCH] Implement CrossValidatedClassifier

---
 src/python/miplearn/classifiers/__init__.py   |  8 +++
 src/python/miplearn/classifiers/cv.py         | 71 +++++++++++++++++++
 .../miplearn/classifiers/tests/test_cv.py     | 46 ++++++++++++
 3 files changed, 125 insertions(+)
 create mode 100644 src/python/miplearn/classifiers/cv.py
 create mode 100644 src/python/miplearn/classifiers/tests/test_cv.py

diff --git a/src/python/miplearn/classifiers/__init__.py b/src/python/miplearn/classifiers/__init__.py
index 5f1d883..a8dc64b 100644
--- a/src/python/miplearn/classifiers/__init__.py
+++ b/src/python/miplearn/classifiers/__init__.py
@@ -4,6 +4,8 @@
 
 from abc import ABC, abstractmethod
 
+import numpy as np
+
 
 class Classifier(ABC):
     @abstractmethod
@@ -14,6 +16,12 @@ class Classifier(ABC):
     def predict_proba(self, x_test):
         pass
 
+    def predict(self, x_test):
+        """Label each row 1.0 if its second probability column is > 0.5, else 0.0."""
+        scores = self.predict_proba(x_test)
+        assert isinstance(scores, np.ndarray) and scores.shape == (x_test.shape[0], 2)
+        return np.where(scores[:, 1] > 0.5, 1.0, 0.0)
+
 
 class Regressor(ABC):
     @abstractmethod
diff --git a/src/python/miplearn/classifiers/cv.py b/src/python/miplearn/classifiers/cv.py
new file mode 100644
index 0000000..01d9c8f
--- /dev/null
+++ b/src/python/miplearn/classifiers/cv.py
@@ -0,0 +1,71 @@
+#  MIPLearn: Extensible Framework for Learning-Enhanced Mixed-Integer Optimization
+#  Copyright (C) 2020, UChicago Argonne, LLC. All rights reserved.
+#  Released under the modified BSD license. See COPYING.md for more details.
+
+from copy import deepcopy
+
+import numpy as np
+from miplearn.classifiers import Classifier
+from sklearn.dummy import DummyClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import cross_val_score
+
+import logging
+logger = logging.getLogger(__name__)  # fit() logs its keep/discard decision here
+
+
+class CrossValidatedClassifier(Classifier):
+    """
+    A meta-classifier that, upon training, evaluates the performance of another
+    classifier on the training data set using k-fold cross validation, then
+    either adopts the other classifier if the cv-score is high enough, or
+    returns a constant label for every x_test otherwise.
+
+    The threshold is specified in comparison to a dummy classifier trained
+    on the same dataset. For example, a threshold of 0.0 indicates that any
+    classifier as good as the dummy predictor is acceptable. A threshold of 1.0
+    indicates that only classifiers with a perfect cross-validation score are
+    acceptable. Other numbers are a linear interpolation of these two extremes.
+    """
+
+    def __init__(self, classifier=LogisticRegression(), threshold=0.75,
+                 constant=0.0, cv=5, scoring='accuracy'):
+        # `classifier` serves only as a prototype: fit() works on a deep copy,
+        # so the object passed in (or the shared default) is never fitted.
+        self.classifier = None  # model used by predict_proba(); set by fit()
+        self.classifier_prototype = classifier
+        self.constant = constant  # label of the fallback DummyClassifier
+        self.threshold = threshold
+        self.cv = cv  # passed through to cross_val_score
+        self.scoring = scoring  # passed through to cross_val_score
+
+    def fit(self, x_train, y_train):
+        """Cross-validate the prototype, then train it or the constant fallback."""
+        # Calculate dummy score and absolute score threshold. For 0/1 labels,
+        # dummy_score is the accuracy of always predicting the majority label.
+        y_train_avg = np.average(y_train)
+        dummy_score = max(y_train_avg, 1 - y_train_avg)
+        absolute_threshold = 1. * self.threshold + dummy_score * (1 - self.threshold)
+
+        # Calculate cross validation score and decide which classifier to use
+        clf = deepcopy(self.classifier_prototype)
+        fold_scores = cross_val_score(clf, x_train, y_train,
+                                      cv=self.cv, scoring=self.scoring)
+        cv_score = float(np.mean(fold_scores))
+        if cv_score >= absolute_threshold:
+            # Lazy %-args: the message is only formatted if DEBUG is enabled
+            logger.debug("cv_score is above threshold (%.2f >= %.2f); keeping",
+                         cv_score, absolute_threshold)
+            self.classifier = clf
+        else:
+            logger.debug("cv_score is below threshold (%.2f < %.2f); discarding",
+                         cv_score, absolute_threshold)
+            self.classifier = DummyClassifier(strategy="constant",
+                                              constant=self.constant)
+
+        # Train chosen classifier
+        self.classifier.fit(x_train, y_train)
+
+    def predict_proba(self, x_test):
+        """Delegate to the classifier chosen by fit(); fit() must be called first."""
+        return self.classifier.predict_proba(x_test)
diff --git a/src/python/miplearn/classifiers/tests/test_cv.py b/src/python/miplearn/classifiers/tests/test_cv.py
new file mode 100644
index 0000000..87c9fa8
--- /dev/null
+++ b/src/python/miplearn/classifiers/tests/test_cv.py
@@ -0,0 +1,46 @@
+#  MIPLearn: Extensible Framework for Learning-Enhanced Mixed-Integer Optimization
+#  Copyright (C) 2020, UChicago Argonne, LLC. All rights reserved.
+#  Released under the modified BSD license. See COPYING.md for more details.
+
+import numpy as np
+from miplearn.classifiers.cv import CrossValidatedClassifier
+from numpy.linalg import norm
+from sklearn.preprocessing import StandardScaler
+from sklearn.svm import SVC
+
+E = 0.1  # upper bound on norm(expected - predicted) in the assertions below
+
+
+def test_cv():
+    """Check that CrossValidatedClassifier keeps or discards its classifier."""
+    # Training set: label is true if point is inside a 2D circle
+    grid = np.array([[x1, x2]
+                     for x1 in range(-10, 11)
+                     for x2 in range(-10, 11)])
+    # Labels are computed on the raw integer grid, before standardization
+    y_train = (grid[:, 0] ** 2 + grid[:, 1] ** 2 <= 100).astype(float)
+    x_train = StandardScaler().fit_transform(grid)
+    n_samples = x_train.shape[0]
+
+    # An SVC left on its default kernel (no `kernel` argument is passed) is
+    # expected to score below the threshold on this data set, so the
+    # predictor should return the given constant.
+    clf = CrossValidatedClassifier(classifier=SVC(probability=True,
+                                                  random_state=42),
+                                   threshold=0.90,
+                                   constant=0.0,
+                                   cv=30)
+    clf.fit(x_train, y_train)
+    assert norm(np.zeros(n_samples) - clf.predict(x_train)) < E
+
+    # Support vector machines with quadratic kernels perform almost perfectly
+    # on this data set, so predictor should return their prediction.
+    clf = CrossValidatedClassifier(classifier=SVC(probability=True,
+                                                  kernel='poly',
+                                                  degree=2,
+                                                  random_state=42),
+                                   threshold=0.90,
+                                   cv=30)
+    clf.fit(x_train, y_train)
+    # norm < E (0.1) means every prediction matches its 0/1 label exactly
+    assert norm(y_train - clf.predict(x_train)) < E