From eb9a677136eec77639540970dfc0405bfe4a980f Mon Sep 17 00:00:00 2001
From: Alinson S Xavier <axavier@anl.gov>
Date: Wed, 22 Jan 2020 16:56:47 -0600
Subject: [PATCH] Implement LogisticWarmStartPredictor with tests

---
 miplearn/solvers.py                       |  9 +--
 miplearn/tests/test_warmstart_logistic.py | 64 +++++++++++++++++
 miplearn/warmstart.py                     | 85 ++++++++++++++++-------
 3 files changed, 127 insertions(+), 31 deletions(-)
 create mode 100644 miplearn/tests/test_warmstart_logistic.py

diff --git a/miplearn/solvers.py b/miplearn/solvers.py
index e28089b..0edb6d7 100644
--- a/miplearn/solvers.py
+++ b/miplearn/solvers.py
@@ -2,9 +2,8 @@
 # Copyright (C) 2019-2020 Argonne National Laboratory. All rights reserved.
 # Written by Alinson S. Xavier <axavier@anl.gov>
 
-# from .warmstart import WarmStartPredictor
 from .transformers import PerVariableTransformer
-from .warmstart import WarmStartPredictor
+from .warmstart import LogisticWarmStartPredictor
 import pyomo.environ as pe
 import numpy as np
 
@@ -17,9 +16,11 @@ class LearningSolver:
 
     def __init__(self,
                  threads=4,
-                 parent_solver=pe.SolverFactory('cbc')):
+                 parent_solver=pe.SolverFactory('cbc'),
+                 ws_predictor_factory=LogisticWarmStartPredictor):
         self.parent_solver = parent_solver
         self.parent_solver.options["threads"] = threads
+        self.ws_predictor_factory = ws_predictor_factory
         self.x_train = {}
         self.y_train = {}
         self.ws_predictors = {}
@@ -75,7 +76,7 @@ class LearningSolver:
         for category in x_train_dict.keys():
             x_train = x_train_dict[category]
             y_train = y_train_dict[category]
-            self.ws_predictors[category] = WarmStartPredictor()
+            self.ws_predictors[category] = self.ws_predictor_factory()
             self.ws_predictors[category].fit(x_train, y_train)
 
     def _solve(self, model, tee=False):
diff --git a/miplearn/tests/test_warmstart_logistic.py b/miplearn/tests/test_warmstart_logistic.py
new file mode 100644
index 0000000..bfbe85b
--- /dev/null
+++ b/miplearn/tests/test_warmstart_logistic.py
@@ -0,0 +1,64 @@
+# MIPLearn: A Machine-Learning Framework for Mixed-Integer Optimization
+# Copyright (C) 2019-2020 Argonne National Laboratory. All rights reserved.
+# Written by Alinson S. Xavier <axavier@anl.gov>
+
+from miplearn.warmstart import LogisticWarmStartPredictor
+from sklearn.metrics import accuracy_score, precision_score
+import numpy as np
+
+
+def _generate_dataset(ground_truth, n_samples=10_000):
+    """Sample random 5-feature train/test sets and label both with ground_truth."""
+    x_train, x_test = (np.random.rand(n_samples, 5) for _ in range(2))
+    y_train = ground_truth(x_train)
+    y_test = ground_truth(x_test)
+    return x_train, y_train, x_test, y_test
+
+
+def _is_sum_greater_than_two(x):
+    above = (np.sum(x, axis=1) > 2.0).astype(int)  # 1 where the row sum exceeds 2
+    return np.column_stack((above, 1 - above))  # columns: [label, complement]
+
+
+def _always_zero(x):
+    zeros = np.zeros(x.shape[0])  # float labels, one zero per row of x
+    return np.column_stack((zeros, 1 - zeros))  # every row is [0., 1.]
+
+
+def _random_values(x):
+    coin = np.random.randint(2, size=x.shape[0])  # drawn independently of x
+    return np.column_stack((coin, 1 - coin))  # columns: [label, complement]
+    
+    
+def test_logistic_ws_with_balanced_labels():
+    # Learnable target: both label columns must score above 0.99 accuracy.
+    x_train, y_train, x_test, y_test = _generate_dataset(_is_sum_greater_than_two)
+    predictor = LogisticWarmStartPredictor()
+    predictor.fit(x_train, y_train)
+    y_pred = predictor.predict(x_test)
+    assert all(accuracy_score(y_test[:, k], y_pred[:, k]) > 0.99 for k in (0, 1))
+    
+    
+def test_logistic_ws_with_unbalanced_labels():
+    # Constant labels: each label column must be reproduced exactly.
+    x_train, y_train, x_test, y_test = _generate_dataset(_always_zero)
+    predictor = LogisticWarmStartPredictor()
+    predictor.fit(x_train, y_train)
+    y_pred = predictor.predict(x_test)
+    assert all(accuracy_score(y_test[:, k], y_pred[:, k]) == 1.0 for k in (0, 1))
+
+    
+def test_logistic_ws_with_unpredictable_labels():
+    # Labels are random draws, so the predictor must not predict any 1s.
+    x_train, y_train, x_test, _ = _generate_dataset(_random_values)
+    predictor = LogisticWarmStartPredictor()
+    predictor.fit(x_train, y_train)
+    assert predictor.predict(x_test).sum() == 0
+
+    
+def test_logistic_ws_with_small_sample_size():
+    # Only 3 training samples: the predictor must not predict any 1s.
+    x_train, y_train, x_test, _ = _generate_dataset(_random_values, n_samples=3)
+    predictor = LogisticWarmStartPredictor()
+    predictor.fit(x_train, y_train)
+    assert predictor.predict(x_test).sum() == 0
diff --git a/miplearn/warmstart.py b/miplearn/warmstart.py
index 7156919..4ebb594 100644
--- a/miplearn/warmstart.py
+++ b/miplearn/warmstart.py
@@ -2,42 +2,73 @@
 # Copyright (C) 2019-2020 Argonne National Laboratory. All rights reserved.
 # Written by Alinson S. Xavier <axavier@anl.gov>
 
+from abc import ABC, abstractmethod
 import numpy as np
 from sklearn.pipeline import make_pipeline
 from sklearn.linear_model import LogisticRegression
 from sklearn.preprocessing import StandardScaler
+from sklearn.model_selection import cross_val_score
 
-
-class WarmStartPredictor:
-    def __init__(self,
-                 thr_fix_zero=0.05,
-                 thr_fix_one=0.95,
-                 thr_predict=0.95):
-        self.model = None
-        self.thr_predict = thr_predict
-        self.thr_fix_zero = thr_fix_zero
-        self.thr_fix_one = thr_fix_one
-
+class WarmStartPredictor(ABC):  # base class; subclasses implement _fit
+    def __init__(self):
+        self.models = [None, None]  # one entry per column of y_train
+
     def fit(self, x_train, y_train):
         assert isinstance(x_train, np.ndarray)
         assert isinstance(y_train, np.ndarray)
-        assert y_train.shape[1] == 2
         assert y_train.shape[0] == x_train.shape[0]
-        y_hat = np.average(y_train[:, 1])
-        if y_hat < self.thr_fix_zero or y_hat > self.thr_fix_one:
-            self.model = int(y_hat)
-        else:
-            self.model = make_pipeline(StandardScaler(), LogisticRegression())
-            self.model.fit(x_train, y_train[:, 1].astype(int))
+        assert y_train.shape[1] == 2
+        for i in [0, 1]:
+            self.models[i] = self._fit(x_train, y_train[:, i], i)
 
     def predict(self, x_test):
         assert isinstance(x_test, np.ndarray)
-        if isinstance(self.model, int):
-            p_test = np.array([[1 - self.model, self.model]
-                               for _ in range(x_test.shape[0])])
-        else:
-            p_test = self.model.predict_proba(x_test)
-        p_test[p_test < self.thr_predict] = 0
-        p_test[p_test > 0] = 1
-        p_test = p_test.astype(int)
-        return p_test
+        y_pred = np.zeros((x_test.shape[0], 2), dtype=int)  # np.int was removed in NumPy 1.24
+        for i in [0, 1]:
+            if isinstance(self.models[i], int):
+                y_pred[:, i] = self.models[i]  # constant prediction for this column
+            else:
+                y_pred[:, i] = self.models[i].predict(x_test)
+        return y_pred
+
+    @abstractmethod
+    def _fit(self, x_train, y_train, label):
+        """Return an int constant or an object with predict() for column `label`."""
+
+
+class LogisticWarmStartPredictor(WarmStartPredictor):
+    """Per-label predictor: a constant 0/1 or a scaled logistic regression."""
+
+    def __init__(self, min_samples=100, thr_fix=(0.99, 0.99),
+                 thr_balance=(0.95, 0.95), thr_score=(0.95, 0.95)):
+        super().__init__()
+        self.min_samples = min_samples
+        # Copy the thresholds so no list is shared between instances/callers.
+        self.thr_fix = list(thr_fix)
+        self.thr_balance = list(thr_balance)
+        self.thr_score = list(thr_score)
+
+    def _fit(self, x_train, y_train, label):
+        """Return 0, 1 or a fitted pipeline for the label-th target column."""
+        y_train_avg = np.average(y_train)
+        # If number of samples is too small, don't predict anything.
+        if x_train.shape[0] < self.min_samples:
+            return 0
+
+        # If vast majority of observations are true, always return true.
+        if y_train_avg > self.thr_fix[label]:
+            return 1
+
+        # If dataset is not balanced enough, don't predict anything.
+        if y_train_avg < (1 - self.thr_balance[label]) or y_train_avg > self.thr_balance[label]:
+            return 0
+
+        reg = make_pipeline(StandardScaler(), LogisticRegression())
+        reg_score = np.mean(cross_val_score(reg, x_train, y_train, cv=5))
+
+        # If cross-validation score is too low, don't predict anything.
+        if reg_score < self.thr_score[label]:
+            return 0
+
+        reg.fit(x_train, y_train.astype(int))
+        return reg
\ No newline at end of file