From eb9a677136eec77639540970dfc0405bfe4a980f Mon Sep 17 00:00:00 2001 From: Alinson S Xavier Date: Wed, 22 Jan 2020 16:56:47 -0600 Subject: [PATCH] Implement LogisticWarmStartPredictor with tests --- miplearn/solvers.py | 9 +-- miplearn/tests/test_warmstart_logistic.py | 64 +++++++++++++++++ miplearn/warmstart.py | 85 ++++++++++++++++------- 3 files changed, 127 insertions(+), 31 deletions(-) create mode 100644 miplearn/tests/test_warmstart_logistic.py diff --git a/miplearn/solvers.py b/miplearn/solvers.py index e28089b..0edb6d7 100644 --- a/miplearn/solvers.py +++ b/miplearn/solvers.py @@ -2,9 +2,8 @@ # Copyright (C) 2019-2020 Argonne National Laboratory. All rights reserved. # Written by Alinson S. Xavier -# from .warmstart import WarmStartPredictor from .transformers import PerVariableTransformer -from .warmstart import WarmStartPredictor +from .warmstart import LogisticWarmStartPredictor import pyomo.environ as pe import numpy as np @@ -17,9 +16,11 @@ class LearningSolver: def __init__(self, threads=4, - parent_solver=pe.SolverFactory('cbc')): + parent_solver=pe.SolverFactory('cbc'), + ws_predictor_factory=LogisticWarmStartPredictor): self.parent_solver = parent_solver self.parent_solver.options["threads"] = threads + self.ws_predictor_factory = ws_predictor_factory self.x_train = {} self.y_train = {} self.ws_predictors = {} @@ -75,7 +76,7 @@ class LearningSolver: for category in x_train_dict.keys(): x_train = x_train_dict[category] y_train = y_train_dict[category] - self.ws_predictors[category] = WarmStartPredictor() + self.ws_predictors[category] = self.ws_predictor_factory() self.ws_predictors[category].fit(x_train, y_train) def _solve(self, model, tee=False): diff --git a/miplearn/tests/test_warmstart_logistic.py b/miplearn/tests/test_warmstart_logistic.py new file mode 100644 index 0000000..bfbe85b --- /dev/null +++ b/miplearn/tests/test_warmstart_logistic.py @@ -0,0 +1,64 @@ +# MIPLearn: A Machine-Learning Framework for Mixed-Integer Optimization +# 
Copyright (C) 2019-2020 Argonne National Laboratory. All rights reserved. +# Written by Alinson S. Xavier + +from miplearn.warmstart import LogisticWarmStartPredictor +from sklearn.metrics import accuracy_score, precision_score +import numpy as np + + +def _generate_dataset(ground_truth, n_samples=10_000): + x_train = np.random.rand(n_samples,5) + x_test = np.random.rand(n_samples,5) + y_train = ground_truth(x_train) + y_test = ground_truth(x_test) + return x_train, y_train, x_test, y_test + + +def _is_sum_greater_than_two(x): + y = (np.sum(x, axis=1) > 2.0).astype(int) + return np.vstack([y, 1 - y]).transpose() + + +def _always_zero(x): + y = np.zeros((1, x.shape[0])) + return np.vstack([y, 1 - y]).transpose() + + +def _random_values(x): + y = np.random.randint(2, size=x.shape[0]) + return np.vstack([y, 1 - y]).transpose() + + +def test_logistic_ws_with_balanced_labels(): + x_train, y_train, x_test, y_test = _generate_dataset(_is_sum_greater_than_two) + ws = LogisticWarmStartPredictor() + ws.fit(x_train, y_train) + y_pred = ws.predict(x_test) + assert accuracy_score(y_test[:,0], y_pred[:,0]) > 0.99 + assert accuracy_score(y_test[:,1], y_pred[:,1]) > 0.99 + + +def test_logistic_ws_with_unbalanced_labels(): + x_train, y_train, x_test, y_test = _generate_dataset(_always_zero) + ws = LogisticWarmStartPredictor() + ws.fit(x_train, y_train) + y_pred = ws.predict(x_test) + assert accuracy_score(y_test[:,0], y_pred[:,0]) == 1.0 + assert accuracy_score(y_test[:,1], y_pred[:,1]) == 1.0 + + +def test_logistic_ws_with_unpredictable_labels(): + x_train, y_train, x_test, y_test = _generate_dataset(_random_values) + ws = LogisticWarmStartPredictor() + ws.fit(x_train, y_train) + y_pred = ws.predict(x_test) + assert np.sum(y_pred) == 0 + + +def test_logistic_ws_with_small_sample_size(): + x_train, y_train, x_test, y_test = _generate_dataset(_random_values, n_samples=3) + ws = LogisticWarmStartPredictor() + ws.fit(x_train, y_train) + y_pred = ws.predict(x_test) + assert 
np.sum(y_pred) == 0 diff --git a/miplearn/warmstart.py b/miplearn/warmstart.py index 7156919..4ebb594 100644 --- a/miplearn/warmstart.py +++ b/miplearn/warmstart.py @@ -2,42 +2,73 @@ # Copyright (C) 2019-2020 Argonne National Laboratory. All rights reserved. # Written by Alinson S. Xavier +from abc import ABC, abstractmethod import numpy as np from sklearn.pipeline import make_pipeline from sklearn.linear_model import LogisticRegression from sklearn.preprocessing import StandardScaler +from sklearn.model_selection import cross_val_score - -class WarmStartPredictor: - def __init__(self, - thr_fix_zero=0.05, - thr_fix_one=0.95, - thr_predict=0.95): - self.model = None - self.thr_predict = thr_predict - self.thr_fix_zero = thr_fix_zero - self.thr_fix_one = thr_fix_one - +class WarmStartPredictor(ABC): + def __init__(self): + self.models = [None, None] + def fit(self, x_train, y_train): assert isinstance(x_train, np.ndarray) assert isinstance(y_train, np.ndarray) - assert y_train.shape[1] == 2 assert y_train.shape[0] == x_train.shape[0] - y_hat = np.average(y_train[:, 1]) - if y_hat < self.thr_fix_zero or y_hat > self.thr_fix_one: - self.model = int(y_hat) - else: - self.model = make_pipeline(StandardScaler(), LogisticRegression()) - self.model.fit(x_train, y_train[:, 1].astype(int)) + assert y_train.shape[1] == 2 + for i in [0,1]: + self.models[i] = self._fit(x_train, y_train[:, i], i) def predict(self, x_test): assert isinstance(x_test, np.ndarray) - if isinstance(self.model, int): - p_test = np.array([[1 - self.model, self.model] - for _ in range(x_test.shape[0])]) - else: - p_test = self.model.predict_proba(x_test) - p_test[p_test < self.thr_predict] = 0 - p_test[p_test > 0] = 1 - p_test = p_test.astype(int) - return p_test + y_pred = np.zeros((x_test.shape[0], 2), dtype=np.int) + for i in [0,1]: + if isinstance(self.models[i], int): + y_pred[:, i] = self.models[i] + else: + y_pred[:, i] = self.models[i].predict(x_test) + return y_pred + + @abstractmethod + def 
_fit(self, x_train, y_train, label): + pass + + +class LogisticWarmStartPredictor(WarmStartPredictor): + def __init__(self, + min_samples=100, + thr_fix=[0.99, 0.99], + thr_balance=[0.95, 0.95], + thr_score=[0.95, 0.95]): + super().__init__() + self.min_samples = min_samples + self.thr_fix = thr_fix + self.thr_balance = thr_balance + self.thr_score = thr_score + + def _fit(self, x_train, y_train, label): + y_train_avg = np.average(y_train) + + # If number of samples is too small, don't predict anything. + if x_train.shape[0] < self.min_samples: + return 0 + + # If vast majority of observations are true, always return true. + if y_train_avg > self.thr_fix[label]: + return 1 + + # If dataset is not balanced enough, don't predict anything. + if y_train_avg < (1 - self.thr_balance[label]) or y_train_avg > self.thr_balance[label]: + return 0 + + reg = make_pipeline(StandardScaler(), LogisticRegression()) + reg_score = np.mean(cross_val_score(reg, x_train, y_train, cv=5)) + + # If cross-validation score is too low, don't predict anything. + if reg_score < self.thr_score[label]: + return 0 + + reg.fit(x_train, y_train.astype(int)) + return reg \ No newline at end of file