From 522f3a7e180631d2cbf5ca2130b78991d93675b0 Mon Sep 17 00:00:00 2001
From: "Alinson S. Xavier" <git@axavier.org>
Date: Tue, 22 Feb 2022 15:21:56 -0600
Subject: [PATCH] Change LearningSolver.solve and fit to take data filenames

---
 miplearn/problems/stab.py                  |  14 ++
 miplearn/solvers/learning.py               | 156 ++++++++++++---------
 tests/components/test_dynamic_user_cuts.py |   6 +-
 tests/components/test_objective.py         |   6 +-
 tests/components/test_primal.py            |   6 +-
 tests/instance/test_file.py                |   2 +-
 tests/problems/test_knapsack.py            |   2 +-
 tests/problems/test_stab.py                |   2 +-
 tests/problems/test_tsp.py                 |   8 +-
 tests/solvers/test_learning_solver.py      |  62 +++++---
 10 files changed, 157 insertions(+), 107 deletions(-)

diff --git a/miplearn/problems/stab.py b/miplearn/problems/stab.py
index 97e5559..caa74c2 100644
--- a/miplearn/problems/stab.py
+++ b/miplearn/problems/stab.py
@@ -131,3 +131,17 @@ class MaxWeightStableSetGenerator:
 
     def _generate_graph(self) -> Graph:
         return nx.generators.random_graphs.binomial_graph(self.n.rvs(), self.p.rvs())
+
+
+def build_stab_model(data: MaxWeightStableSetData) -> pe.ConcreteModel:
+    """Build a Pyomo model of the max-weight stable set problem for `data`."""
+    vertices = list(data.graph.nodes)
+    stab = pe.ConcreteModel()
+    stab.x = pe.Var(vertices, domain=pe.Binary)
+    total_weight = sum(stab.x[v] * data.weights[v] for v in vertices)
+    stab.OBJ = pe.Objective(expr=total_weight, sense=pe.maximize)
+    # One constraint per clique found by networkx: at most one member chosen.
+    stab.clique_eqs = pe.ConstraintList()
+    for members in nx.find_cliques(data.graph):
+        stab.clique_eqs.add(sum(stab.x[v] for v in members) <= 1)
+    return stab
diff --git a/miplearn/solvers/learning.py b/miplearn/solvers/learning.py
index 753a228..1232a14 100644
--- a/miplearn/solvers/learning.py
+++ b/miplearn/solvers/learning.py
@@ -5,10 +5,12 @@
 import logging
 import time
 import traceback
-from typing import Optional, List, Any, cast, Dict, Tuple
+from typing import Optional, List, Any, cast, Dict, Tuple, Callable, IO
 
+from overrides import overrides
 from p_tqdm import p_map
 
+from miplearn.features.sample import Hdf5Sample, Sample
 from miplearn.components.component import Component
 from miplearn.components.dynamic_lazy import DynamicLazyConstraintsComponent
 from miplearn.components.dynamic_user_cuts import UserCutsComponent
@@ -16,15 +18,44 @@ from miplearn.components.objective import ObjectiveValueComponent
 from miplearn.components.primal import PrimalSolutionComponent
 from miplearn.features.extractor import FeaturesExtractor
 from miplearn.instance.base import Instance
-from miplearn.instance.picklegz import PickleGzInstance
 from miplearn.solvers import _RedirectOutput
 from miplearn.solvers.internal import InternalSolver
 from miplearn.solvers.pyomo.gurobi import GurobiPyomoSolver
 from miplearn.types import LearningSolveStats
+import gzip
+import pickle
+from os.path import exists
 
 logger = logging.getLogger(__name__)
 
 
+class InstanceWrapper(Instance):
+    """Instance backed by a `.pkl.gz` data file and a sibling `.h5` sample."""
+
+    def __init__(self, data_filename: str, build_model: Callable):
+        super().__init__()
+        if not data_filename.endswith(".pkl.gz"):  # an assert vanishes under -O
+            raise ValueError(f"expected a .pkl.gz file: {data_filename}")
+        self.filename = data_filename
+        self.sample_filename = data_filename[: -len(".pkl.gz")] + ".h5"  # suffix only
+        mode = "r+" if exists(self.sample_filename) else "w"
+        self.sample = Hdf5Sample(self.sample_filename, mode=mode)
+        self.build_model = build_model
+
+    @overrides
+    def to_model(self) -> Any:
+        with gzip.GzipFile(self.filename, "rb") as file:
+            return self.build_model(pickle.load(cast(IO[bytes], file)))
+
+    @overrides
+    def create_sample(self) -> Sample:
+        return self.sample
+
+    @overrides
+    def get_samples(self) -> List[Sample]:
+        return [self.sample]
+
+
 class _GlobalVariables:
     def __init__(self) -> None:
         self.solver: Optional[LearningSolver] = None
@@ -47,7 +78,7 @@ def _parallel_solve(
     assert solver is not None
     assert instances is not None
     try:
-        stats = solver.solve(
+        stats = solver._solve(
             instances[idx],
             discard_output=discard_outputs,
         )
@@ -86,11 +117,6 @@ class LearningSolver:
         option should be activated if the LP relaxation is not very
         expensive to solve and if it provides good hints for the integer
         solution.
-    simulate_perfect: bool
-        If true, each call to solve actually performs three actions: solve
-        the original problem, train the ML models on the data that was just
-        collected, and solve the problem again. This is useful for evaluating
-        the theoretical performance of perfect ML models.
     """
 
     def __init__(
@@ -100,7 +126,6 @@ class LearningSolver:
         solver: Optional[InternalSolver] = None,
         use_lazy_cb: bool = False,
         solve_lp: bool = True,
-        simulate_perfect: bool = False,
         extractor: Optional[FeaturesExtractor] = None,
         extract_lhs: bool = True,
         extract_sa: bool = True,
@@ -117,7 +142,6 @@ class LearningSolver:
         self.internal_solver: Optional[InternalSolver] = None
         self.internal_solver_prototype: InternalSolver = solver
         self.mode: str = mode
-        self.simulate_perfect: bool = simulate_perfect
         self.solve_lp: bool = solve_lp
         self.tee = False
         self.use_lazy_cb: bool = use_lazy_cb
@@ -139,6 +163,44 @@ class LearningSolver:
         discard_output: bool = False,
         tee: bool = False,
     ) -> LearningSolveStats:
+        """
+        Solves the given instance. If trained machine-learning models are
+        available, they will be used to accelerate the solution process.
+
+        The argument `instance` may be either an Instance object or a
+        filename pointing to a pickled Instance object.
+
+        This method adds a new training sample to `instance.training_sample`.
+        If a filename is provided, then the file is modified in-place. That is,
+        the original file is overwritten.
+
+        If `solver.solve_lp` is False, the properties lp_solution and
+        lp_value will be set to dummy values.
+
+        Parameters
+        ----------
+        instance: Instance
+            The instance to be solved.
+        model: Any
+            The corresponding Pyomo model. If not provided, it will be created.
+        discard_output: bool
+            If True, do not write the modified instances anywhere; simply discard
+            them. Useful during benchmarking.
+        tee: bool
+            If true, prints solver log to screen.
+
+        Returns
+        -------
+        LearningSolveStats
+            A dictionary of solver statistics containing at least the following
+            keys: "Lower bound", "Upper bound", "Wallclock time", "Nodes",
+            "Sense", "Log", "Warm start value" and "LP value".
+
+            Additional components may generate additional keys. For example,
+            ObjectiveValueComponent adds the keys "Predicted LB" and
+            "Predicted UB". See the documentation of each component for more
+            details.
+        """
 
         # Generate model
         # -------------------------------------------------------
@@ -299,65 +361,19 @@ class LearningSolver:
 
     def solve(
         self,
-        instance: Instance,
-        model: Any = None,
-        discard_output: bool = False,
-        tee: bool = False,
-    ) -> LearningSolveStats:
-        """
-        Solves the given instance. If trained machine-learning models are
-        available, they will be used to accelerate the solution process.
-
-        The argument `instance` may be either an Instance object or a
-        filename pointing to a pickled Instance object.
-
-        This method adds a new training sample to `instance.training_sample`.
-        If a filename is provided, then the file is modified in-place. That is,
-        the original file is overwritten.
-
-        If `solver.solve_lp_first` is False, the properties lp_solution and
-        lp_value will be set to dummy values.
-
-        Parameters
-        ----------
-        instance: Instance
-            The instance to be solved.
-        model: Any
-            The corresponding Pyomo model. If not provided, it will be created.
-        discard_output: bool
-            If True, do not write the modified instances anywhere; simply discard
-            them. Useful during benchmarking.
-        tee: bool
-            If true, prints solver log to screen.
-
-        Returns
-        -------
-        LearningSolveStats
-            A dictionary of solver statistics containing at least the following
-            keys: "Lower bound", "Upper bound", "Wallclock time", "Nodes",
-            "Sense", "Log", "Warm start value" and "LP value".
+        filenames: List[str],
+        build_model: Callable,
+        tee: bool = True,
+    ) -> List[LearningSolveStats]:
+        """Solve each data file in order and return one stats entry per file."""
+        return [
+            self._solve(InstanceWrapper(path, build_model), tee=tee)
+            for path in filenames
+        ]
 
-            Additional components may generate additional keys. For example,
-            ObjectiveValueComponent adds the keys "Predicted LB" and
-            "Predicted UB". See the documentation of each component for more
-            details.
-        """
-        if self.simulate_perfect:
-            if not isinstance(instance, PickleGzInstance):
-                raise Exception("Not implemented")
-            self._solve(
-                instance=instance,
-                model=model,
-                tee=tee,
-            )
-            self.fit([instance])
-            instance.instance = None
-        return self._solve(
-            instance=instance,
-            model=model,
-            discard_output=discard_output,
-            tee=tee,
-        )
+    def fit(self, filenames: List[str], build_model: Callable) -> None:
+        """Wrap each data file in an InstanceWrapper and pass them to `_fit`."""
+        self._fit([InstanceWrapper(path, build_model) for path in filenames])
 
     def parallel_solve(
         self,
@@ -394,7 +410,7 @@ class LearningSolver:
             `[solver.solve(p) for p in instances]`
         """
         if n_jobs == 1:
-            return [self.solve(p) for p in instances]
+            return [self._solve(p) for p in instances]
         else:
             self.internal_solver = None
             self._silence_miplearn_logger()
@@ -415,7 +431,7 @@ class LearningSolver:
             self._restore_miplearn_logger()
             return stats
 
-    def fit(
+    def _fit(
         self,
         training_instances: List[Instance],
         n_jobs: int = 1,
diff --git a/tests/components/test_dynamic_user_cuts.py b/tests/components/test_dynamic_user_cuts.py
index 10e688d..f8b3a5f 100644
--- a/tests/components/test_dynamic_user_cuts.py
+++ b/tests/components/test_dynamic_user_cuts.py
@@ -87,7 +87,7 @@ def test_usage(
     stab_instance: Instance,
     solver: LearningSolver,
 ) -> None:
-    stats_before = solver.solve(stab_instance)
+    stats_before = solver._solve(stab_instance)
     sample = stab_instance.get_samples()[0]
     user_cuts_encoded = sample.get_scalar("mip_user_cuts")
     assert user_cuts_encoded is not None
@@ -97,8 +97,8 @@ def test_usage(
     assert stats_before["UserCuts: Added ahead-of-time"] == 0
     assert stats_before["UserCuts: Added in callback"] > 0
 
-    solver.fit([stab_instance])
-    stats_after = solver.solve(stab_instance)
+    solver._fit([stab_instance])
+    stats_after = solver._solve(stab_instance)
     assert (
         stats_after["UserCuts: Added ahead-of-time"]
         == stats_before["UserCuts: Added in callback"]
diff --git a/tests/components/test_objective.py b/tests/components/test_objective.py
index fc45083..f81eb8d 100644
--- a/tests/components/test_objective.py
+++ b/tests/components/test_objective.py
@@ -134,8 +134,8 @@ def test_sample_evaluate(sample: Sample) -> None:
 def test_usage() -> None:
     solver = LearningSolver(components=[ObjectiveValueComponent()])
     instance = GurobiPyomoSolver().build_test_instance_knapsack()
-    solver.solve(instance)
-    solver.fit([instance])
-    stats = solver.solve(instance)
+    solver._solve(instance)
+    solver._fit([instance])
+    stats = solver._solve(instance)
     assert stats["mip_lower_bound"] == stats["Objective: Predicted lower bound"]
     assert stats["mip_upper_bound"] == stats["Objective: Predicted upper bound"]
diff --git a/tests/components/test_primal.py b/tests/components/test_primal.py
index 83b1096..aa6074a 100644
--- a/tests/components/test_primal.py
+++ b/tests/components/test_primal.py
@@ -110,9 +110,9 @@ def test_usage() -> None:
     gen = TravelingSalesmanGenerator(n=randint(low=5, high=6))
     data = gen.generate(1)
     instance = TravelingSalesmanInstance(data[0].n_cities, data[0].distances)
-    solver.solve(instance)
-    solver.fit([instance])
-    stats = solver.solve(instance)
+    solver._solve(instance)
+    solver._fit([instance])
+    stats = solver._solve(instance)
     assert stats["Primal: Free"] == 0
     assert stats["Primal: One"] + stats["Primal: Zero"] == 10
     assert stats["mip_lower_bound"] == stats["mip_warm_start_value"]
diff --git a/tests/instance/test_file.py b/tests/instance/test_file.py
index bad2fc5..446cb2e 100644
--- a/tests/instance/test_file.py
+++ b/tests/instance/test_file.py
@@ -22,7 +22,7 @@ def test_usage() -> None:
 
     # Solve instance from disk
     solver = LearningSolver(solver=GurobiSolver())
-    solver.solve(FileInstance(filename))
+    solver._solve(FileInstance(filename))
 
     # Assert HDF5 contains training data
     sample = FileInstance(filename).get_samples()[0]
diff --git a/tests/problems/test_knapsack.py b/tests/problems/test_knapsack.py
index 59561d0..760b58c 100644
--- a/tests/problems/test_knapsack.py
+++ b/tests/problems/test_knapsack.py
@@ -36,4 +36,4 @@ def test_knapsack() -> None:
         weights=data[0].weights,
     )
     solver = LearningSolver()
-    solver.solve(instance)
+    solver._solve(instance)
diff --git a/tests/problems/test_stab.py b/tests/problems/test_stab.py
index e04a5e0..27a2e78 100644
--- a/tests/problems/test_stab.py
+++ b/tests/problems/test_stab.py
@@ -15,7 +15,7 @@ def test_stab() -> None:
     weights = np.array([1.0, 1.0, 1.0, 1.0, 1.0])
     instance = MaxWeightStableSetInstance(graph, weights)
     solver = LearningSolver()
-    stats = solver.solve(instance)
+    stats = solver._solve(instance)
     assert stats["mip_lower_bound"] == 2.0
 
 
diff --git a/tests/problems/test_tsp.py b/tests/problems/test_tsp.py
index 5c6fbc8..f3cc510 100644
--- a/tests/problems/test_tsp.py
+++ b/tests/problems/test_tsp.py
@@ -40,7 +40,7 @@ def test_instance() -> None:
     )
     instance = TravelingSalesmanInstance(n_cities, distances)
     solver = LearningSolver()
-    solver.solve(instance)
+    solver._solve(instance)
     assert len(instance.get_samples()) == 1
     sample = instance.get_samples()[0]
     assert_equals(sample.get_array("mip_var_values"), [1.0, 0.0, 1.0, 1.0, 0.0, 1.0])
@@ -63,7 +63,7 @@ def test_subtour() -> None:
     distances = squareform(pdist(cities))
     instance = TravelingSalesmanInstance(n_cities, distances)
     solver = LearningSolver()
-    solver.solve(instance)
+    solver._solve(instance)
     samples = instance.get_samples()
     assert len(samples) == 1
     sample = samples[0]
@@ -96,5 +96,5 @@ def test_subtour() -> None:
             1.0,
         ],
     )
-    solver.fit([instance])
-    solver.solve(instance)
+    solver._fit([instance])
+    solver._solve(instance)
diff --git a/tests/solvers/test_learning_solver.py b/tests/solvers/test_learning_solver.py
index 97fcf47..02d08f6 100644
--- a/tests/solvers/test_learning_solver.py
+++ b/tests/solvers/test_learning_solver.py
@@ -5,19 +5,27 @@
 import logging
 import os
 import tempfile
+from os.path import exists
 from typing import List, cast
 
 import dill
+from scipy.stats import randint
 
+from miplearn.features.sample import Hdf5Sample
 from miplearn.instance.base import Instance
-from miplearn.instance.picklegz import PickleGzInstance, write_pickle_gz, read_pickle_gz
-from miplearn.solvers.gurobi import GurobiSolver
+from miplearn.instance.picklegz import (
+    PickleGzInstance,
+    write_pickle_gz,
+    read_pickle_gz,
+    save,
+)
+from miplearn.problems.stab import MaxWeightStableSetGenerator, build_stab_model
 from miplearn.solvers.internal import InternalSolver
 from miplearn.solvers.learning import LearningSolver
+from miplearn.solvers.tests import assert_equals
 
 # noinspection PyUnresolvedReferences
 from tests.solvers.test_internal_solver import internal_solvers
-from miplearn.solvers.tests import assert_equals
 
 logger = logging.getLogger(__name__)
 
@@ -34,7 +42,7 @@ def test_learning_solver(
                 mode=mode,
             )
 
-            solver.solve(instance)
+            solver._solve(instance)
             assert len(instance.get_samples()) > 0
             sample = instance.get_samples()[0]
 
@@ -55,8 +63,8 @@ def test_learning_solver(
             assert lp_log is not None
             assert len(lp_log) > 100
 
-            solver.fit([instance], n_jobs=4)
-            solver.solve(instance)
+            solver._fit([instance], n_jobs=4)
+            solver._solve(instance)
 
             # Assert solver is picklable
             with tempfile.TemporaryFile() as file:
@@ -73,9 +81,9 @@ def test_solve_without_lp(
             solver=internal_solver,
             solve_lp=False,
         )
-        solver.solve(instance)
-        solver.fit([instance])
-        solver.solve(instance)
+        solver._solve(instance)
+        solver._fit([instance])
+        solver._solve(instance)
 
 
 def test_parallel_solve(
@@ -104,7 +112,7 @@ def test_solve_fit_from_disk(
 
         # Test: solve
         solver = LearningSolver(solver=internal_solver)
-        solver.solve(instances[0])
+        solver._solve(instances[0])
         instance_loaded = read_pickle_gz(cast(PickleGzInstance, instances[0]).filename)
         assert len(instance_loaded.get_samples()) > 0
 
@@ -119,17 +127,29 @@ def test_solve_fit_from_disk(
             os.remove(cast(PickleGzInstance, instance).filename)
 
 
-def test_simulate_perfect() -> None:
-    internal_solver = GurobiSolver()
-    instance = internal_solver.build_test_instance_knapsack()
-    with tempfile.NamedTemporaryFile(suffix=".pkl", delete=False) as tmp:
-        write_pickle_gz(instance, tmp.name)
-        solver = LearningSolver(
-            solver=internal_solver,
-            simulate_perfect=True,
-        )
-        stats = solver.solve(PickleGzInstance(tmp.name))
-        assert stats["mip_lower_bound"] == stats["Objective: Predicted lower bound"]
+def test_basic_usage() -> None:
+    """End-to-end check of the file-based `solve` / `fit` workflow."""
+    with tempfile.TemporaryDirectory() as dirname:
+        # Four generated instances: three for training, one for testing.
+        generator = MaxWeightStableSetGenerator(n=randint(low=20, high=21))
+        instances = generator.generate(4)
+        train_files = save(instances[0:3], f"{dirname}/train")
+        test_files = save(instances[3:4], f"{dirname}/test")
+
+        # Solving must yield one stats entry and one HDF5 sample per file.
+        solver = LearningSolver()
+        train_stats = solver.solve(train_files, build_stab_model)
+        assert len(train_stats) == 3
+        for data_file in train_files:
+            h5_file = data_file.replace(".pkl.gz", ".h5")
+            assert exists(h5_file)
+            assert Hdf5Sample(h5_file).get_scalar("mip_lower_bound") > 0
+
+        # After fitting, solving an unseen file reports the predicted bound.
+        solver.fit(train_files, build_stab_model)
+
+        test_stats = solver.solve(test_files, build_stab_model)
+        assert "Objective: Predicted lower bound" in test_stats[0].keys()
 
 
 def test_gap() -> None: