Update

2025-12-06 01:18:52 -06:00 · 2022-06-01 11:40:48 -05:00
parent 3fd252659e
commit 6cc253a903
15 changed files with 591 additions and 275 deletions
--- a/8
+++ b/8
@@ -2,7 +2,7 @@ PYTHON      := python3
 PYTEST      := pytest
 PIP         := $(PYTHON) -m pip
 MYPY        := $(PYTHON) -m mypy
-PYTEST_ARGS := -W ignore::DeprecationWarning -vv --log-level=DEBUG
+PYTEST_ARGS := -W ignore::DeprecationWarning -vv --log-level=DEBUG tests
 VERSION     := 0.2

 all: docs test
@@ -41,9 +41,9 @@ reformat:
 	$(PYTHON) -m black .

 test:
-	rm -rf .mypy_cache
-	$(MYPY) -p miplearn
-	$(MYPY) -p tests
+	# rm -rf .mypy_cache
+	# $(MYPY) -p miplearn
+	# $(MYPY) -p tests
 	$(PYTEST) $(PYTEST_ARGS) 

 .PHONY: test test-watch docs install dist
--- a/miplearn/benchmark.py
+++ b/miplearn/benchmark.py
@@ -4,13 +4,13 @@

 import logging
 import os
-from typing import Dict, List, Any, Optional
+from typing import Dict, List, Any, Optional, Callable

 import pandas as pd

 from miplearn.components.component import Component
 from miplearn.instance.base import Instance
-from miplearn.solvers.learning import LearningSolver
+from miplearn.solvers.learning import LearningSolver, FileInstanceWrapper
 from miplearn.solvers.pyomo.gurobi import GurobiPyomoSolver
 from sklearn.utils._testing import ignore_warnings
 from sklearn.exceptions import ConvergenceWarning
@@ -43,37 +43,24 @@ class BenchmarkRunner:

    def parallel_solve(
        self,
-        instances: List[Instance],
+        filenames: List[str],
+        build_model: Callable,
        n_jobs: int = 1,
        n_trials: int = 1,
        progress: bool = False,
    ) -> None:
-        """
-        Solves the given instances in parallel and collect benchmark statistics.
-
-        Parameters
-        ----------
-        instances: List[Instance]
-            List of instances to solve. This can either be a list of instances
-            already loaded in memory, or a list of filenames pointing to pickled (and
-            optionally gzipped) files.
-        n_jobs: int
-            List of instances to solve in parallel at a time.
-        n_trials: int
-            How many times each instance should be solved.
-        """
        self._silence_miplearn_logger()
-        trials = instances * n_trials
+        trials = filenames * n_trials
        for (solver_name, solver) in self.solvers.items():
            results = solver.parallel_solve(
                trials,
+                build_model,
                n_jobs=n_jobs,
-                label="solve (%s)" % solver_name,
-                discard_outputs=True,
+                label="benchmark (%s)" % solver_name,
                progress=progress,
            )
            for i in range(len(trials)):
-                idx = i % len(instances)
+                idx = i % len(filenames)
                results[i]["Solver"] = solver_name
                results[i]["Instance"] = idx
                self.results = self.results.append(pd.DataFrame([results[i]]))
@@ -93,21 +80,15 @@ class BenchmarkRunner:

    def fit(
        self,
-        instances: List[Instance],
+        filenames: List[str],
+        build_model: Callable,
+        progress: bool = False,
        n_jobs: int = 1,
-        progress: bool = True,
    ) -> None:
-        """
-        Trains all solvers with the provided training instances.
-
-        Parameters
-        ----------
-        instances:  List[Instance]
-            List of training instances.
-        n_jobs: int
-            Number of parallel processes to use.
-        """
-        components: List[Component] = []
+        components = []
+        instances: List[Instance] = [
+            FileInstanceWrapper(f, build_model, mode="r") for f in filenames
+        ]
        for (solver_name, solver) in self.solvers.items():
            if solver_name == "baseline":
                continue
@@ -128,6 +109,114 @@ class BenchmarkRunner:
        miplearn_logger = logging.getLogger("miplearn")
        miplearn_logger.setLevel(self.prev_log_level)

+    def write_svg(
+        self,
+        output: Optional[str] = None,
+    ) -> None:
+        import matplotlib.pyplot as plt
+        import pandas as pd
+        import seaborn as sns
+
+        sns.set_style("whitegrid")
+        sns.set_palette("Blues_r")
+        groups = self.results.groupby("Instance")
+        best_lower_bound = groups["mip_lower_bound"].transform("max")
+        best_upper_bound = groups["mip_upper_bound"].transform("min")
+        self.results["Relative lower bound"] = self.results["mip_lower_bound"] / best_lower_bound
+        self.results["Relative upper bound"] = self.results["mip_upper_bound"] / best_upper_bound
+
+        if (self.results["mip_sense"] == "min").any():
+            primal_column = "Relative upper bound"
+            obj_column = "mip_upper_bound"
+            predicted_obj_column = "Objective: Predicted upper bound"
+        else:
+            primal_column = "Relative lower bound"
+            obj_column = "mip_lower_bound"
+            predicted_obj_column = "Objective: Predicted lower bound"
+
+        palette = {
+            "baseline": "#9b59b6",
+            "ml-exact": "#3498db",
+            "ml-heuristic": "#95a5a6",
+        }
+        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(
+            nrows=2,
+            ncols=2,
+            figsize=(8, 8),
+        )
+
+        # Wallclock time
+        sns.stripplot(
+            x="Solver",
+            y="mip_wallclock_time",
+            data=self.results,
+            ax=ax1,
+            jitter=0.25,
+            palette=palette,
+            size=2.0,
+        )
+        sns.barplot(
+            x="Solver",
+            y="mip_wallclock_time",
+            data=self.results,
+            ax=ax1,
+            errwidth=0.0,
+            alpha=0.4,
+            palette=palette,
+        )
+        ax1.set(ylabel="Wallclock time (s)")
+
+        # Gap
+        sns.stripplot(
+            x="Solver",
+            y="Gap",
+            jitter=0.25,
+            data=self.results[self.results["Solver"] != "ml-heuristic"],
+            ax=ax2,
+            palette=palette,
+            size=2.0,
+        )
+        ax2.set(ylabel="Relative MIP gap")
+
+        # Relative primal bound
+        sns.stripplot(
+            x="Solver",
+            y=primal_column,
+            jitter=0.25,
+            data=self.results[self.results["Solver"] == "ml-heuristic"],
+            ax=ax3,
+            palette=palette,
+            size=2.0,
+        )
+        sns.scatterplot(
+            x=obj_column,
+            y=predicted_obj_column,
+            hue="Solver",
+            data=self.results[self.results["Solver"] == "ml-exact"],
+            ax=ax4,
+            palette=palette,
+            size=2.0,
+        )
+
+        # Predicted vs actual primal bound
+        xlim, ylim = ax4.get_xlim(), ax4.get_ylim()
+        ax4.plot(
+            [-1e10, 1e10],
+            [-1e10, 1e10],
+            ls="-",
+            color="#cccccc",
+        )
+        ax4.set_xlim(xlim)
+        ax4.set_ylim(xlim)
+        ax4.get_legend().remove()
+        ax4.set(
+            ylabel="Predicted optimal value",
+            xlabel="Actual optimal value",
+        )
+
+        fig.tight_layout()
+        plt.savefig(output)
+

@ignore_warnings(category=ConvergenceWarning)
 def run_benchmarks(
@@ -173,111 +262,3 @@ def run_benchmarks(
    plot(benchmark.results)


-def plot(
-    results: pd.DataFrame,
-    output: Optional[str] = None,
-) -> None:
-    import matplotlib.pyplot as plt
-    import pandas as pd
-    import seaborn as sns
-
-    sns.set_style("whitegrid")
-    sns.set_palette("Blues_r")
-    groups = results.groupby("Instance")
-    best_lower_bound = groups["mip_lower_bound"].transform("max")
-    best_upper_bound = groups["mip_upper_bound"].transform("min")
-    results["Relative lower bound"] = results["mip_lower_bound"] / best_lower_bound
-    results["Relative upper bound"] = results["mip_upper_bound"] / best_upper_bound
-
-    if (results["mip_sense"] == "min").any():
-        primal_column = "Relative upper bound"
-        obj_column = "mip_upper_bound"
-        predicted_obj_column = "Objective: Predicted upper bound"
-    else:
-        primal_column = "Relative lower bound"
-        obj_column = "mip_lower_bound"
-        predicted_obj_column = "Objective: Predicted lower bound"
-
-    palette = {
-        "baseline": "#9b59b6",
-        "ml-exact": "#3498db",
-        "ml-heuristic": "#95a5a6",
-    }
-    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(
-        nrows=2,
-        ncols=2,
-        figsize=(8, 8),
-    )
-
-    # Wallclock time
-    sns.stripplot(
-        x="Solver",
-        y="mip_wallclock_time",
-        data=results,
-        ax=ax1,
-        jitter=0.25,
-        palette=palette,
-        size=2.0,
-    )
-    sns.barplot(
-        x="Solver",
-        y="mip_wallclock_time",
-        data=results,
-        ax=ax1,
-        errwidth=0.0,
-        alpha=0.4,
-        palette=palette,
-    )
-    ax1.set(ylabel="Wallclock time (s)")
-
-    # Gap
-    sns.stripplot(
-        x="Solver",
-        y="Gap",
-        jitter=0.25,
-        data=results[results["Solver"] != "ml-heuristic"],
-        ax=ax2,
-        palette=palette,
-        size=2.0,
-    )
-    ax2.set(ylabel="Relative MIP gap")
-
-    # Relative primal bound
-    sns.stripplot(
-        x="Solver",
-        y=primal_column,
-        jitter=0.25,
-        data=results[results["Solver"] == "ml-heuristic"],
-        ax=ax3,
-        palette=palette,
-        size=2.0,
-    )
-    sns.scatterplot(
-        x=obj_column,
-        y=predicted_obj_column,
-        hue="Solver",
-        data=results[results["Solver"] == "ml-exact"],
-        ax=ax4,
-        palette=palette,
-        size=2.0,
-    )
-
-    # Predicted vs actual primal bound
-    xlim, ylim = ax4.get_xlim(), ax4.get_ylim()
-    ax4.plot(
-        [-1e10, 1e10],
-        [-1e10, 1e10],
-        ls="-",
-        color="#cccccc",
-    )
-    ax4.set_xlim(xlim)
-    ax4.set_ylim(ylim)
-    ax4.get_legend().remove()
-    ax4.set(
-        ylabel="Predicted value",
-        xlabel="Actual value",
-    )
-
-    fig.tight_layout()
-    if output is not None:
-        plt.savefig(output)
--- a/miplearn/classifiers/init.py
+++ b/miplearn/classifiers/init.py
@@ -38,7 +38,7 @@ class Classifier(ABC):
            np.float16,
            np.float32,
            np.float64,
-        ], f"x_train.dtype shoule be float. Found {x_train.dtype} instead."
+        ], f"x_train.dtype should be float. Found {x_train.dtype} instead."
        assert y_train.dtype == np.bool8
        assert len(x_train.shape) == 2
        assert len(y_train.shape) == 2
--- a/miplearn/classifiers/adaptive.py
+++ b/miplearn/classifiers/adaptive.py
@@ -6,8 +6,10 @@ import logging
 from typing import Dict, Optional

 import numpy as np
+from sklearn.ensemble import RandomForestClassifier
 from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import roc_auc_score
+from sklearn.model_selection import cross_val_predict
 from sklearn.neighbors import KNeighborsClassifier
 from sklearn.pipeline import make_pipeline
 from sklearn.preprocessing import StandardScaler
@@ -63,6 +65,15 @@ class AdaptiveClassifier(Classifier):
        super().__init__()
        if candidates is None:
            candidates = {
+                "forest(5,10)": CandidateClassifierSpecs(
+                    classifier=ScikitLearnClassifier(
+                        RandomForestClassifier(
+                            n_estimators=5,
+                            min_samples_split=10,
+                        ),
+                    ),
+                    min_samples=100,
+                ),
                "knn(100)": CandidateClassifierSpecs(
                    classifier=ScikitLearnClassifier(
                        KNeighborsClassifier(n_neighbors=100)
@@ -92,7 +103,7 @@ class AdaptiveClassifier(Classifier):

        # If almost all samples belong to the same class, return a fixed prediction and
        # skip all the other steps.
-        if y_train[:, 0].mean() > 0.999 or y_train[:, 1].mean() > 0.999:
+        if y_train[:, 0].mean() > 0.99 or y_train[:, 1].mean() > 0.99:
            self.classifier = CountingClassifier()
            self.classifier.fit(x_train, y_train)
            return
@@ -102,13 +113,17 @@ class AdaptiveClassifier(Classifier):
            if n_samples < specs.min_samples:
                continue
            clf = specs.classifier.clone()
+            if isinstance(clf, ScikitLearnClassifier):
+                proba = cross_val_predict(clf.inner_clf, x_train, y_train[:, 1])
+            else:
                clf.fit(x_train, y_train)
-            proba = clf.predict_proba(x_train)
-            # FIXME: Switch to k-fold cross validation
-            score = roc_auc_score(y_train[:, 1], proba[:, 1])
+                proba = clf.predict_proba(x_train)[:, 1]
+            score = roc_auc_score(y_train[:, 1], proba)
            if score > best_score:
                best_name, best_clf, best_score = name, clf, score
        logger.debug("Best classifier: %s (score=%.3f)" % (best_name, best_score))
+        if isinstance(best_clf, ScikitLearnClassifier):
+            best_clf.fit(x_train, y_train)
        self.classifier = best_clf

    def predict_proba(self, x_test: np.ndarray) -> np.ndarray:
--- a/miplearn/classifiers/threshold.py
+++ b/miplearn/classifiers/threshold.py
@@ -7,7 +7,10 @@ from typing import Optional, List

 import numpy as np
 from sklearn.metrics._ranking import _binary_clf_curve
+from sklearn.model_selection import cross_val_predict

+from miplearn.classifiers.sklearn import ScikitLearnClassifier
+from miplearn.classifiers.adaptive import AdaptiveClassifier
 from miplearn.classifiers import Classifier


@@ -95,6 +98,16 @@ class MinPrecisionThreshold(Threshold):
    ) -> None:
        super().fit(clf, x_train, y_train)
        (n_samples, n_classes) = y_train.shape
+        if isinstance(clf, AdaptiveClassifier) and isinstance(
+            clf.classifier, ScikitLearnClassifier
+        ):
+            proba = cross_val_predict(
+                clf.classifier.inner_clf,
+                x_train,
+                y_train[:, 1],
+                method="predict_proba",
+            )
+        else:
            proba = clf.predict_proba(x_train)
        self._computed_threshold = [
            self._compute(
@@ -114,11 +127,13 @@ class MinPrecisionThreshold(Threshold):
        y_actual: np.ndarray,
        y_prob: np.ndarray,
        min_precision: float,
+        min_recall: float = 0.1,
    ) -> float:
        fps, tps, thresholds = _binary_clf_curve(y_actual, y_prob)
        precision = tps / (tps + fps)
+        recall = tps / tps[-1]
        for k in reversed(range(len(precision))):
-            if precision[k] >= min_precision:
+            if precision[k] >= min_precision and recall[k] >= min_recall:
                return thresholds[k]
        return float("inf")

--- a/miplearn/components/dynamic_lazy.py
+++ b/miplearn/components/dynamic_lazy.py
@@ -1,20 +1,23 @@
 #  MIPLearn: Extensible Framework for Learning-Enhanced Mixed-Integer Optimization
 #  Copyright (C) 2020-2021, UChicago Argonne, LLC. All rights reserved.
 #  Released under the modified BSD license. See COPYING.md for more details.
+import json
 import logging
 from typing import Dict, List, TYPE_CHECKING, Tuple, Any, Optional

 import numpy as np
 from overrides import overrides
+from tqdm.auto import tqdm

 from miplearn.classifiers import Classifier
 from miplearn.classifiers.counting import CountingClassifier
 from miplearn.classifiers.threshold import MinProbabilityThreshold, Threshold
 from miplearn.components.component import Component
 from miplearn.components.dynamic_common import DynamicConstraintsComponent
-from miplearn.features.sample import Sample
+from miplearn.features.sample import Sample, Hdf5Sample
 from miplearn.instance.base import Instance
 from miplearn.types import LearningSolveStats, ConstraintName, ConstraintCategory
+from p_tqdm import p_map

 logger = logging.getLogger(__name__)

@@ -41,6 +44,7 @@ class DynamicLazyConstraintsComponent(Component):
        self.thresholds = self.dynamic.thresholds
        self.known_violations = self.dynamic.known_violations
        self.lazy_enforced: Dict[ConstraintName, Any] = {}
+        self.n_iterations: int = 0

    @staticmethod
    def enforce(
@@ -68,6 +72,7 @@ class DynamicLazyConstraintsComponent(Component):
        violations = {c: self.dynamic.known_violations[c] for c in vnames}
        logger.info("Enforcing %d lazy constraints..." % len(vnames))
        self.enforce(violations, instance, model, solver)
+        self.n_iterations = 0

    @overrides
    def after_solve_mip(
@@ -79,6 +84,8 @@ class DynamicLazyConstraintsComponent(Component):
        sample: Sample,
    ) -> None:
        sample.put_scalar("mip_constr_lazy", self.dynamic.encode(self.lazy_enforced))
+        stats["LazyDynamic: Added in callback"] = len(self.lazy_enforced)
+        stats["LazyDynamic: Iterations"] = self.n_iterations

    @overrides
    def iteration_cb(
@@ -90,12 +97,14 @@ class DynamicLazyConstraintsComponent(Component):
        assert solver.internal_solver is not None
        logger.debug("Finding violated lazy constraints...")
        violations = instance.find_violated_lazy_constraints(
-            solver.internal_solver, model
+            solver.internal_solver,
+            model,
        )
        if len(violations) == 0:
            logger.debug("No violations found")
            return False
        else:
+            self.n_iterations += 1
            for v in violations:
                self.lazy_enforced[v] = violations[v]
            logger.debug("    %d violations found" % len(violations))
@@ -142,3 +151,73 @@ class DynamicLazyConstraintsComponent(Component):
        sample: Sample,
    ) -> Dict[ConstraintCategory, Dict[str, float]]:
        return self.dynamic.sample_evaluate(instance, sample)
+
+    # ------------------------------------------------------------------------------------------------------------------
+    # NEW API
+    # ------------------------------------------------------------------------------------------------------------------
+    @staticmethod
+    def extract(filenames, progress=True, known_cids=None):
+        enforced_cids, features = [], []
+        freeze_known_cids = True
+        if known_cids is None:
+            known_cids = set()
+            freeze_known_cids = False
+        for filename in tqdm(
+            filenames,
+            desc="extract (1/2)",
+            disable=not progress,
+        ):
+            with Hdf5Sample(filename, mode="r") as sample:
+                features.append(sample.get_array("lp_var_values"))
+                cids = frozenset(
+                    DynamicConstraintsComponent.decode(
+                        sample.get_scalar("mip_constr_lazy")
+                    ).keys()
+                )
+                enforced_cids.append(cids)
+                if not freeze_known_cids:
+                    known_cids.update(cids)
+
+        x, y, cat, cdata = [], [], [], {}
+        for (j, cid) in enumerate(known_cids):
+            cdata[cid] = json.loads(cid.decode())
+            for i in range(len(features)):
+                cat.append(cid)
+                x.append(features[i])
+                if cid in enforced_cids[i]:
+                    y.append([0, 1])
+                else:
+                    y.append([1, 0])
+        x = np.vstack(x)
+        y = np.vstack(y)
+        cat = np.array(cat)
+        x_dict, y_dict = DynamicLazyConstraintsComponent._split(
+            x,
+            y,
+            cat,
+            progress=progress,
+        )
+        return x_dict, y_dict, cdata
+
+    @staticmethod
+    def _split(x, y, cat, progress=False):
+        # Sort data by categories
+        pi = np.argsort(cat, kind="stable")
+        x = x[pi]
+        y = y[pi]
+        cat = cat[pi]
+
+        # Split categories
+        x_dict = {}
+        y_dict = {}
+        start = 0
+        for end in tqdm(
+            range(len(cat) + 1),
+            desc="extract (2/2)",
+            disable=not progress,
+        ):
+            if (end >= len(cat)) or (cat[start] != cat[end]):
+                x_dict[cat[start]] = x[start:end, :]
+                y_dict[cat[start]] = y[start:end, :]
+                start = end
+        return x_dict, y_dict
--- a/miplearn/components/primal.py
+++ b/miplearn/components/primal.py
@@ -20,6 +20,9 @@ from miplearn.types import (
    Category,
    Solution,
 )
+from miplearn.features.sample import Hdf5Sample
+from p_tqdm import p_map
+from tqdm.auto import tqdm

 logger = logging.getLogger(__name__)

@@ -41,7 +44,7 @@ class PrimalSolutionComponent(Component):
        self,
        classifier: Classifier = AdaptiveClassifier(),
        mode: str = "exact",
-        threshold: Threshold = MinPrecisionThreshold([0.98, 0.98]),
+        threshold: Threshold = MinPrecisionThreshold([0.99, 0.99]),
    ) -> None:
        assert isinstance(classifier, Classifier)
        assert isinstance(threshold, Threshold)
@@ -148,6 +151,7 @@ class PrimalSolutionComponent(Component):
        y: Dict = {}
        instance_features = sample.get_array("static_instance_features")
        mip_var_values = sample.get_array("mip_var_values")
+        lp_var_values = sample.get_array("lp_var_values")
        var_features = sample.get_array("lp_var_features")
        var_names = sample.get_array("static_var_names")
        var_types = sample.get_array("static_var_types")
@@ -176,6 +180,8 @@ class PrimalSolutionComponent(Component):
            # Features
            features = list(instance_features)
            features.extend(var_features[i])
+            if lp_var_values is not None:
+                features.extend(lp_var_values)
            x[category].append(features)

            # Labels
@@ -236,11 +242,100 @@ class PrimalSolutionComponent(Component):
        self,
        x: Dict[Category, np.ndarray],
        y: Dict[Category, np.ndarray],
+        progress: bool = False,
    ) -> None:
-        for category in x.keys():
+        for category in tqdm(x.keys(), desc="fit", disable=not progress):
            clf = self.classifier_prototype.clone()
            thr = self.threshold_prototype.clone()
            clf.fit(x[category], y[category])
            thr.fit(clf, x[category], y[category])
            self.classifiers[category] = clf
            self.thresholds[category] = thr
+
+    # ------------------------------------------------------------------------------------------------------------------
+    # NEW API
+    # ------------------------------------------------------------------------------------------------------------------
+
+    def fit(
+        self,
+        x: Dict[Category, np.ndarray],
+        y: Dict[Category, np.ndarray],
+        progress: bool = False,
+    ) -> None:
+        for category in tqdm(x.keys(), desc="fit", disable=not progress):
+            clf = self.classifier_prototype.clone()
+            thr = self.threshold_prototype.clone()
+            clf.fit(x[category], y[category])
+            thr.fit(clf, x[category], y[category])
+            self.classifiers[category] = clf
+            self.thresholds[category] = thr
+
+    def predict(self, x):
+        y_pred = {}
+        for category in x.keys():
+            assert category in self.classifiers, (
+                f"Classifier for category {category} has not been trained. "
+                f"Please call component.fit before component.predict."
+            )
+            xc = np.array(x[category])
+            proba = self.classifiers[category].predict_proba(xc)
+            thr = self.thresholds[category].predict(xc)
+            y_pred[category] = np.vstack(
+                [
+                    proba[:, 0] >= thr[0],
+                    proba[:, 1] >= thr[1],
+                ]
+            ).T
+        return y_pred
+
+    @staticmethod
+    def extract(
+        filenames: List[str],
+        progress: bool = False,
+    ):
+        x, y, cat = [], [], []
+
+        # Read data
+        for filename in tqdm(
+            filenames,
+            desc="extract (1/2)",
+            disable=not progress,
+        ):
+            with Hdf5Sample(filename, mode="r") as sample:
+                mip_var_values = sample.get_array("mip_var_values")
+                var_features = sample.get_array("lp_var_features")
+                var_types = sample.get_array("static_var_types")
+                var_categories = sample.get_array("static_var_categories")
+                assert var_features is not None
+                assert var_types is not None
+                assert var_categories is not None
+                x.append(var_features)
+                y.append([mip_var_values < 0.5, mip_var_values > 0.5])
+                cat.extend(var_categories)
+
+        # Convert to numpy arrays
+        x = np.vstack(x)
+        y = np.hstack(y).T
+        cat = np.array(cat)
+
+        # Sort data by categories
+        pi = np.argsort(cat, kind="stable")
+        x = x[pi]
+        y = y[pi]
+        cat = cat[pi]
+
+        # Split categories
+        x_dict = {}
+        y_dict = {}
+        start = 0
+        for end in tqdm(
+            range(len(cat) + 1),
+            desc="extract (2/2)",
+            disable=not progress,
+        ):
+            if (end >= len(cat)) or (cat[start] != cat[end]):
+                x_dict[cat[start]] = x[start:end, :]
+                y_dict[cat[start]] = y[start:end, :]
+                start = end
+
+        return x_dict, y_dict
--- a/miplearn/features/sample.py
+++ b/miplearn/features/sample.py
@@ -224,3 +224,9 @@ class Hdf5Sample(Sample):
            value, (bytes, bytearray)
        ), f"bytes expected; found: {value.__class__}"  # type: ignore
        self.put_array(key, np.frombuffer(value, dtype="uint8"))
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, type, value, traceback):
+        self.file.close()
--- a/miplearn/instance/picklegz.py
+++ b/miplearn/instance/picklegz.py
@@ -14,6 +14,8 @@ from overrides import overrides
 from miplearn.features.sample import Sample
 from miplearn.instance.base import Instance
 from miplearn.types import ConstraintName
+from tqdm.auto import tqdm
+from p_tqdm import p_umap

 if TYPE_CHECKING:
    from miplearn.solvers.learning import InternalSolver
@@ -155,13 +157,20 @@ def write_pickle_gz_multiple(objs: List[Any], dirname: str) -> None:
        write_pickle_gz(obj, f"{dirname}/{i:05d}.pkl.gz")


-def save(objs: List[Any], dirname: str) -> List[str]:
+def save(
+    objs: List[Any],
+    dirname: str,
+    progress: bool = False,
+    n_jobs: int = 1,
+) -> List[str]:
    """
    Saves the provided objects to gzipped pickled files. Files are named sequentially
    as `dirname/00000.pkl.gz`, `dirname/00001.pkl.gz`, etc.

    Parameters
    ----------
+    progress: bool
+        If True, show progress bar
    objs: List[any]
        List of files to save
    dirname: str
@@ -171,11 +180,12 @@ def save(objs: List[Any], dirname: str) -> List[str]:
    -------
    List containing the relative paths of the saved files.
    """
-    filenames = []
-    for (i, obj) in enumerate(objs):
-        filename = f"{dirname}/{i:05d}.pkl.gz"
-        filenames.append(filename)
+
+    def _process(obj, filename):
        write_pickle_gz(obj, filename)
+
+    filenames = [f"{dirname}/{i:05d}.pkl.gz" for i in range(len(objs))]
+    p_umap(_process, objs, filenames, num_cpus=n_jobs)
    return filenames


--- a/miplearn/problems/knapsack.py
+++ b/miplearn/problems/knapsack.py
@@ -57,8 +57,8 @@ class MultiKnapsackInstance(Instance):
        model = pe.ConcreteModel()
        model.x = pe.Var(range(self.n), domain=pe.Binary)
        model.OBJ = pe.Objective(
-            expr=sum(model.x[j] * self.prices[j] for j in range(self.n)),
-            sense=pe.maximize,
+            expr=sum(-model.x[j] * self.prices[j] for j in range(self.n)),
+            sense=pe.minimize,
        )
        model.eq_capacity = pe.ConstraintList()
        for i in range(self.m):
@@ -82,6 +82,7 @@ class MultiKnapsackGenerator:
        alpha: rv_frozen = uniform(loc=0.25, scale=0.0),
        fix_w: bool = False,
        w_jitter: rv_frozen = uniform(loc=1.0, scale=0.0),
+        p_jitter: rv_frozen = uniform(loc=1.0, scale=0.0),
        round: bool = True,
    ):
        """Initialize the problem generator.
@@ -165,6 +166,7 @@ class MultiKnapsackGenerator:
        self.K = K
        self.alpha = alpha
        self.w_jitter = w_jitter
+        self.p_jitter = p_jitter
        self.round = round
        self.fix_n: Optional[int] = None
        self.fix_m: Optional[int] = None
@@ -179,8 +181,8 @@ class MultiKnapsackGenerator:
            self.fix_u = self.u.rvs(self.fix_n)
            self.fix_K = self.K.rvs()

-    def generate(self, n_samples: int) -> List[MultiKnapsackInstance]:
-        def _sample() -> MultiKnapsackInstance:
+    def generate(self, n_samples: int) -> List[MultiKnapsackData]:
+        def _sample() -> MultiKnapsackData:
            if self.fix_w is not None:
                assert self.fix_m is not None
                assert self.fix_n is not None
@@ -199,12 +201,30 @@ class MultiKnapsackGenerator:
                K = self.K.rvs()
            w = w * np.array([self.w_jitter.rvs(n) for _ in range(m)])
            alpha = self.alpha.rvs(m)
-            p = np.array([w[:, j].sum() / m + K * u[j] for j in range(n)])
+            p = np.array(
+                [w[:, j].sum() / m + K * u[j] for j in range(n)]
+            ) * self.p_jitter.rvs(n)
            b = np.array([w[i, :].sum() * alpha[i] for i in range(m)])
            if self.round:
                p = p.round()
                b = b.round()
                w = w.round()
-            return MultiKnapsackInstance(p, b, w)
+            return MultiKnapsackData(p, b, w)

        return [_sample() for _ in range(n_samples)]
+
+
+def build_multiknapsack_model(data: MultiKnapsackData) -> pe.ConcreteModel:
+    model = pe.ConcreteModel()
+    m, n = data.weights.shape
+    model.x = pe.Var(range(n), domain=pe.Binary)
+    model.OBJ = pe.Objective(
+        expr=sum(-model.x[j] * data.prices[j] for j in range(n)),
+        sense=pe.minimize,
+    )
+    model.eq_capacity = pe.ConstraintList()
+    for i in range(m):
+        model.eq_capacity.add(
+            sum(model.x[j] * data.weights[i, j] for j in range(n)) <= data.capacities[i]
+        )
+    return model
--- a/miplearn/problems/stab.py
+++ b/miplearn/problems/stab.py
@@ -114,8 +114,8 @@ def build_stab_model(data: MaxWeightStableSetData) -> pe.ConcreteModel:
    nodes = list(data.graph.nodes)
    model.x = pe.Var(nodes, domain=pe.Binary)
    model.OBJ = pe.Objective(
-        expr=sum(model.x[v] * data.weights[v] for v in nodes),
-        sense=pe.maximize,
+        expr=sum(-model.x[v] * data.weights[v] for v in nodes),
+        sense=pe.minimize,
    )
    model.clique_eqs = pe.ConstraintList()
    for clique in nx.find_cliques(data.graph):
--- a/miplearn/solvers/learning.py
+++ b/miplearn/solvers/learning.py
@@ -22,34 +22,57 @@ from miplearn.instance.base import Instance
 from miplearn.solvers import _RedirectOutput
 from miplearn.solvers.internal import InternalSolver
 from miplearn.solvers.pyomo.gurobi import GurobiPyomoSolver
-from miplearn.types import LearningSolveStats
+from miplearn.types import LearningSolveStats, ConstraintName
 import gzip
 import pickle
 import miplearn
+import json
 from os.path import exists
+from os import remove
+import pyomo.environ as pe
+

 logger = logging.getLogger(__name__)


+class PyomoFindLazyCutCallbackHandler:
+    def __init__(self):
+        pass
+
+    def value(self, var):
+        return var.value
+
+
+class PyomoEnforceLazyCutsCallbackHandler:
+    def __init__(self, opt, model):
+        self.model = model
+        self.opt = opt
+        if not hasattr(model, "miplearn_lazy_cb"):
+            model.miplearn_lazy_cb = pe.ConstraintList()
+
+    def enforce(self, expr):
+        constr = self.model.miplearn_lazy_cb.add(expr=expr)
+        self.opt.add_constraint(constr)
+
+
 class FileInstanceWrapper(Instance):
    def __init__(
-        self,
-        data_filename: Any,
-        build_model: Callable,
+        self, data_filename: Any, build_model: Callable, mode: Optional[str] = None
    ):
        super().__init__()
        assert data_filename.endswith(".pkl.gz")
        self.filename = data_filename
        self.sample_filename = data_filename.replace(".pkl.gz", ".h5")
-        self.sample = Hdf5Sample(
-            self.sample_filename,
-            mode="r+" if exists(self.sample_filename) else "w",
-        )
        self.build_model = build_model
+        self.mode = mode
+        self.sample = None
+        self.model = None

    @overrides
    def to_model(self) -> Any:
-        return miplearn.load(self.filename, self.build_model)
+        if self.model is None:
+            self.model = miplearn.load(self.filename, self.build_model)
+        return self.model

    @overrides
    def create_sample(self) -> Sample:
@@ -59,6 +82,44 @@ class FileInstanceWrapper(Instance):
    def get_samples(self) -> List[Sample]:
        return [self.sample]

+    @overrides
+    def free(self) -> None:
+        self.sample.file.close()
+
+    @overrides
+    def load(self) -> None:
+        if self.mode is None:
+            self.mode = "r+" if exists(self.sample_filename) else "w"
+        self.sample = Hdf5Sample(self.sample_filename, mode=self.mode)
+
+    @overrides
+    def has_dynamic_lazy_constraints(self) -> bool:
+        assert hasattr(self, "model")
+        return hasattr(self.model, "_miplearn_find_lazy_cuts")
+
+    @overrides
+    def find_violated_lazy_constraints(
+        self,
+        solver: "InternalSolver",
+        model: Any,
+    ) -> Dict[ConstraintName, Any]:
+        if not hasattr(self.model, "_miplearn_find_lazy_cuts"):
+            return {}
+        cb = PyomoFindLazyCutCallbackHandler()
+        violations = model._miplearn_find_lazy_cuts(cb)
+        return {json.dumps(v).encode(): v for v in violations}
+
+    @overrides
+    def enforce_lazy_constraint(
+        self,
+        solver: "InternalSolver",
+        model: Any,
+        violation: Any,
+    ) -> None:
+        assert isinstance(solver, GurobiPyomoSolver)
+        cb = PyomoEnforceLazyCutsCallbackHandler(solver._pyomo_solver, model)
+        model._miplearn_enforce_lazy_cuts(cb, violation)
+

 class MemoryInstanceWrapper(Instance):
    def __init__(self, model: Any) -> None:
@@ -70,12 +131,39 @@ class MemoryInstanceWrapper(Instance):
    def to_model(self) -> Any:
        return self.model

+    @overrides
+    def has_dynamic_lazy_constraints(self) -> bool:
+        assert hasattr(self, "model")
+        return hasattr(self.model, "_miplearn_find_lazy_cuts")
+
+    @overrides
+    def find_violated_lazy_constraints(
+        self,
+        solver: "InternalSolver",
+        model: Any,
+    ) -> Dict[ConstraintName, Any]:
+        cb = PyomoFindLazyCutCallbackHandler()
+        violations = model._miplearn_find_lazy_cuts(cb)
+        return {json.dumps(v).encode(): v for v in violations}
+
+    @overrides
+    def enforce_lazy_constraint(
+        self,
+        solver: "InternalSolver",
+        model: Any,
+        violation: Any,
+    ) -> None:
+        assert isinstance(solver, GurobiPyomoSolver)
+        cb = PyomoEnforceLazyCutsCallbackHandler(solver._pyomo_solver, model)
+        model._miplearn_enforce_lazy_cuts(cb, violation)
+

 class _GlobalVariables:
    def __init__(self) -> None:
        self.solver: Optional[LearningSolver] = None
-        self.instances: Optional[List[Instance]] = None
-        self.discard_outputs: bool = False
+        self.build_model: Optional[Callable] = None
+        self.filenames: Optional[List[str]] = None
+        self.skip = False


 # Global variables used for multiprocessing. Global variables are copied by the
@@ -86,23 +174,19 @@ _GLOBAL = [_GlobalVariables()]

 def _parallel_solve(
    idx: int,
-) -> Tuple[Optional[int], Optional[LearningSolveStats], Optional[Instance]]:
+) -> Tuple[Optional[int], Optional[LearningSolveStats]]:
    solver = _GLOBAL[0].solver
-    instances = _GLOBAL[0].instances
-    discard_outputs = _GLOBAL[0].discard_outputs
+    filenames = _GLOBAL[0].filenames
+    build_model = _GLOBAL[0].build_model
+    skip = _GLOBAL[0].skip
    assert solver is not None
-    assert instances is not None
    try:
-        stats = solver._solve(
-            instances[idx],
-            discard_output=discard_outputs,
-        )
-        instances[idx].free()
-        return idx, stats, instances[idx]
+        stats = solver.solve([filenames[idx]], build_model, skip=skip)
+        return idx, stats[0]
    except Exception as e:
        traceback.print_exc()
-        logger.exception(f"Exception while solving {instances[idx]}. Ignoring.")
-        return None, None, None
+        logger.exception(f"Exception while solving {filenames[idx]}. Ignoring.")
+        return idx, None


 class LearningSolver:
@@ -380,85 +464,84 @@ class LearningSolver:
        build_model: Optional[Callable] = None,
        tee: bool = False,
        progress: bool = False,
+        skip: bool = False,
    ) -> Union[LearningSolveStats, List[LearningSolveStats]]:
        if isinstance(arg, list):
            assert build_model is not None
            stats = []
            for i in tqdm(arg, disable=not progress):
-                s = self._solve(FileInstanceWrapper(i, build_model), tee=tee)
+                instance = FileInstanceWrapper(i, build_model)
+                solved = False
+                if exists(instance.sample_filename):
+                    try:
+                        with Hdf5Sample(instance.sample_filename, mode="r") as sample:
+                            if sample.get_scalar("mip_lower_bound"):
+                                solved = True
+                    except OSError:
+                        # File exists but it is unreadable/corrupted. Delete it.
+                        remove(instance.sample_filename)
+                if solved and skip:
+                    stats.append({})
+                else:
+                    s = self._solve(instance, tee=tee)
+
+                    # Export to gzipped MPS file
+                    mps_filename = instance.sample_filename.replace(".h5", ".mps")
+                    instance.model.write(
+                        filename=mps_filename,
+                        io_options={
+                            "labeler": pe.NameLabeler(),
+                            "skip_objective_sense": True,
+                        },
+                    )
+                    with open(mps_filename, "rb") as original:
+                        with gzip.open(f"{mps_filename}.gz", "wb") as compressed:
+                            compressed.writelines(original)
+                    remove(mps_filename)
+
                    stats.append(s)
            return stats
        else:
            return self._solve(MemoryInstanceWrapper(arg), tee=tee)

-    def fit(self, filenames: List[str], build_model: Callable) -> None:
+    def fit(
+        self,
+        filenames: List[str],
+        build_model: Callable,
+        progress: bool = False,
+        n_jobs: int = 1,
+    ) -> None:
        instances: List[Instance] = [
-            FileInstanceWrapper(f, build_model) for f in filenames
+            FileInstanceWrapper(f, build_model, mode="r") for f in filenames
        ]
-        self._fit(instances)
+        self._fit(instances, progress=progress, n_jobs=n_jobs)

    def parallel_solve(
        self,
-        instances: List[Instance],
+        filenames: List[str],
+        build_model: Optional[Callable] = None,
        n_jobs: int = 4,
-        label: str = "solve",
-        discard_outputs: bool = False,
        progress: bool = False,
+        label: str = "solve",
+        skip: bool = False,
    ) -> List[LearningSolveStats]:
-        """
-        Solves multiple instances in parallel.
-
-        This method is equivalent to calling `solve` for each item on the list,
-        but it processes multiple instances at the same time. Like `solve`, this
-        method modifies each instance in place. Also like `solve`, a list of
-        filenames may be provided.
-
-        Parameters
-        ----------
-        discard_outputs: bool
-            If True, do not write the modified instances anywhere; simply discard
-            them instead. Useful during benchmarking.
-        label: str
-            Label to show in the progress bar.
-        instances: List[Instance]
-            The instances to be solved.
-        n_jobs: int
-            Number of instances to solve in parallel at a time.
-
-        Returns
-        -------
-        List[LearningSolveStats]
-            List of solver statistics, with one entry for each provided instance.
-            The list is the same you would obtain by calling
-            `[solver.solve(p) for p in instances]`
-        """
-        if n_jobs == 1:
-            return [
-                self._solve(p)
-                for p in tqdm(
-                    instances,
-                    disable=not progress,
-                    desc=label,
-                )
-            ]
-        else:
        self.internal_solver = None
        self._silence_miplearn_logger()
        _GLOBAL[0].solver = self
-            _GLOBAL[0].instances = instances
-            _GLOBAL[0].discard_outputs = discard_outputs
+        _GLOBAL[0].build_model = build_model
+        _GLOBAL[0].filenames = filenames
+        _GLOBAL[0].skip = skip
        results = p_umap(
            _parallel_solve,
-                list(range(len(instances))),
+            list(range(len(filenames))),
            num_cpus=n_jobs,
-                desc=label,
            disable=not progress,
+            desc=label,
        )
-            results = [r for r in results if r[1]]
-            stats: List[LearningSolveStats] = [{} for _ in range(len(results))]
-            for (idx, s, instance) in results:
+        stats: List[LearningSolveStats] = [{} for _ in range(len(filenames))]
+        for (idx, s) in results:
+            if s:
                stats[idx] = s
-                instances[idx] = instance
        self._restore_miplearn_logger()
        return stats

--- a/miplearn/solvers/pyomo/base.py
+++ b/miplearn/solvers/pyomo/base.py
@@ -60,6 +60,7 @@ class BasePyomoSolver(InternalSolver):
        self._obj_sense: str = "min"
        self._varname_to_var: Dict[bytes, pe.Var] = {}
        self._varname_to_idx: Dict[str, int] = {}
+        self._name_buffer = {}
        self._cname_to_constr: Dict[str, pe.Constraint] = {}
        self._termination_condition: str = ""
        self._has_lp_solution = False
@@ -203,12 +204,12 @@ class BasePyomoSolver(InternalSolver):
                            if isinstance(term, MonomialTermExpression):
                                lhs_row.append(row)
                                lhs_col.append(
-                                    self._varname_to_idx[term._args_[1].name]
+                                    self._varname_to_idx[self.name(term._args_[1])]
                                )
                                lhs_data.append(float(term._args_[0]))
                            elif isinstance(term, _GeneralVarData):
                                lhs_row.append(row)
-                                lhs_col.append(self._varname_to_idx[term.name])
+                                lhs_col.append(self._varname_to_idx[self.name(term)])
                                lhs_data.append(1.0)
                            else:
                                raise Exception(
@@ -216,7 +217,7 @@ class BasePyomoSolver(InternalSolver):
                                )
                    elif isinstance(expr, _GeneralVarData):
                        lhs_row.append(row)
-                        lhs_col.append(self._varname_to_idx[expr.name])
+                        lhs_col.append(self._varname_to_idx[self.name(expr)])
                        lhs_data.append(1.0)
                    else:
                        raise Exception(
@@ -235,11 +236,11 @@ class BasePyomoSolver(InternalSolver):
        for (i, constr) in enumerate(model.component_objects(pyomo.core.Constraint)):
            if isinstance(constr, pe.ConstraintList):
                for idx in constr:
-                    names.append(constr[idx].name)
+                    names.append(self.name(constr[idx]))
                    _parse_constraint(constr[idx], curr_row)
                    curr_row += 1
            else:
-                names.append(constr.name)
+                names.append(self.name(constr))
                _parse_constraint(constr, curr_row)
                curr_row += 1

@@ -276,7 +277,7 @@ class BasePyomoSolver(InternalSolver):
            for index in var:
                if var[index].fixed:
                    continue
-                solution[var[index].name.encode()] = var[index].value
+                solution[self.name(var[index]).encode()] = var[index].value
        return solution

    @overrides
@@ -301,9 +302,9 @@ class BasePyomoSolver(InternalSolver):

                # Variable name
                if idx is None:
-                    names.append(var.name)
+                    names.append(self.name(var))
                else:
-                    names.append(var[idx].name)
+                    names.append(self.name(var[idx]))

                if with_static:
                    # Variable type
@@ -332,8 +333,9 @@ class BasePyomoSolver(InternalSolver):
                        lower_bounds.append(-float("inf"))

                    # Objective coefficient
-                    if v.name in self._obj:
-                        obj_coeffs.append(self._obj[v.name])
+                    name = self.name(v)
+                    if name in self._obj:
+                        obj_coeffs.append(self._obj[name])
                    else:
                        obj_coeffs.append(0.0)

@@ -544,13 +546,13 @@ class BasePyomoSolver(InternalSolver):
        if isinstance(expr, SumExpression):
            for term in expr._args_:
                if isinstance(term, MonomialTermExpression):
-                    lhs[term._args_[1].name] = float(term._args_[0])
+                    lhs[self.name(term._args_[1])] = float(term._args_[0])
                elif isinstance(term, _GeneralVarData):
-                    lhs[term.name] = 1.0
+                    lhs[self.name(term)] = 1.0
                else:
                    raise Exception(f"Unknown term type: {term.__class__.__name__}")
        elif isinstance(expr, _GeneralVarData):
-            lhs[expr.name] = 1.0
+            lhs[self.name(expr)] = 1.0
        else:
            raise Exception(f"Unknown expression type: {expr.__class__.__name__}")
        return lhs
@@ -581,9 +583,9 @@ class BasePyomoSolver(InternalSolver):
        self._varname_to_idx = {}
        for var in self.model.component_objects(Var):
            for idx in var:
-                varname = var.name
+                varname = self.name(var)
                if idx is not None:
-                    varname = var[idx].name
+                    varname = self.name(var[idx])
                self._varname_to_var[varname.encode()] = var[idx]
                self._varname_to_idx[varname] = len(self._all_vars)
                self._all_vars += [var[idx]]
@@ -599,9 +601,12 @@ class BasePyomoSolver(InternalSolver):
        for constr in self.model.component_objects(pyomo.core.Constraint):
            if isinstance(constr, pe.ConstraintList):
                for idx in constr:
-                    self._cname_to_constr[constr[idx].name] = constr[idx]
+                    self._cname_to_constr[self.name(constr[idx])] = constr[idx]
            else:
-                self._cname_to_constr[constr.name] = constr
+                self._cname_to_constr[self.name(constr)] = constr
+
+    def name(self, comp):
+        return comp.getname(name_buffer=self._name_buffer)


 class PyomoTestInstanceInfeasible(Instance):
--- a/miplearn/solvers/pyomo/gurobi.py
+++ b/miplearn/solvers/pyomo/gurobi.py
@@ -52,3 +52,10 @@ class GurobiPyomoSolver(BasePyomoSolver):
    @overrides
    def _get_node_count_regexp(self) -> Optional[str]:
        return None
+
+    def set_priorities(self, priorities):
+        for (var_name, priority) in priorities.items():
+            pvar = self._varname_to_var[var_name]
+            gvar = self._pyomo_solver._pyomo_var_to_solver_var_map[pvar]
+            gvar.branchPriority = priority
+        return None
--- a/setup.py
+++ b/setup.py
@@ -20,7 +20,7 @@ setup(
    python_requires=">=3.7",
    install_requires=[
        "decorator>=4,<5",
-        "h5py>=3,<4",
+        "h5py==3.5.0",
        "matplotlib>=3,<4",
        "mypy==0.790",
        "networkx>=2,<3",