From 08e808690ee571de1c00bb556eceb16d7a357022 Mon Sep 17 00:00:00 2001
From: "Alinson S. Xavier" <git@axavier.org>
Date: Sun, 4 Apr 2021 14:48:46 -0500
Subject: [PATCH] Replace InstanceIterator by PickleGzInstance

---
 benchmark/benchmark.py                      |  30 ++--
 docs/usage.md                               |  43 ++----
 miplearn/__init__.py                        |   8 +-
 miplearn/benchmark.py                       |  13 +-
 miplearn/components/component.py            |  14 +-
 miplearn/components/lazy_dynamic.py         |   4 +-
 miplearn/components/objective.py            |  10 +-
 miplearn/components/steps/convert_tight.py  |   5 +-
 miplearn/components/steps/drop_redundant.py |  40 +++---
 miplearn/extractors.py                      |  39 +-----
 miplearn/instance.py                        | 148 ++++++++++++++++++--
 miplearn/solvers/learning.py                |  88 +++---------
 tests/solvers/test_learning_solver.py       |  50 ++-----
 tests/test_instance.py                      |  16 +++
 14 files changed, 252 insertions(+), 256 deletions(-)
 create mode 100644 tests/test_instance.py
diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py
index de5a619..7f2881c 100755
--- a/benchmark/benchmark.py
+++ b/benchmark/benchmark.py
@@ -40,6 +40,8 @@ from miplearn import (
     BenchmarkRunner,
     GurobiPyomoSolver,
     setup_logger,
+    PickleGzInstance,
+    write_pickle_gz_multiple,
 )
 
 setup_logger()
@@ -48,24 +50,6 @@ logging.getLogger("pyomo.core").setLevel(logging.ERROR)
 logger = logging.getLogger("benchmark")
 
 
-def write_pickle_gz(obj, filename):
-    logger.info(f"Writing: {filename}")
-    os.makedirs(os.path.dirname(filename), exist_ok=True)
-    with gzip.GzipFile(filename, "wb") as file:
-        pickle.dump(obj, file)
-
-
-def read_pickle_gz(filename):
-    logger.info(f"Reading: {filename}")
-    with gzip.GzipFile(filename, "rb") as file:
-        return pickle.load(file)
-
-
-def write_pickle_gz_multiple(objs, dirname):
-    for (i, obj) in enumerate(objs):
-        write_pickle_gz(obj, f"{dirname}/{i:05d}.pkl.gz")
-
-
 def train(args):
     basepath = args["<challenge>"]
     problem_name, challenge_name = args["<challenge>"].split("/")
@@ -78,7 +62,9 @@ def train(args):
 
     done_filename = f"{basepath}/train/done"
     if not os.path.isfile(done_filename):
-        train_instances = glob.glob(f"{basepath}/train/*.gz")
+        train_instances = [
+            PickleGzInstance(f) for f in glob.glob(f"{basepath}/train/*.gz")
+        ]
         solver = LearningSolver(
             solver=lambda: GurobiPyomoSolver(
                 params={
@@ -96,7 +82,7 @@ def train(args):
 
 def test_baseline(args):
     basepath = args["<challenge>"]
-    test_instances = glob.glob(f"{basepath}/test/*.gz")
+    test_instances = [PickleGzInstance(f) for f in glob.glob(f"{basepath}/test/*.gz")]
     csv_filename = f"{basepath}/benchmark_baseline.csv"
     if not os.path.isfile(csv_filename):
         solvers = {
@@ -119,8 +105,8 @@ def test_baseline(args):
 
 def test_ml(args):
     basepath = args["<challenge>"]
-    test_instances = glob.glob(f"{basepath}/test/*.gz")
-    train_instances = glob.glob(f"{basepath}/train/*.gz")
+    test_instances = [PickleGzInstance(f) for f in glob.glob(f"{basepath}/test/*.gz")]
+    train_instances = [PickleGzInstance(f) for f in glob.glob(f"{basepath}/train/*.gz")]
     csv_filename = f"{basepath}/benchmark_ml.csv"
     if not os.path.isfile(csv_filename):
         solvers = {
diff --git a/docs/usage.md b/docs/usage.md
index 750d1d2..93dab8c 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -119,11 +119,10 @@ For more significant performance benefits, `LearningSolver` can also be configur
 
 ### 6.1 Saving and loading solver state
 
-After solving a large number of training instances, it may be desirable to save the current state of `LearningSolver` to disk, so that the solver can still use the acquired knowledge after the application restarts. This can be accomplished by using the standard `pickle` module, as the following example illustrates:
+After solving a large number of training instances, it may be desirable to save the current state of `LearningSolver` to disk, so that the solver can still use the acquired knowledge after the application restarts. This can be accomplished by using the utility functions `write_pickle_gz` and `read_pickle_gz`, as the following example illustrates:
 
 ```python
-from miplearn import LearningSolver
-import pickle
+from miplearn import LearningSolver, write_pickle_gz, read_pickle_gz
 
 # Solve training instances
 training_instances = [...]
@@ -135,14 +134,12 @@ for instance in training_instances:
 solver.fit(training_instances)
 
 # Save trained solver to disk
-with open("solver.pickle", "wb") as file:
-    pickle.dump(solver, file)
+write_pickle_gz(solver, "solver.pkl.gz")
 
 # Application restarts...
 
 # Load trained solver from disk
-with open("solver.pickle", "rb") as file:
-    solver = pickle.load(file)
+solver = read_pickle_gz("solver.pkl.gz")
 
 # Solve additional instances
 test_instances = [...]
@@ -171,23 +168,24 @@ solver.parallel_solve(test_instances)
 
 ### 6.3 Solving instances from the disk
 
-In all examples above, we have assumed that instances are available as Python objects, stored in memory. When problem instances are very large, or when there is a large number of problem instances, this approach may require an excessive amount of memory. To reduce memory requirements, MIPLearn can also operate on instances that are stored on disk. More precisely, the methods `fit`, `solve` and `parallel_solve` in `LearningSolver` can operate on filenames (or lists of filenames) instead of instance objects, as the next example illustrates.
-Instance files must be pickled instance objects. The method `solve` loads at most one instance to memory at a time, while `parallel_solve` loads at most `n_jobs` instances.
-
+In all examples above, we have assumed that instances are available as Python objects, stored in memory. When problem instances are very large, or when there is a large number of problem instances, this approach may require an excessive amount of memory. To reduce memory requirements, MIPLearn can also operate on instances that are stored on disk, through the `PickleGzInstance` class, as the next example illustrates.
 
 ```python
 import pickle
-from miplearn import LearningSolver
+from miplearn import (
+    LearningSolver,
+    PickleGzInstance,
+    write_pickle_gz,
+)
 
 # Construct and pickle 600 problem instances
 for i in range(600):
     instance = MyProblemInstance([...])
-    with open("instance_%03d.pkl" % i, "w") as file:
-        pickle.dump(instance, obj)
+    write_pickle_gz(instance, "instance_%03d.pkl" % i)
         
 # Split instances into training and test
-test_instances  = ["instance_%03d.pkl" % i for i in range(500)]
-train_instances = ["instance_%03d.pkl" % i for i in range(500, 600)]
+test_instances  = [PickleGzInstance("instance_%03d.pkl" % i) for i in range(500)]
+train_instances = [PickleGzInstance("instance_%03d.pkl" % i) for i in range(500, 600)]
 
 # Create solver
 solver = LearningSolver([...])
@@ -203,20 +201,7 @@ solver.parallel_solve(test_instances, n_jobs=4)
 ```
 
 
-By default, `solve` and `parallel_solve` modify files in place. That is, after the instances are loaded from disk and solved, MIPLearn writes them back to the disk, overwriting the original files. To write to an alternative file instead, use the arguments `output_filename` (in `solve`) and `output_filenames` (in `parallel_solve`). To discard the modifications instead, use `discard_outputs=True`. This can be useful, for example, during benchmarks.
-
-```python
-# Solve a single instance file and write the output to another file
-solver.solve("knapsack_1.orig.pkl", output_filename="knapsack_1.solved.pkl")
-
-# Solve a list of instance files
-instances = ["knapsack_%03d.orig.pkl" % i for i in range(100)]
-output = ["knapsack_%03d.solved.pkl" % i for i in range(100)]
-solver.parallel_solve(instances, output_filenames=output)
-
-# Solve instances and discard solutions and training data
-solver.parallel_solve(instances, discard_outputs=True)
-```
+By default, `solve` and `parallel_solve` modify files in place. That is, after the instances are loaded from disk and solved, MIPLearn writes them back to the disk, overwriting the original files. To discard the modifications instead, use the arguments `discard_output=True` (in `solve`) or `discard_outputs=True` (in `parallel_solve`). This can be useful, for example, during benchmarks.
 
 ## 7. Running benchmarks
 
diff --git a/miplearn/__init__.py b/miplearn/__init__.py
index 351d6f7..5964a12 100644
--- a/miplearn/__init__.py
+++ b/miplearn/__init__.py
@@ -23,7 +23,13 @@ from .components.steps.convert_tight import ConvertTightIneqsIntoEqsStep
 from .components.steps.drop_redundant import DropRedundantInequalitiesStep
 from .components.steps.relax_integrality import RelaxIntegralityStep
 from .extractors import InstanceFeaturesExtractor
-from .instance import Instance
+from .instance import (
+    Instance,
+    PickleGzInstance,
+    write_pickle_gz,
+    write_pickle_gz_multiple,
+    read_pickle_gz,
+)
 from .log import setup_logger
 from .solvers.gurobi import GurobiSolver
 from .solvers.internal import InternalSolver
diff --git a/miplearn/benchmark.py b/miplearn/benchmark.py
index b4ab2b3..20ff04e 100644
--- a/miplearn/benchmark.py
+++ b/miplearn/benchmark.py
@@ -52,7 +52,7 @@ class BenchmarkRunner:
 
     def parallel_solve(
         self,
-        instances: Union[List[str], List[Instance]],
+        instances: List[Instance],
         n_jobs: int = 1,
         n_trials: int = 3,
     ) -> None:
@@ -61,7 +61,7 @@ class BenchmarkRunner:
 
         Parameters
         ----------
-        instances: Union[List[str], List[Instance]]
+        instances: List[Instance]
             List of instances to solve. This can either be a list of instances
             already loaded in memory, or a list of filenames pointing to pickled (and
             optionally gzipped) files.
@@ -98,17 +98,14 @@ class BenchmarkRunner:
         os.makedirs(os.path.dirname(filename), exist_ok=True)
         self.results.to_csv(filename)
 
-    def fit(self, instances: Union[List[str], List[Instance]]) -> None:
+    def fit(self, instances: List[Instance]) -> None:
         """
         Trains all solvers with the provided training instances.
 
         Parameters
         ----------
-        instances:  Union[List[str], List[Instance]]
-            List of training instances. This can either be a list of instances
-            already loaded in memory, or a list of filenames pointing to pickled (and
-            optionally gzipped) files.
-
+        instances:  List[Instance]
+            List of training instances.
         """
         for (solver_name, solver) in self.solvers.items():
             logger.debug(f"Fitting {solver_name}...")
diff --git a/miplearn/components/component.py b/miplearn/components/component.py
index 5a6b991..a182a44 100644
--- a/miplearn/components/component.py
+++ b/miplearn/components/component.py
@@ -2,10 +2,10 @@
 #  Copyright (C) 2020, UChicago Argonne, LLC. All rights reserved.
 #  Released under the modified BSD license. See COPYING.md for more details.
 
+from typing import Any, List, TYPE_CHECKING, Tuple, Dict, Hashable
+
 import numpy as np
-from typing import Any, List, Union, TYPE_CHECKING, Tuple, Dict, Optional, Hashable
 
-from miplearn.extractors import InstanceIterator
 from miplearn.instance import Instance
 from miplearn.types import LearningSolveStats, TrainingSample, Features
 
@@ -120,11 +120,11 @@ class Component:
 
     def xy_instances(
         self,
-        instances: Union[List[str], List[Instance]],
+        instances: List[Instance],
     ) -> Tuple[Dict, Dict]:
         x_combined: Dict = {}
         y_combined: Dict = {}
-        for instance in InstanceIterator(instances):
+        for instance in instances:
             assert isinstance(instance, Instance)
             for sample in instance.training_data:
                 xy = self.sample_xy(instance.features, sample)
@@ -141,7 +141,7 @@ class Component:
 
     def fit(
         self,
-        training_instances: Union[List[str], List[Instance]],
+        training_instances: List[Instance],
     ) -> None:
         x, y = self.xy_instances(training_instances)
         for cat in x.keys():
@@ -198,9 +198,9 @@ class Component:
     ) -> None:
         return
 
-    def evaluate(self, instances: Union[List[str], List[Instance]]) -> List:
+    def evaluate(self, instances: List[Instance]) -> List:
         ev = []
-        for instance in InstanceIterator(instances):
+        for instance in instances:
             for sample in instance.training_data:
                 ev += [self.sample_evaluate(instance.features, sample)]
         return ev
diff --git a/miplearn/components/lazy_dynamic.py b/miplearn/components/lazy_dynamic.py
index 252ca5c..ddefb23 100644
--- a/miplearn/components/lazy_dynamic.py
+++ b/miplearn/components/lazy_dynamic.py
@@ -13,7 +13,7 @@ from miplearn.classifiers import Classifier
 from miplearn.classifiers.counting import CountingClassifier
 from miplearn.components import classifier_evaluation_dict
 from miplearn.components.component import Component
-from miplearn.extractors import InstanceFeaturesExtractor, InstanceIterator
+from miplearn.extractors import InstanceFeaturesExtractor
 
 logger = logging.getLogger(__name__)
 
@@ -68,7 +68,7 @@ class DynamicLazyConstraintsComponent(Component):
 
         self.classifiers = {}
         violation_to_instance_idx = {}
-        for (idx, instance) in enumerate(InstanceIterator(training_instances)):
+        for (idx, instance) in enumerate(training_instances):
             for v in instance.found_violated_lazy_constraints:
                 if isinstance(v, list):
                     v = tuple(v)
diff --git a/miplearn/components/objective.py b/miplearn/components/objective.py
index c6cf984..5aec236 100644
--- a/miplearn/components/objective.py
+++ b/miplearn/components/objective.py
@@ -3,22 +3,14 @@
 #  Released under the modified BSD license. See COPYING.md for more details.
 
 import logging
-from typing import List, Dict, Union, Optional, Any, TYPE_CHECKING, Tuple, Hashable
+from typing import List, Dict, Any, TYPE_CHECKING, Tuple, Hashable
 
 import numpy as np
 from sklearn.linear_model import LinearRegression
-from sklearn.metrics import (
-    mean_squared_error,
-    explained_variance_score,
-    max_error,
-    mean_absolute_error,
-    r2_score,
-)
 
 from miplearn.classifiers import Regressor
 from miplearn.classifiers.sklearn import ScikitLearnRegressor
 from miplearn.components.component import Component
-from miplearn.extractors import InstanceIterator
 from miplearn.instance import Instance
 from miplearn.types import TrainingSample, LearningSolveStats, Features
 
diff --git a/miplearn/components/steps/convert_tight.py b/miplearn/components/steps/convert_tight.py
index f052519..34b010a 100644
--- a/miplearn/components/steps/convert_tight.py
+++ b/miplearn/components/steps/convert_tight.py
@@ -13,7 +13,6 @@ from miplearn.classifiers.counting import CountingClassifier
 from miplearn.components import classifier_evaluation_dict
 from miplearn.components.component import Component
 from miplearn.components.steps.drop_redundant import DropRedundantInequalitiesStep
-from miplearn.extractors import InstanceIterator
 
 logger = logging.getLogger(__name__)
 
@@ -116,7 +115,7 @@ class ConvertTightIneqsIntoEqsStep(Component):
     def _x_train(instances):
         x = {}
         for instance in tqdm(
-            InstanceIterator(instances),
+            instances,
             desc="Extract (drop:x)",
             disable=len(instances) < 5,
         ):
@@ -139,7 +138,7 @@ class ConvertTightIneqsIntoEqsStep(Component):
     def y(self, instances):
         y = {}
         for instance in tqdm(
-            InstanceIterator(instances),
+            instances,
             desc="Extract (rlx:conv_ineqs:y)",
             disable=len(instances) < 5,
         ):
diff --git a/miplearn/components/steps/drop_redundant.py b/miplearn/components/steps/drop_redundant.py
index a41e7c7..b3bfe7e 100644
--- a/miplearn/components/steps/drop_redundant.py
+++ b/miplearn/components/steps/drop_redundant.py
@@ -6,14 +6,13 @@ import logging
 from copy import deepcopy
 
 import numpy as np
-from tqdm import tqdm
 from p_tqdm import p_umap
+from tqdm import tqdm
 
 from miplearn.classifiers.counting import CountingClassifier
 from miplearn.components import classifier_evaluation_dict
 from miplearn.components.component import Component
 from miplearn.components.lazy_static import LazyConstraint
-from miplearn.extractors import InstanceIterator
 
 logger = logging.getLogger(__name__)
 
@@ -131,31 +130,24 @@ class DropRedundantInequalitiesStep(Component):
         def _extract(instance):
             x = {}
             y = {}
-            for instance in InstanceIterator([instance]):
-                for training_data in instance.training_data:
-                    for (cid, slack) in training_data["slacks"].items():
-                        category = instance.get_constraint_category(cid)
-                        if category is None:
-                            continue
-                        if category not in x:
-                            x[category] = []
-                        if category not in y:
-                            y[category] = []
-                        if slack > self.slack_tolerance:
-                            y[category] += [[False, True]]
-                        else:
-                            y[category] += [[True, False]]
-                        x[category] += [instance.get_constraint_features(cid)]
+            for training_data in instance.training_data:
+                for (cid, slack) in training_data["slacks"].items():
+                    category = instance.get_constraint_category(cid)
+                    if category is None:
+                        continue
+                    if category not in x:
+                        x[category] = []
+                    if category not in y:
+                        y[category] = []
+                    if slack > self.slack_tolerance:
+                        y[category] += [[False, True]]
+                    else:
+                        y[category] += [[True, False]]
+                    x[category] += [instance.get_constraint_features(cid)]
             return x, y
 
         if n_jobs == 1:
-            results = [
-                _extract(i)
-                for i in tqdm(
-                    instances,
-                    desc="Extract (drop 1/3)",
-                )
-            ]
+            results = [_extract(i) for i in tqdm(instances, desc="Extract (drop 1/3)")]
         else:
             results = p_umap(
                 _extract,
diff --git a/miplearn/extractors.py b/miplearn/extractors.py
index 4e2c755..350e5ca 100644
--- a/miplearn/extractors.py
+++ b/miplearn/extractors.py
@@ -2,51 +2,14 @@
 #  Copyright (C) 2020, UChicago Argonne, LLC. All rights reserved.
 #  Released under the modified BSD license. See COPYING.md for more details.
 
-import gzip
 import logging
-import pickle
 from abc import ABC, abstractmethod
-from typing import List, Union, cast, IO
 
 import numpy as np
-from tqdm.auto import tqdm
-
-from miplearn.instance import Instance
 
 logger = logging.getLogger(__name__)
 
 
-class InstanceIterator:
-    def __init__(
-        self,
-        instances: Union[List[str], List[Instance]],
-    ) -> None:
-        self.instances = instances
-        self.current = 0
-
-    def __iter__(self):
-        return self
-
-    def __next__(self) -> Instance:
-        if self.current >= len(self.instances):
-            raise StopIteration
-        result = self.instances[self.current]
-        self.current += 1
-        if isinstance(result, str):
-            logger.debug("Read: %s" % result)
-            try:
-                if result.endswith(".gz"):
-                    with gzip.GzipFile(result, "rb") as gzfile:
-                        result = pickle.load(cast(IO[bytes], gzfile))
-                else:
-                    with open(result, "rb") as file:
-                        result = pickle.load(cast(IO[bytes], file))
-            except pickle.UnpicklingError:
-                raise Exception(f"Invalid instance file: {result}")
-        assert isinstance(result, Instance)
-        return result
-
-
 class Extractor(ABC):
     @abstractmethod
     def extract(self, instances):
@@ -77,6 +40,6 @@ class InstanceFeaturesExtractor(Extractor):
                         instance.training_data[0]["LP value"],
                     ]
                 )
-                for instance in InstanceIterator(instances)
+                for instance in instances
             ]
         )
diff --git a/miplearn/instance.py b/miplearn/instance.py
index b327f97..47a4867 100644
--- a/miplearn/instance.py
+++ b/miplearn/instance.py
@@ -3,14 +3,34 @@
 #  Released under the modified BSD license. See COPYING.md for more details.
 
 import gzip
-import json
+import logging
+import os
+import pickle
 from abc import ABC, abstractmethod
-from typing import Any, List, Optional, Hashable
-
-import numpy as np
+from typing import Any, List, Optional, Hashable, IO, cast
 
 from miplearn.types import TrainingSample, VarIndex, Features
 
+logger = logging.getLogger(__name__)
+
+
+def write_pickle_gz(obj: Any, filename: str) -> None:  # gzip-pickle obj into filename
+    logger.info(f"Writing: {filename}")
+    os.makedirs(os.path.dirname(filename) or ".", exist_ok=True)  # bare name -> cwd
+    with gzip.GzipFile(filename, "wb") as file:
+        pickle.dump(obj, cast(IO[bytes], file))
+
+
+def read_pickle_gz(filename: str) -> Any:  # inverse of write_pickle_gz
+    logger.info(f"Reading: {filename}")
+    with gzip.GzipFile(filename, "rb") as file:
+        return pickle.load(cast(IO[bytes], file))  # NOTE: only unpickle trusted files
+
+
+def write_pickle_gz_multiple(objs: List[Any], dirname: str) -> None:
+    for (i, obj) in enumerate(objs):  # one file per object, named by list position
+        write_pickle_gz(obj, f"{dirname}/{i:05d}.pkl.gz")  # 00000.pkl.gz, 00001.pkl.gz...
+
 
 # noinspection PyMethodMayBeStatic
 class Instance(ABC):
@@ -155,12 +175,116 @@ class Instance(ABC):
     def build_user_cut(self, model, violation):
         pass
 
-    def load(self, filename):
-        with gzip.GzipFile(filename, "r") as f:
-            data = json.loads(f.read().decode("utf-8"))
-        self.__dict__ = data
+    def flush(self) -> None:
+        """
+        Save pending changes to the underlying data store. This default does nothing.
+        """
+        pass
+
+
+def lazy_load(func):  # decorator: load self.instance on first use, then call func
+    def inner(self, *args, **kwargs):
+        if self.instance is None:
+            self.instance = self._load()
+            self.features = self.instance.features  # expose loaded data on the wrapper
+            self.training_data = self.instance.training_data
+        return func(self, *args, **kwargs)  # keyword arguments are forwarded as well
+
+    return inner
+
+
+class PickleGzInstance(Instance):
+    """
+    An instance backed by a gzipped pickle file.
+
+    The file is only loaded into memory on the first operation (e.g. `to_model`).
+
+    Parameters
+    ----------
+    filename: str
+        Path of the gzipped pickle file that should be loaded.
+    """
+
+    def __init__(self, filename: str) -> None:
+        super().__init__()
+        assert os.path.exists(filename), f"File not found: {filename}"
+        self.instance: Optional[Instance] = None  # set on first use by lazy_load
+        self.filename: str = filename
+
+    @lazy_load
+    def to_model(self) -> Any:
+        assert self.instance is not None
+        return self.instance.to_model()
+
+    @lazy_load
+    def get_instance_features(self) -> List[float]:
+        assert self.instance is not None
+        return self.instance.get_instance_features()
+
+    @lazy_load
+    def get_variable_features(self, var_name: str, index: VarIndex) -> List[float]:
+        assert self.instance is not None
+        return self.instance.get_variable_features(var_name, index)
+
+    @lazy_load
+    def get_variable_category(
+        self,
+        var_name: str,
+        index: VarIndex,
+    ) -> Optional[Hashable]:
+        assert self.instance is not None
+        return self.instance.get_variable_category(var_name, index)
+
+    @lazy_load
+    def get_constraint_features(self, cid: str) -> Optional[List[float]]:
+        assert self.instance is not None
+        return self.instance.get_constraint_features(cid)
+
+    @lazy_load
+    def get_constraint_category(self, cid: str) -> Optional[str]:
+        assert self.instance is not None
+        return self.instance.get_constraint_category(cid)
+
+    @lazy_load
+    def has_static_lazy_constraints(self) -> bool:
+        assert self.instance is not None
+        return self.instance.has_static_lazy_constraints()
+
+    @lazy_load
+    def has_dynamic_lazy_constraints(self):
+        assert self.instance is not None
+        return self.instance.has_dynamic_lazy_constraints()
+
+    @lazy_load
+    def is_constraint_lazy(self, cid: str) -> bool:
+        assert self.instance is not None
+        return self.instance.is_constraint_lazy(cid)
+
+    @lazy_load
+    def find_violated_lazy_constraints(self, model):
+        assert self.instance is not None
+        return self.instance.find_violated_lazy_constraints(model)
+
+    @lazy_load
+    def build_lazy_constraint(self, model, violation):
+        assert self.instance is not None
+        return self.instance.build_lazy_constraint(model, violation)
+
+    @lazy_load
+    def find_violated_user_cuts(self, model):
+        assert self.instance is not None
+        return self.instance.find_violated_user_cuts(model)
+
+    @lazy_load
+    def build_user_cut(self, model, violation):
+        assert self.instance is not None
+        return self.instance.build_user_cut(model, violation)
+
+    def _load(self) -> Instance:
+        obj = read_pickle_gz(self.filename)
+        assert isinstance(obj, Instance)  # the file must contain a pickled Instance
+        return obj
 
-    def dump(self, filename):
-        data = json.dumps(self.__dict__, indent=2).encode("utf-8")
-        with gzip.GzipFile(filename, "w") as f:
-            f.write(data)
+    def flush(self) -> None:
+        if self.instance is not None:  # never loaded: writing None would wipe the file
+            write_pickle_gz(self.instance, self.filename)
diff --git a/miplearn/solvers/learning.py b/miplearn/solvers/learning.py
index 4a3ab47..eede7e6 100644
--- a/miplearn/solvers/learning.py
+++ b/miplearn/solvers/learning.py
@@ -18,7 +18,7 @@ from miplearn.components.lazy_dynamic import DynamicLazyConstraintsComponent
 from miplearn.components.objective import ObjectiveValueComponent
 from miplearn.components.primal import PrimalSolutionComponent
 from miplearn.features import FeaturesExtractor
-from miplearn.instance import Instance
+from miplearn.instance import Instance, PickleGzInstance
 from miplearn.solvers import _RedirectOutput
 from miplearn.solvers.internal import InternalSolver
 from miplearn.solvers.pyomo.gurobi import GurobiPyomoSolver
@@ -30,8 +30,7 @@ logger = logging.getLogger(__name__)
 class _GlobalVariables:
     def __init__(self) -> None:
         self.solver: Optional[LearningSolver] = None
-        self.instances: Optional[Union[List[str], List[Instance]]] = None
-        self.output_filenames: Optional[List[str]] = None
+        self.instances: Optional[List[Instance]] = None
         self.discard_outputs: bool = False
 
 
@@ -44,16 +43,10 @@ _GLOBAL = [_GlobalVariables()]
 def _parallel_solve(idx):
     solver = _GLOBAL[0].solver
     instances = _GLOBAL[0].instances
-    output_filenames = _GLOBAL[0].output_filenames
     discard_outputs = _GLOBAL[0].discard_outputs
-    if output_filenames is None:
-        output_filename = None
-    else:
-        output_filename = output_filenames[idx]
     try:
         stats = solver.solve(
             instances[idx],
-            output_filename=output_filename,
             discard_output=discard_outputs,
         )
         return stats, instances[idx]
@@ -129,30 +122,12 @@ class LearningSolver:
 
     def _solve(
         self,
-        instance: Union[Instance, str],
+        instance: Instance,
         model: Any = None,
-        output_filename: Optional[str] = None,
         discard_output: bool = False,
         tee: bool = False,
     ) -> LearningSolveStats:
 
-        # Load instance from file, if necessary
-        filename = None
-        fileformat = None
-        file: Union[BinaryIO, gzip.GzipFile]
-        if isinstance(instance, str):
-            filename = instance
-            logger.info("Reading: %s" % filename)
-            if filename.endswith(".gz"):
-                fileformat = "pickle-gz"
-                with gzip.GzipFile(filename, "rb") as file:
-                    instance = pickle.load(cast(IO[bytes], file))
-            else:
-                fileformat = "pickle"
-                with open(filename, "rb") as file:
-                    instance = pickle.load(cast(IO[bytes], file))
-        assert isinstance(instance, Instance)
-
         # Generate model
         if model is None:
             with _RedirectOutput([]):
@@ -262,23 +237,15 @@ class LearningSolver:
             component.after_solve_mip(*callback_args)
 
         # Write to file, if necessary
-        if not discard_output and filename is not None:
-            if output_filename is None:
-                output_filename = filename
-            logger.info("Writing: %s" % output_filename)
-            if fileformat == "pickle":
-                with open(output_filename, "wb") as file:
-                    pickle.dump(instance, cast(IO[bytes], file))
-            else:
-                with gzip.GzipFile(output_filename, "wb") as file:
-                    pickle.dump(instance, cast(IO[bytes], file))
+        if not discard_output:
+            instance.flush()
+
         return stats
 
     def solve(
         self,
-        instance: Union[Instance, str],
+        instance: Instance,
         model: Any = None,
-        output_filename: Optional[str] = None,
         discard_output: bool = False,
         tee: bool = False,
     ) -> LearningSolveStats:
@@ -298,14 +265,10 @@ class LearningSolver:
 
         Parameters
         ----------
-        instance: Union[Instance, str]
-            The instance to be solved, or a filename.
+        instance: Instance
+            The instance to be solved.
         model: Any
             The corresponding Pyomo model. If not provided, it will be created.
-        output_filename: Optional[str]
-            If instance is a filename and output_filename is provided, write the
-            modified instance to this file, instead of replacing the original one. If
-            output_filename is None (the default), modified the original file in-place.
         discard_output: bool
             If True, do not write the modified instances anywhere; simply discard
             them. Useful during benchmarking.
@@ -325,30 +288,28 @@ class LearningSolver:
             details.
         """
         if self.simulate_perfect:
-            if not isinstance(instance, str):
+            if not isinstance(instance, PickleGzInstance):
                 raise Exception("Not implemented")
-            with tempfile.NamedTemporaryFile(suffix=os.path.basename(instance)) as tmp:
-                self._solve(
-                    instance=instance,
-                    model=model,
-                    output_filename=tmp.name,
-                    tee=tee,
-                )
-                self.fit([tmp.name])
+            self._solve(
+                instance=instance,
+                model=model,
+                tee=tee,
+                discard_output=True,
+            )
+            self.fit([instance])
+            instance.instance = None
         return self._solve(
             instance=instance,
             model=model,
-            output_filename=output_filename,
             discard_output=discard_output,
             tee=tee,
         )
 
     def parallel_solve(
         self,
-        instances: Union[List[str], List[Instance]],
+        instances: List[Instance],
         n_jobs: int = 4,
         label: str = "Solve",
-        output_filenames: Optional[List[str]] = None,
         discard_outputs: bool = False,
     ) -> List[LearningSolveStats]:
         """
@@ -361,17 +322,13 @@ class LearningSolver:
 
         Parameters
         ----------
-        output_filenames: Optional[List[str]]
-            If instances are file names and output_filenames is provided, write the
-            modified instances to these files, instead of replacing the original
-            files. If output_filenames is None, modifies the instances in-place.
         discard_outputs: bool
             If True, do not write the modified instances anywhere; simply discard
             them instead. Useful during benchmarking.
         label: str
             Label to show in the progress bar.
-        instances: Union[List[str], List[Instance]]
-            The instances to be solved
+        instances: List[Instance]
+            The instances to be solved.
         n_jobs: int
             Number of instances to solve in parallel at a time.
 
@@ -388,7 +345,6 @@ class LearningSolver:
             self.internal_solver = None
             self._silence_miplearn_logger()
             _GLOBAL[0].solver = self
-            _GLOBAL[0].output_filenames = output_filenames
             _GLOBAL[0].instances = instances
             _GLOBAL[0].discard_outputs = discard_outputs
             results = p_map(
@@ -405,7 +361,7 @@ class LearningSolver:
             self._restore_miplearn_logger()
             return stats
 
-    def fit(self, training_instances: Union[List[str], List[Instance]]) -> None:
+    def fit(self, training_instances: List[Instance]) -> None:
         logger.debug("Fitting...")
         if len(training_instances) == 0:
             return
diff --git a/tests/solvers/test_learning_solver.py b/tests/solvers/test_learning_solver.py
index 36abbe3..913eb6b 100644
--- a/tests/solvers/test_learning_solver.py
+++ b/tests/solvers/test_learning_solver.py
@@ -8,6 +8,7 @@ import pickle
 import tempfile
 import os
 
+from miplearn.instance import PickleGzInstance, write_pickle_gz, read_pickle_gz
 from miplearn.solvers.gurobi import GurobiSolver
 from miplearn.solvers.learning import LearningSolver
 from . import _get_knapsack_instance, get_internal_solvers
@@ -78,61 +79,40 @@ def test_parallel_solve():
 def test_solve_fit_from_disk():
     for internal_solver in get_internal_solvers():
         # Create instances and pickle them
-        filenames = []
+        instances = []
         for k in range(3):
             instance = _get_knapsack_instance(internal_solver)
             with tempfile.NamedTemporaryFile(suffix=".pkl", delete=False) as file:
-                filenames += [file.name]
-                pickle.dump(instance, file)
+                instances += [PickleGzInstance(file.name)]
+                write_pickle_gz(instance, file.name)
 
         # Test: solve
         solver = LearningSolver(solver=internal_solver)
-        solver.solve(filenames[0])
-        with open(filenames[0], "rb") as file:
-            instance = pickle.load(file)
-            assert len(instance.training_data) > 0
+        solver.solve(instances[0])
+        instance_loaded = read_pickle_gz(instances[0].filename)
+        assert len(instance_loaded.training_data) > 0
 
         # Test: parallel_solve
-        solver.parallel_solve(filenames)
-        for filename in filenames:
-            with open(filename, "rb") as file:
-                instance = pickle.load(file)
-                assert len(instance.training_data) > 0
-
-        # Test: solve (with specified output)
-        output = [f + ".out" for f in filenames]
-        solver.solve(
-            filenames[0],
-            output_filename=output[0],
-        )
-        assert os.path.isfile(output[0])
-
-        # Test: parallel_solve (with specified output)
-        solver.parallel_solve(
-            filenames,
-            output_filenames=output,
-        )
-        for filename in output:
-            assert os.path.isfile(filename)
+        solver.parallel_solve(instances)
+        for instance in instances:
+            instance_loaded = read_pickle_gz(instance.filename)
+            assert len(instance_loaded.training_data) > 0
 
         # Delete temporary files
-        for filename in filenames:
-            os.remove(filename)
-        for filename in output:
-            os.remove(filename)
+        for instance in instances:
+            os.remove(instance.filename)
 
 
 def test_simulate_perfect():
     internal_solver = GurobiSolver
     instance = _get_knapsack_instance(internal_solver)
     with tempfile.NamedTemporaryFile(suffix=".pkl", delete=False) as tmp:
-        pickle.dump(instance, tmp)
-        tmp.flush()
+        write_pickle_gz(instance, tmp.name)
         solver = LearningSolver(
             solver=internal_solver,
             simulate_perfect=True,
         )
-        stats = solver.solve(tmp.name)
+        stats = solver.solve(PickleGzInstance(tmp.name))
         assert stats["Lower bound"] == stats["Objective: Predicted lower bound"]
 
 
diff --git a/tests/test_instance.py b/tests/test_instance.py
new file mode 100644
index 0000000..0ee3b9c
--- /dev/null
+++ b/tests/test_instance.py
@@ -0,0 +1,16 @@
+#  MIPLearn: Extensible Framework for Learning-Enhanced Mixed-Integer Optimization
+#  Copyright (C) 2020, UChicago Argonne, LLC. All rights reserved.
+#  Released under the modified BSD license. See COPYING.md for more details.
+import tempfile
+
+from miplearn import GurobiSolver
+from miplearn.instance import write_pickle_gz, PickleGzInstance
+from tests.fixtures.knapsack import get_knapsack_instance
+
+
+def test_pickled() -> None:
+    original = get_knapsack_instance(GurobiSolver())
+    with tempfile.NamedTemporaryFile() as file:  # closed and deleted on exit
+        write_pickle_gz(original, file.name)
+        pickled = PickleGzInstance(file.name)
+        assert pickled.to_model() is not None