Implement SetPackPerturber and SetCoverPerturber

2025-12-09 02:48:52 -06:00 · 2025-12-08 13:47:33 -06:00
parent 427bd1d806
commit 4137378bb8
4 changed files with 213 additions and 106 deletions
--- a/docs/guide/problems.ipynb
+++ b/docs/guide/problems.ipynb
@@ -665,9 +665,10 @@
    "\n",
    "Finally, the weight of set $j$ is set to $w_j + K | S_j |$, where $w_j$ and $K$ are sampled from `costs` and `K`, respectively, and where $|S_j|$ denotes the size of set $S_j$. The parameter $K$ is used to introduce some correlation between the size of the set and its weight, making the instance more challenging. Note that `K` is only sampled once for the entire instance.\n",
    "\n",
-    "If `fix_sets=True`, then all generated instances have exactly the same sets and elements. The costs of the sets, however, are multiplied by random scaling factors sampled from the provided probability distribution `costs_jitter`.\n",
+    "To create multiple instances with the same incidence matrix but different costs, you can use [SetCoverPerturber][SetCoverPerturber]. This class takes an existing `SetCoverData` instance and generates new instances by multiplying each set cost by a random scaling factor sampled from the provided probability distribution `costs_jitter`, while keeping the incidence matrix fixed.\n",
    "\n",
-    "[SetCoverGenerator]: ../../api/problems/#miplearn.problems.setcover.SetCoverGenerator"
+    "[SetCoverGenerator]: ../../api/problems/#miplearn.problems.setcover.SetCoverGenerator\n",
+    "[SetCoverPerturber]: ../../api/problems/#miplearn.problems.setcover.SetCoverPerturber"
   ]
  },
  {
@@ -680,7 +681,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 8,
   "id": "3224845b-9afd-463e-abf4-e0e93d304859",
   "metadata": {
    "ExecuteTime": {
@@ -699,8 +700,8 @@
      " [0 1 1 1 1 0 1 0 0 1]\n",
      " [0 1 1 0 0 0 1 1 0 1]\n",
      " [1 1 1 0 1 0 1 0 0 1]]\n",
-      "costs [1044.58  850.13 1014.5   944.83  697.9   971.87  213.49  220.98   70.23\n",
-      "  425.33]\n",
+      "costs [1021.32  811.25 1081.2   917.76  667.32  980.17  198.16  234.34   64.25\n",
+      "  466.75]\n",
      "\n",
      "Gurobi Optimizer version 13.0.0 build v13.0.0rc1 (linux64 - \"Ubuntu 22.04.5 LTS\")\n",
      "\n",
@@ -708,15 +709,15 @@
      "Thread count: 16 physical cores, 16 logical processors, using up to 16 threads\n",
      "\n",
      "Optimize a model with 5 rows, 10 columns and 28 nonzeros (Min)\n",
-      "Model fingerprint: 0xe5c2d4fa\n",
+      "Model fingerprint: 0x57e5c4ba\n",
      "Model has 10 linear objective coefficients\n",
      "Variable types: 0 continuous, 10 integer (10 binary)\n",
      "Coefficient statistics:\n",
      "  Matrix range     [1e+00, 1e+00]\n",
-      "  Objective range  [7e+01, 1e+03]\n",
+      "  Objective range  [6e+01, 1e+03]\n",
      "  Bounds range     [1e+00, 1e+00]\n",
      "  RHS range        [1e+00, 1e+00]\n",
-      "Found heuristic solution: objective 213.4900000\n",
+      "Found heuristic solution: objective 198.1600000\n",
      "Presolve removed 5 rows and 10 columns\n",
      "Presolve time: 0.00s\n",
      "Presolve: All rows and columns removed\n",
@@ -724,10 +725,10 @@
      "Explored 0 nodes (0 simplex iterations) in 0.00 seconds (0.00 work units)\n",
      "Thread count was 1 (of 16 available processors)\n",
      "\n",
-      "Solution count 1: 213.49 \n",
+      "Solution count 1: 198.16 \n",
      "\n",
      "Optimal solution found (tolerance 1.00e-04)\n",
-      "Best objective 2.134900000000e+02, best bound 2.134900000000e+02, gap 0.0000%\n",
+      "Best objective 1.981600000000e+02, best bound 1.981600000000e+02, gap 0.0000%\n",
      "\n",
      "User-callback calls 181, time in user-callback 0.00 sec\n"
     ]
@@ -736,23 +737,26 @@
   "source": [
    "import numpy as np\n",
    "from scipy.stats import uniform, randint\n",
-    "from miplearn.problems.setcover import SetCoverGenerator, build_setcover_model_gurobipy\n",
+    "from miplearn.problems.setcover import SetCoverGenerator, SetCoverPerturber, build_setcover_model_gurobipy\n",
    "\n",
    "# Set random seed, to make example reproducible\n",
    "np.random.seed(42)\n",
    "\n",
-    "# Build random instances with five elements, ten sets and costs\n",
-    "# in the [0, 1000] interval, with a correlation factor of 25 and\n",
-    "# an incidence matrix with 25% density.\n",
-    "data = SetCoverGenerator(\n",
+    "# Generate a reference instance with five elements and ten sets\n",
+    "generator = SetCoverGenerator(\n",
    "    n_elements=randint(low=5, high=6),\n",
    "    n_sets=randint(low=10, high=11),\n",
    "    costs=uniform(loc=0.0, scale=1000.0),\n",
-    "    costs_jitter=uniform(loc=0.90, scale=0.20),\n",
    "    density=uniform(loc=0.5, scale=0.00),\n",
    "    K=uniform(loc=25.0, scale=0.0),\n",
-    "    fix_sets=True,\n",
-    ").generate(10)\n",
+    ")\n",
+    "reference_instance = generator.generate(1)[0]\n",
+    "\n",
+    "# Generate perturbed instances using the reference\n",
+    "perturber = SetCoverPerturber(\n",
+    "    costs_jitter=uniform(loc=0.9, scale=0.2),\n",
+    ")\n",
+    "data = perturber.perturb(reference_instance, 10)\n",
    "\n",
    "# Print problem data for one instance\n",
    "print(\"matrix\\n\", data[0].incidence_matrix)\n",
@@ -810,15 +814,18 @@
    "\n",
    "The class [SetPackGenerator][SetPackGenerator] can generate random instances of this problem. It accepts exactly the same arguments, and generates instance data in exactly the same way as [SetCoverGenerator][SetCoverGenerator]. For more details, please see the documentation for that class.\n",
    "\n",
+    "To create multiple instances with the same incidence matrix but different costs, you can use [SetPackPerturber][SetPackPerturber]. This class takes an existing `SetPackData` instance and generates new instances by multiplying each set cost by a random scaling factor sampled from the provided probability distribution `costs_jitter`, while keeping the incidence matrix fixed.\n",
+    "\n",
    "[SetPackGenerator]: ../../api/problems/#miplearn.problems.setpack.SetPackGenerator\n",
    "[SetCoverGenerator]: ../../api/problems/#miplearn.problems.setcover.SetCoverGenerator\n",
+    "[SetPackPerturber]: ../../api/problems/#miplearn.problems.setpack.SetPackPerturber\n",
    "\n",
    "### Example"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 9,
   "id": "cc797da7",
   "metadata": {
    "ExecuteTime": {
@@ -841,8 +848,8 @@
      " [0 1 1 1 1 0 1 0 0 1]\n",
      " [0 1 1 0 0 0 1 1 0 1]\n",
      " [1 1 1 0 1 0 1 0 0 1]]\n",
-      "costs [1044.58  850.13 1014.5   944.83  697.9   971.87  213.49  220.98   70.23\n",
-      "  425.33]\n",
+      "costs [1021.32  811.25 1081.2   917.76  667.32  980.17  198.16  234.34   64.25\n",
+      "  466.75]\n",
      "\n",
      "Gurobi Optimizer version 13.0.0 build v13.0.0rc1 (linux64 - \"Ubuntu 22.04.5 LTS\")\n",
      "\n",
@@ -850,15 +857,15 @@
      "Thread count: 16 physical cores, 16 logical processors, using up to 16 threads\n",
      "\n",
      "Optimize a model with 5 rows, 10 columns and 28 nonzeros (Min)\n",
-      "Model fingerprint: 0x4ee91388\n",
+      "Model fingerprint: 0x75cd8328\n",
      "Model has 10 linear objective coefficients\n",
      "Variable types: 0 continuous, 10 integer (10 binary)\n",
      "Coefficient statistics:\n",
      "  Matrix range     [1e+00, 1e+00]\n",
-      "  Objective range  [7e+01, 1e+03]\n",
+      "  Objective range  [6e+01, 1e+03]\n",
      "  Bounds range     [1e+00, 1e+00]\n",
      "  RHS range        [1e+00, 1e+00]\n",
-      "Found heuristic solution: objective -1265.560000\n",
+      "Found heuristic solution: objective -1255.660000\n",
      "Presolve removed 5 rows and 10 columns\n",
      "Presolve time: 0.00s\n",
      "Presolve: All rows and columns removed\n",
@@ -866,11 +873,11 @@
      "Explored 0 nodes (0 simplex iterations) in 0.00 seconds (0.00 work units)\n",
      "Thread count was 1 (of 16 available processors)\n",
      "\n",
-      "Solution count 2: -1986.37 -1265.56 \n",
-      "No other solutions better than -1986.37\n",
+      "Solution count 2: -2061.37 -1255.66 \n",
+      "No other solutions better than -2061.37\n",
      "\n",
      "Optimal solution found (tolerance 1.00e-04)\n",
-      "Best objective -1.986370000000e+03, best bound -1.986370000000e+03, gap 0.0000%\n",
+      "Best objective -2.061370000000e+03, best bound -2.061370000000e+03, gap 0.0000%\n",
      "\n",
      "User-callback calls 242, time in user-callback 0.00 sec\n"
     ]
@@ -879,23 +886,26 @@
   "source": [
    "import numpy as np\n",
    "from scipy.stats import uniform, randint\n",
-    "from miplearn.problems.setpack import SetPackGenerator, build_setpack_model_gurobipy\n",
+    "from miplearn.problems.setpack import SetPackGenerator, SetPackPerturber, build_setpack_model_gurobipy\n",
    "\n",
    "# Set random seed, to make example reproducible\n",
    "np.random.seed(42)\n",
    "\n",
-    "# Build random instances with five elements, ten sets and costs\n",
-    "# in the [0, 1000] interval, with a correlation factor of 25 and\n",
-    "# an incidence matrix with 25% density.\n",
-    "data = SetPackGenerator(\n",
+    "# Generate a reference instance with five elements and ten sets\n",
+    "generator = SetPackGenerator(\n",
    "    n_elements=randint(low=5, high=6),\n",
    "    n_sets=randint(low=10, high=11),\n",
    "    costs=uniform(loc=0.0, scale=1000.0),\n",
-    "    costs_jitter=uniform(loc=0.90, scale=0.20),\n",
    "    density=uniform(loc=0.5, scale=0.00),\n",
    "    K=uniform(loc=25.0, scale=0.0),\n",
-    "    fix_sets=True,\n",
-    ").generate(10)\n",
+    ")\n",
+    "reference_instance = generator.generate(1)[0]\n",
+    "\n",
+    "# Generate perturbed instances using the reference\n",
+    "perturber = SetPackPerturber(\n",
+    "    costs_jitter=uniform(loc=0.9, scale=0.2),\n",
+    ")\n",
+    "data = perturber.perturb(reference_instance, 10)\n",
    "\n",
    "# Print problem data for one instance\n",
    "print(\"matrix\\n\", data[0].incidence_matrix)\n",
--- a/miplearn/problems/setcover.py
+++ b/miplearn/problems/setcover.py
@@ -24,29 +24,57 @@ class SetCoverData:


 class SetCoverGenerator:
+    """Random instance generator for the Set Cover Problem.
+
+    Generates instances by creating a new random incidence matrix for each
+    instance, where the number of elements, sets, density, and costs are sampled
+    from user-provided probability distributions.
+    """
+
    def __init__(
        self,
        n_elements: rv_frozen = randint(low=50, high=51),
        n_sets: rv_frozen = randint(low=100, high=101),
        costs: rv_frozen = uniform(loc=0.0, scale=100.0),
-        costs_jitter: rv_frozen = uniform(loc=-5.0, scale=10.0),
        K: rv_frozen = uniform(loc=25.0, scale=0.0),
        density: rv_frozen = uniform(loc=0.02, scale=0.00),
-        fix_sets: bool = True,
    ):
+        """Initialize the problem generator.
+
+        Parameters
+        ----------
+        n_elements: rv_discrete
+            Probability distribution for number of elements.
+        n_sets: rv_discrete
+            Probability distribution for number of sets.
+        costs: rv_continuous
+            Probability distribution for base set costs.
+        K: rv_continuous
+            Probability distribution for cost scaling factor based on set size.
+        density: rv_continuous
+            Probability distribution for incidence matrix density.
+        """
+        assert isinstance(
+            n_elements, rv_frozen
+        ), "n_elements should be a SciPy probability distribution"
+        assert isinstance(
+            n_sets, rv_frozen
+        ), "n_sets should be a SciPy probability distribution"
+        assert isinstance(
+            costs, rv_frozen
+        ), "costs should be a SciPy probability distribution"
+        assert isinstance(K, rv_frozen), "K should be a SciPy probability distribution"
+        assert isinstance(
+            density, rv_frozen
+        ), "density should be a SciPy probability distribution"
        self.n_elements = n_elements
        self.n_sets = n_sets
        self.costs = costs
-        self.costs_jitter = costs_jitter
        self.density = density
        self.K = K
-        self.fix_sets = fix_sets
-        self.fixed_costs = None
-        self.fixed_matrix = None

    def generate(self, n_samples: int) -> List[SetCoverData]:
        def _sample() -> SetCoverData:
-            if self.fixed_matrix is None:
            n_sets = self.n_sets.rvs()
            n_elements = self.n_elements.rvs()
            density = self.density.rvs()
@@ -64,16 +92,7 @@ class SetCoverGenerator:
                if incidence_matrix[:, i].sum() == 0:
                    incidence_matrix[randint(low=0, high=n_elements).rvs(), i] = 1

-                costs = self.costs.rvs(n_sets) + self.K.rvs() * incidence_matrix.sum(
-                    axis=0
-                )
-                if self.fix_sets:
-                    self.fixed_matrix = incidence_matrix
-                    self.fixed_costs = costs
-            else:
-                incidence_matrix = self.fixed_matrix
-                (_, n_sets) = incidence_matrix.shape
-                costs = self.fixed_costs * self.costs_jitter.rvs(n_sets)
+            costs = self.costs.rvs(n_sets) + self.K.rvs() * incidence_matrix.sum(axis=0)
            return SetCoverData(
                costs=costs.round(2),
                incidence_matrix=incidence_matrix,
@@ -82,6 +101,47 @@ class SetCoverGenerator:
        return [_sample() for _ in range(n_samples)]


+class SetCoverPerturber:
+    """Perturbation generator for existing Set Cover instances.
+
+    Takes an existing SetCoverData instance and generates new instances
+    by applying randomization factors to the existing costs while keeping the
+    incidence matrix fixed.
+    """
+
+    def __init__(
+        self,
+        costs_jitter: rv_frozen = uniform(loc=0.9, scale=0.2),
+    ):
+        """Initialize the perturbation generator.
+
+        Parameters
+        ----------
+        costs_jitter: rv_continuous
+            Probability distribution for randomization factors applied to set costs.
+        """
+        assert isinstance(
+            costs_jitter, rv_frozen
+        ), "costs_jitter should be a SciPy probability distribution"
+        self.costs_jitter = costs_jitter
+
+    def perturb(
+        self,
+        instance: SetCoverData,
+        n_samples: int,
+    ) -> List[SetCoverData]:
+        def _sample() -> SetCoverData:
+            (_, n_sets) = instance.incidence_matrix.shape
+            jitter_factors = self.costs_jitter.rvs(n_sets)
+            costs = np.round(instance.costs * jitter_factors, 2)
+            return SetCoverData(
+                costs=costs,
+                incidence_matrix=instance.incidence_matrix,
+            )
+
+        return [_sample() for _ in range(n_samples)]
+
+
 def build_setcover_model_gurobipy(data: Union[str, SetCoverData]) -> GurobiModel:
    data = _read_setcover_data(data)
    (n_elements, n_sets) = data.incidence_matrix.shape
--- a/miplearn/problems/setpack.py
+++ b/miplearn/problems/setpack.py
@@ -11,7 +11,7 @@ from gurobipy import GRB
 from scipy.stats import uniform, randint
 from scipy.stats.distributions import rv_frozen

-from .setcover import SetCoverGenerator
+from .setcover import SetCoverGenerator, SetCoverPerturber
 from miplearn.solvers.gurobi import GurobiModel
 from ..io import read_pkl_gz

@@ -23,24 +23,55 @@ class SetPackData:


 class SetPackGenerator:
+    """Random instance generator for the Set Packing Problem.
+
+    Generates instances by creating a new random incidence matrix for each
+    instance, where the number of elements, sets, density, and costs are sampled
+    from user-provided probability distributions.
+    """
+
    def __init__(
        self,
        n_elements: rv_frozen = randint(low=50, high=51),
        n_sets: rv_frozen = randint(low=100, high=101),
        costs: rv_frozen = uniform(loc=0.0, scale=100.0),
-        costs_jitter: rv_frozen = uniform(loc=-5.0, scale=10.0),
        K: rv_frozen = uniform(loc=25.0, scale=0.0),
        density: rv_frozen = uniform(loc=0.02, scale=0.00),
-        fix_sets: bool = True,
    ) -> None:
+        """Initialize the problem generator.
+
+        Parameters
+        ----------
+        n_elements: rv_discrete
+            Probability distribution for number of elements.
+        n_sets: rv_discrete
+            Probability distribution for number of sets.
+        costs: rv_continuous
+            Probability distribution for base set costs.
+        K: rv_continuous
+            Probability distribution for cost scaling factor based on set size.
+        density: rv_continuous
+            Probability distribution for incidence matrix density.
+        """
+        assert isinstance(
+            n_elements, rv_frozen
+        ), "n_elements should be a SciPy probability distribution"
+        assert isinstance(
+            n_sets, rv_frozen
+        ), "n_sets should be a SciPy probability distribution"
+        assert isinstance(
+            costs, rv_frozen
+        ), "costs should be a SciPy probability distribution"
+        assert isinstance(K, rv_frozen), "K should be a SciPy probability distribution"
+        assert isinstance(
+            density, rv_frozen
+        ), "density should be a SciPy probability distribution"
        self.gen = SetCoverGenerator(
            n_elements=n_elements,
            n_sets=n_sets,
            costs=costs,
-            costs_jitter=costs_jitter,
            K=K,
            density=density,
-            fix_sets=fix_sets,
        )

    def generate(self, n_samples: int) -> List[SetPackData]:
@@ -53,6 +84,47 @@ class SetPackGenerator:
        ]


+class SetPackPerturber:
+    """Perturbation generator for existing Set Packing instances.
+
+    Takes an existing SetPackData instance and generates new instances
+    by applying randomization factors to the existing costs while keeping the
+    incidence matrix fixed.
+    """
+
+    def __init__(
+        self,
+        costs_jitter: rv_frozen = uniform(loc=0.9, scale=0.2),
+    ):
+        """Initialize the perturbation generator.
+
+        Parameters
+        ----------
+        costs_jitter: rv_continuous
+            Probability distribution for randomization factors applied to set costs.
+        """
+        assert isinstance(
+            costs_jitter, rv_frozen
+        ), "costs_jitter should be a SciPy probability distribution"
+        self.costs_jitter = costs_jitter
+
+    def perturb(
+        self,
+        instance: SetPackData,
+        n_samples: int,
+    ) -> List[SetPackData]:
+        def _sample() -> SetPackData:
+            (_, n_sets) = instance.incidence_matrix.shape
+            jitter_factors = self.costs_jitter.rvs(n_sets)
+            costs = np.round(instance.costs * jitter_factors, 2)
+            return SetPackData(
+                costs=costs,
+                incidence_matrix=instance.incidence_matrix,
+            )
+
+        return [_sample() for _ in range(n_samples)]
+
+
 def build_setpack_model_gurobipy(data: Union[str, SetPackData]) -> GurobiModel:
    if isinstance(data, str):
        data = read_pkl_gz(data)
--- a/tests/problems/test_setcover.py
+++ b/tests/problems/test_setcover.py
@@ -23,51 +23,16 @@ def test_set_cover_generator() -> None:
        n_elements=randint(low=3, high=4),
        n_sets=randint(low=5, high=6),
        costs=uniform(loc=0.0, scale=100.0),
-        costs_jitter=uniform(loc=0.95, scale=0.10),
        density=uniform(loc=0.5, scale=0),
        K=uniform(loc=25, scale=0),
-        fix_sets=False,
    )
-    data = gen.generate(2)
-
+    data = gen.generate(1)
    assert data[0].costs.round(1).tolist() == [136.8, 86.2, 25.7, 27.3, 102.5]
    assert data[0].incidence_matrix.tolist() == [
        [1, 0, 1, 0, 1],
        [1, 1, 0, 0, 0],
        [1, 0, 0, 1, 1],
    ]
-    assert data[1].costs.round(1).tolist() == [63.5, 76.6, 48.1, 74.1, 93.3]
-    assert data[1].incidence_matrix.tolist() == [
-        [1, 1, 0, 1, 1],
-        [0, 1, 0, 1, 0],
-        [0, 1, 1, 0, 0],
-    ]
-
-
-def test_set_cover_generator_with_fixed_sets() -> None:
-    np.random.seed(42)
-    gen = SetCoverGenerator(
-        n_elements=randint(low=3, high=4),
-        n_sets=randint(low=5, high=6),
-        costs=uniform(loc=0.0, scale=100.0),
-        costs_jitter=uniform(loc=0.95, scale=0.10),
-        density=uniform(loc=0.5, scale=0.00),
-        fix_sets=True,
-    )
-    data = gen.generate(3)
-
-    assert data[0].costs.tolist() == [136.75, 86.17, 25.71, 27.31, 102.48]
-    assert data[1].costs.tolist() == [135.38, 82.26, 26.92, 26.58, 98.28]
-    assert data[2].costs.tolist() == [138.37, 85.15, 26.95, 27.22, 106.17]
-
-    print(data[0].incidence_matrix)
-
-    for i in range(3):
-        assert data[i].incidence_matrix.tolist() == [
-            [1, 0, 1, 0, 1],
-            [1, 1, 0, 0, 0],
-            [1, 0, 0, 1, 1],
-        ]


 def test_set_cover() -> None: