diff --git a/Makefile b/Makefile
index 2ed3f1c..3d80474 100644
--- a/Makefile
+++ b/Makefile
@@ -43,6 +43,6 @@ reformat:
 test:
 	$(MYPY) -p miplearn
 	$(MYPY) -p tests
-	$(PYTEST) $(PYTEST_ARGS)
+	$(PYTEST) $(PYTEST_ARGS) tests/classifiers
 
 .PHONY: test test-watch docs install
diff --git a/docs/customization.md b/docs/customization.md
index dd660ca..d9bbedd 100644
--- a/docs/customization.md
+++ b/docs/customization.md
@@ -152,25 +152,18 @@ dtype: float64
 
 ### Using customized ML classifiers and regressors
 
-By default, given a training set of instantes, MIPLearn trains a fixed set of ML classifiers and regressors, then
-selects the best one based on cross-validation performance. Alternatively, the user may specify which ML model a component
-should use through the `classifier` or `regressor` contructor parameters. The provided classifiers and regressors must 
-follow the sklearn API. In particular, classifiers must provide the methods `fit`, `predict_proba` and `predict`,
-while regressors must provide the methods `fit` and `predict`
-
-!!! danger
-    MIPLearn must be able to generate a copy of any custom ML classifiers and regressors through 
-    the standard  `copy.deepcopy` method. This currently makes it incompatible with Keras and TensorFlow
-    predictors. This is a known limitation, which will be addressed in a future version.
-    
-The example below shows how to construct a `PrimalSolutionComponent` which internally uses
-sklearn's `KNeighborsClassifiers`. Any other sklearn classifier or pipeline can be used. 
+By default, given a training set of instances, MIPLearn trains a fixed set of ML classifiers and regressors, then selects the best one based on cross-validation performance. Alternatively, the user may specify which ML model a component should use through the `classifier` or `regressor` constructor parameters. Scikit-learn classifiers and regressors are currently supported. A future version of the package will add compatibility with Keras models.
+
+The example below shows how to construct a `PrimalSolutionComponent` which internally uses scikit-learn's `KNeighborsClassifier`. Any other scikit-learn classifier or pipeline can be used. The classifier needs to be provided as a lambda function because the component may need to create multiple copies of it. It needs to be wrapped in `ScikitLearnClassifier` to ensure that all the proper data transformations are applied.
 
 ```python
-from miplearn import PrimalSolutionComponent
+from miplearn import PrimalSolutionComponent, ScikitLearnClassifier
 from sklearn.neighbors import KNeighborsClassifier
 
-comp = PrimalSolutionComponent(classifier=KNeighborsClassifier(n_neighbors=5))
+comp = PrimalSolutionComponent(
+    classifier=lambda: ScikitLearnClassifier(
+        KNeighborsClassifier(n_neighbors=5),
+    ),
+)
 comp.fit(train_instances)
 ``` 
-  
\ No newline at end of file
diff --git a/miplearn/__init__.py b/miplearn/__init__.py
index 28003e7..3d8f052 100644
--- a/miplearn/__init__.py
+++ b/miplearn/__init__.py
@@ -3,7 +3,11 @@
 #  Released under the modified BSD license. See COPYING.md for more details.
 
 from .benchmark import BenchmarkRunner
-from .classifiers import Classifier, Regressor
+from .classifiers import (
+    Classifier,
+    Regressor,
+    ScikitLearnClassifier,
+)
 from .classifiers.adaptive import AdaptiveClassifier
 from .classifiers.threshold import MinPrecisionThreshold
 from .components.component import Component
diff --git a/miplearn/classifiers/__init__.py b/miplearn/classifiers/__init__.py
index f46c41d..265f644 100644
--- a/miplearn/classifiers/__init__.py
+++ b/miplearn/classifiers/__init__.py
@@ -3,7 +3,7 @@
 #  Released under the modified BSD license. See COPYING.md for more details.
 
 from abc import ABC, abstractmethod
-from typing import Optional
+from typing import Optional, Any
 
 import numpy as np
 
@@ -14,7 +14,7 @@ class Classifier(ABC):
     data.
     """
 
-    def __init__(self):
+    def __init__(self) -> None:
         self.n_features: Optional[int] = None
         self.n_classes: Optional[int] = None
 
@@ -77,7 +77,7 @@ class Regressor(ABC):
     values of other variables.
     """
 
-    def __init__(self):
+    def __init__(self) -> None:
         self.n_inputs: Optional[int] = None
 
     @abstractmethod
@@ -128,3 +128,38 @@ class Regressor(ABC):
         (n_samples, n_inputs_x) = x_test.shape
         assert n_inputs_x == self.n_inputs
         return np.ndarray([])
+
+
+class ScikitLearnClassifier(Classifier):
+    """
+    Wrapper for scikit-learn classifiers, which makes sure inputs and outputs have the
+    correct dimensions and types. `fit` trains the inner model on column 1 of y_train.
+    """
+
+    def __init__(self, clf: Any) -> None:
+        super().__init__()
+        self.inner_clf = clf
+
+    def fit(self, x_train: np.ndarray, y_train: np.ndarray) -> None:
+        super().fit(x_train, y_train)
+        (n_samples, n_classes) = y_train.shape  # one column of y_train per class
+        assert n_classes == 2, "scikit-learn classifiers must have exactly two classes"
+        self.inner_clf.fit(x_train, y_train[:, 1])
+
+    def predict_proba(self, x_test: np.ndarray) -> np.ndarray:
+        super().predict_proba(x_test)
+        n_samples = x_test.shape[0]
+        sklearn_proba = self.inner_clf.predict_proba(x_test)
+        if isinstance(sklearn_proba, list):
+            assert len(sklearn_proba) == self.n_classes
+            for pb in sklearn_proba:
+                assert isinstance(pb, np.ndarray)
+                assert pb.dtype in [np.float16, np.float32, np.float64]
+                assert pb.shape == (n_samples, 2)
+            proba = np.hstack([pb[:, [1]] for pb in sklearn_proba])
+            assert proba.shape == (n_samples, self.n_classes)
+            return proba
+        # Not a list: a single (n_samples, 2) array is expected and returned as-is.
+        assert isinstance(sklearn_proba, np.ndarray)
+        assert sklearn_proba.shape == (n_samples, 2)
+        return sklearn_proba
diff --git a/miplearn/classifiers/adaptive.py b/miplearn/classifiers/adaptive.py
index 60b006e..de3cb0b 100644
--- a/miplearn/classifiers/adaptive.py
+++ b/miplearn/classifiers/adaptive.py
@@ -29,9 +29,6 @@ class AdaptiveClassifier(Classifier):
         candidates: Dict[str, Any] = None,
         evaluator: ClassifierEvaluator = ClassifierEvaluator(),
     ) -> None:
-        """
-        Initializes the meta-classifier.
-        """
         if candidates is None:
             candidates = {
                 "knn(100)": {
diff --git a/miplearn/classifiers/counting.py b/miplearn/classifiers/counting.py
index 0c8520a..226013b 100644
--- a/miplearn/classifiers/counting.py
+++ b/miplearn/classifiers/counting.py
@@ -1,6 +1,7 @@
 #  MIPLearn: Extensible Framework for Learning-Enhanced Mixed-Integer Optimization
 #  Copyright (C) 2020, UChicago Argonne, LLC. All rights reserved.
 #  Released under the modified BSD license. See COPYING.md for more details.
+from typing import Optional, cast
 
 import numpy as np
 
@@ -9,20 +10,33 @@ from miplearn.classifiers import Classifier
 
 class CountingClassifier(Classifier):
     """
-    A classifier that generates constant predictions, based only on the
-    frequency of the training labels. For example, if y_train is [1.0, 0.0, 0.0]
-    this classifier always returns [0.66 0.33] for any x_test. It essentially
-    counts how many times each label appeared, hence the name.
+
+    A classifier that generates constant predictions, based only on the frequency of
+    the training labels. For example, suppose `y_train` is given by:
+    ```python
+    y_train = np.array([
+        [True, False],
+        [False, True],
+        [False, True],
+    ])
+    ```
+    Then `predict_proba` always returns `[0.33 0.66]` for every sample, regardless of
+    `x_train`. It essentially counts how many times each label appeared, hence the name.
+
     """
 
     def __init__(self) -> None:
-        self.mean = None
+        super().__init__()
+        self.mean: Optional[np.ndarray] = None
 
-    def fit(self, x_train, y_train):
-        self.mean = np.mean(y_train)
+    def fit(self, x_train: np.ndarray, y_train: np.ndarray) -> None:
+        super().fit(x_train, y_train)
+        self.mean = cast(np.ndarray, np.mean(y_train, axis=0))
 
-    def predict_proba(self, x_test):
-        return np.array([[1 - self.mean, self.mean] for _ in range(x_test.shape[0])])
+    def predict_proba(self, x_test: np.ndarray) -> np.ndarray:
+        super().predict_proba(x_test)
+        n_samples = x_test.shape[0]
+        return np.array([self.mean for _ in range(n_samples)])
 
     def __repr__(self):
         return "CountingClassifier(mean=%s)" % self.mean
diff --git a/miplearn/classifiers/cv.py b/miplearn/classifiers/cv.py
index e8c8b42..2743457 100644
--- a/miplearn/classifiers/cv.py
+++ b/miplearn/classifiers/cv.py
@@ -3,14 +3,14 @@
 #  Released under the modified BSD license. See COPYING.md for more details.
 
 import logging
-from copy import deepcopy
+from typing import Optional, Callable, List
 
 import numpy as np
 from sklearn.dummy import DummyClassifier
 from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import cross_val_score
 
-from miplearn.classifiers import Classifier
+from miplearn.classifiers import Classifier, ScikitLearnClassifier
 
 logger = logging.getLogger(__name__)
 
@@ -18,35 +18,58 @@ logger = logging.getLogger(__name__)
 class CrossValidatedClassifier(Classifier):
     """
     A meta-classifier that, upon training, evaluates the performance of another
-    classifier on the training data set using k-fold cross validation, then
-    either adopts the other classifier it if the cv-score is high enough, or
-    returns a constant label for every x_test otherwise.
+    candidate classifier on the training data set, using k-fold cross validation,
+    then either adopts it, if its cv-score is high enough, or returns constant
+    predictions for every x_test, otherwise.
 
-    The threshold is specified in comparison to a dummy classifier trained
-    on the same dataset. For example, a threshold of 0.0 indicates that any
-    classifier as good as the dummy predictor is acceptable. A threshold of 1.0
-    indicates that only classifier with a perfect cross-validation score are
-    acceptable. Other numbers are a linear interpolation of these two extremes.
+    Parameters
+    ----------
+    classifier: Callable[[], ScikitLearnClassifier]
+        A callable that constructs the candidate classifier.
+    threshold: float
+        Number from zero to one indicating how well the candidate classifier must
+        perform to be adopted. The threshold is specified in comparison to a dummy
+        classifier trained on the same dataset. For example, a threshold of 0.0
+        indicates that any classifier as good as the dummy predictor is acceptable. A
+        threshold of 1.0 indicates that only classifiers with perfect
+        cross-validation scores are acceptable. Other numbers are a linear
+        interpolation of these two extremes.
+    constant: Optional[List[bool]]
+        If the candidate classifier fails to meet the threshold, use a dummy classifier
+        which always returns this prediction instead. The list should have exactly as
+        many elements as the number of columns of `x_train` provided to `fit`.
+    cv: int
+        Number of folds.
+    scoring: str
+        Scoring function.
     """
 
     def __init__(
         self,
-        classifier=LogisticRegression(),
-        threshold=0.75,
-        constant=0.0,
-        cv=5,
-        scoring="accuracy",
+        classifier: Callable[[], ScikitLearnClassifier] = (
+            lambda: ScikitLearnClassifier(LogisticRegression())
+        ),
+        threshold: float = 0.75,
+        constant: Optional[List[bool]] = None,
+        cv: int = 5,
+        scoring: str = "accuracy",
     ):
+        """"""
         super().__init__()
-        self.classifier = None
-        self.classifier_prototype = classifier
-        self.constant = constant
+        if constant is None:
+            constant = [True, False]
+        self.n_classes = len(constant)
+        self.classifier: Optional[ScikitLearnClassifier] = None
+        self.classifier_factory = classifier
+        self.constant: List[bool] = constant
         self.threshold = threshold
         self.cv = cv
         self.scoring = scoring
 
-    def fit(self, x_train, y_train):
-        # super().fit(x_train, y_train)
+    def fit(self, x_train: np.ndarray, y_train: np.ndarray) -> None:
+        super().fit(x_train, y_train)
+        (n_samples, n_classes) = x_train.shape
+        assert n_classes == self.n_classes
 
         # Calculate dummy score and absolute score threshold
         y_train_avg = np.average(y_train)
@@ -54,13 +77,20 @@ class CrossValidatedClassifier(Classifier):
         absolute_threshold = 1.0 * self.threshold + dummy_score * (1 - self.threshold)
 
         # Calculate cross validation score and decide which classifier to use
-        clf = deepcopy(self.classifier_prototype)
+        clf = self.classifier_factory()
+        assert clf is not None
+        assert isinstance(clf, ScikitLearnClassifier), (
+            f"The provided classifier callable must return a ScikitLearnClassifier. "
+            f"Found {clf.__class__.__name__} instead. If this is a scikit-learn "
+            f"classifier, you must wrap it with ScikitLearnClassifier."
+        )
+
         cv_score = float(
             np.mean(
                 cross_val_score(
-                    clf,
+                    clf.inner_clf,
                     x_train,
-                    y_train,
+                    y_train[:, 1],
                     cv=self.cv,
                     scoring=self.scoring,
                 )
@@ -77,14 +107,19 @@ class CrossValidatedClassifier(Classifier):
                 "cv_score is below threshold (%.2f < %.2f); discarding"
                 % (cv_score, absolute_threshold)
             )
-            self.classifier = DummyClassifier(
-                strategy="constant",
-                constant=self.constant,
+            self.classifier = ScikitLearnClassifier(
+                DummyClassifier(
+                    strategy="constant",
+                    constant=self.constant[1],
+                )
             )
 
         # Train chosen classifier
+        assert self.classifier is not None
+        assert isinstance(self.classifier, ScikitLearnClassifier)
         self.classifier.fit(x_train, y_train)
 
-    def predict_proba(self, x_test):
-        # super().predict_proba(x_test)
+    def predict_proba(self, x_test: np.ndarray) -> np.ndarray:
+        super().predict_proba(x_test)
+        assert self.classifier is not None
         return self.classifier.predict_proba(x_test)
diff --git a/tests/classifiers/test_counting.py b/tests/classifiers/test_counting.py
index a8bbec8..9082754 100644
--- a/tests/classifiers/test_counting.py
+++ b/tests/classifiers/test_counting.py
@@ -12,7 +12,27 @@ E = 0.1
 
 def test_counting():
     clf = CountingClassifier()
-    clf.fit(np.zeros((8, 25)), [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0])
-    expected_proba = np.array([[0.375, 0.625], [0.375, 0.625]])
-    actual_proba = clf.predict_proba(np.zeros((2, 25)))
-    assert norm(actual_proba - expected_proba) < E
+    n_features = 25
+    x_train = np.zeros((8, n_features))
+    y_train = np.array(
+        [
+            [True, False, False],
+            [True, False, False],
+            [False, True, False],
+            [True, False, False],
+            [False, True, False],
+            [False, True, False],
+            [False, True, False],
+            [False, False, True],
+        ]
+    )
+    x_test = np.zeros((2, n_features))
+    y_expected = np.array(
+        [
+            [3 / 8.0, 4 / 8.0, 1 / 8.0],
+            [3 / 8.0, 4 / 8.0, 1 / 8.0],
+        ]
+    )
+    clf.fit(x_train, y_train)
+    y_actual = clf.predict_proba(x_test)
+    assert norm(y_actual - y_expected) < E
diff --git a/tests/classifiers/test_cv.py b/tests/classifiers/test_cv.py
index fa6baa6..4ba8f9b 100644
--- a/tests/classifiers/test_cv.py
+++ b/tests/classifiers/test_cv.py
@@ -7,20 +7,37 @@ from numpy.linalg import norm
 from sklearn.preprocessing import StandardScaler
 from sklearn.svm import SVC
 
+from miplearn.classifiers import ScikitLearnClassifier
 from miplearn.classifiers.cv import CrossValidatedClassifier
 
 E = 0.1
 
 
-def test_cv():
+def test_cv() -> None:
     # Training set: label is true if point is inside a 2D circle
-    x_train = np.array([[x1, x2] for x1 in range(-10, 11) for x2 in range(-10, 11)])
+    x_train = np.array(
+        [
+            [
+                x1,
+                x2,
+            ]
+            for x1 in range(-10, 11)
+            for x2 in range(-10, 11)
+        ]
+    )
     x_train = StandardScaler().fit_transform(x_train)
     n_samples = x_train.shape[0]
-
     y_train = np.array(
         [
-            1.0 if x1 * x1 + x2 * x2 <= 100 else 0.0
+            [
+                False,
+                True,
+            ]
+            if x1 * x1 + x2 * x2 <= 100
+            else [
+                True,
+                False,
+            ]
             for x1 in range(-10, 11)
             for x2 in range(-10, 11)
         ]
@@ -29,24 +46,39 @@ def test_cv():
     # Support vector machines with linear kernels do not perform well on this
     # data set, so predictor should return the given constant.
     clf = CrossValidatedClassifier(
-        classifier=SVC(probability=True, random_state=42),
+        classifier=lambda: ScikitLearnClassifier(
+            SVC(
+                probability=True,
+                random_state=42,
+            )
+        ),
         threshold=0.90,
-        constant=0.0,
+        constant=[True, False],
         cv=30,
     )
     clf.fit(x_train, y_train)
     proba = clf.predict_proba(x_train)
+    assert isinstance(proba, np.ndarray)
+    assert proba.shape == (n_samples, 2)
+
     y_pred = (proba[:, 1] > 0.5).astype(float)
     assert norm(np.zeros(n_samples) - y_pred) < E
 
     # Support vector machines with quadratic kernels perform almost perfectly
     # on this data set, so predictor should return their prediction.
     clf = CrossValidatedClassifier(
-        classifier=SVC(probability=True, kernel="poly", degree=2, random_state=42),
+        classifier=lambda: ScikitLearnClassifier(
+            SVC(
+                probability=True,
+                kernel="poly",
+                degree=2,
+                random_state=42,
+            )
+        ),
         threshold=0.90,
         cv=30,
     )
     clf.fit(x_train, y_train)
     proba = clf.predict_proba(x_train)
     y_pred = (proba[:, 1] > 0.5).astype(float)
-    assert norm(y_train - y_pred) < E
+    assert norm(y_train[:, 1] - y_pred) < E