diff --git a/Makefile b/Makefile
index aeb5739..1341104 100644
--- a/Makefile
+++ b/Makefile
@@ -20,10 +20,12 @@ dist-upload:
 	$(PYTHON) -m twine upload dist/*
 
 docs:
-	rm -rf ../docs/$(VERSION) html
-	mkdocs build -d ../docs/$(VERSION)/
-	pdoc3 --html miplearn
-	mv -v html ../docs/$(VERSION)/api
+# Build the Sphinx site under docs/ and publish it to ../docs/$(VERSION).
+# The sub-make steps are chained with && and run through $(MAKE) so that a
+# failing step aborts the recipe and -n / -j are passed down correctly.
+	$(if $(strip $(VERSION)),,$(error VERSION is empty; refusing to run rm -rf on ../docs/))
+	rm -rf ../docs/$(VERSION)
+	cd docs && $(MAKE) clean && $(MAKE) dirhtml
+	rsync -avP --delete-after docs/_build/dirhtml/ ../docs/$(VERSION)
 
 
 install-deps:
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 0000000..c987782
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,14 @@
+SPHINXOPTS    ?=
+SPHINXBUILD   ?= sphinx-build
+SOURCEDIR     = .
+BUILDDIR      = _build
+# First rule in the file, so a bare `make` runs Sphinx's "help" make-mode target.
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+# "Makefile" is phony, so targets that list it as a prerequisite always re-run.
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/_static/custom.css b/docs/_static/custom.css
new file mode 100644
index 0000000..d484dec
--- /dev/null
+++ b/docs/_static/custom.css
@@ -0,0 +1,7 @@
+h1.site-logo { /* site title heading; NOTE(review): presumably where the theme renders html_title - confirm */
+    font-size: 30px !important; /* !important presumably overrides the theme stylesheet - confirm */
+}
+
+h1.site-logo small { /* nested <small>; conf.py's html_title wraps the release number in one */
+    font-size: 20px !important;
+}
\ No newline at end of file
diff --git a/docs/about.md b/docs/about.md
index 0e862af..fb98131 100644
--- a/docs/about.md
+++ b/docs/about.md
@@ -1,15 +1,23 @@
+```{sectnum}
+---
+start: 4
+depth: 2
+suffix: .
+---
+```
+
 # About
 
-### Authors
+## Authors
 
 * **Alinson S. Xavier,** Argonne National Laboratory <<axavier@anl.gov>>
 * **Feng Qiu,** Argonne National Laboratory <<fqiu@anl.gov>>
 
-### Acknowledgments
+## Acknowledgments
 
 * Based upon work supported by **Laboratory Directed Research and Development** (LDRD) funding from Argonne National Laboratory, provided by the Director, Office of Science, of the U.S. Department of Energy under Contract No. DE-AC02-06CH11357, and the **U.S. Department of Energy Advanced Grid Modeling Program** under Grant DE-OE0000875.
 
-### References
+## References
 
 
 If you use MIPLearn in your research, or the included problem generators, we kindly request that you cite the package as follows:
@@ -20,7 +28,7 @@ If you use MIPLearn in the field of power systems optimization, we kindly reques
 
 * **Alinson S. Xavier, Feng Qiu, Shabbir Ahmed.** *Learning to Solve Large-Scale Unit Commitment Problems.* INFORMS Journal on Computing (2020). DOI: [10.1287/ijoc.2020.0976](https://doi.org/10.1287/ijoc.2020.0976)
 
-### License
+## License
 
 ```text
 MIPLearn, an extensible framework for Learning-Enhanced Mixed-Integer Optimization
diff --git a/docs/problems.md b/docs/benchmark.md
similarity index 99%
rename from docs/problems.md
rename to docs/benchmark.md
index 954aa6d..88b6db5 100644
--- a/docs/problems.md
+++ b/docs/benchmark.md
@@ -1,4 +1,12 @@
-# Benchmark Problems, Challenges and Results
+```{sectnum}
+---
+start: 2
+depth: 2
+suffix: .
+---
+```
+
+# Benchmarks
 
 MIPLearn provides a selection of benchmark problems and random instance generators, covering applications from different fields, that can be used to evaluate new learning-enhanced MIP techniques in a measurable and reproducible way. In this page, we describe these problems, the included instance generators, and we present some benchmark results for  `LearningSolver` with default parameters.
 
@@ -100,6 +108,7 @@ TravelingSalesmanGenerator(x=uniform(loc=0.0, scale=1000.0),
 
 Given a set of $n$ items and $m$ types of resources (also called *knapsacks*), the problem is to find a subset of items that maximizes profit without consuming more resources than it is available. More precisely, the problem is:
 
+$$
 \begin{align*}
     \text{maximize}
         & \sum_{j=1}^n p_j x_j
@@ -110,6 +119,7 @@ Given a set of $n$ items and $m$ types of resources (also called *knapsacks*), t
     & x_j \in \{0,1\}
         & \forall j=1,\ldots,n
 \end{align*}
+$$
 
 ### Random instance generator
 
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 0000000..630490d
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1,21 @@
+# Sphinx configuration for the MIPLearn documentation site.
+project = "MIPLearn"
+copyright = "2020-2021, UChicago Argonne, LLC"
+author = ""
+release = "0.2.0"
+extensions = ["myst_parser"]
+# benchmark.md writes math as $...$ and $$...$$. MyST only parses that syntax
+# when the "dollarmath" extension is enabled; otherwise it is emitted as text.
+myst_enable_extensions = ["dollarmath"]
+templates_path = ["_templates"]
+exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
+html_theme = "sphinx_book_theme"
+html_static_path = ["_static"]
+html_css_files = ["custom.css"]
+html_theme_options = {
+    "repository_url": "https://github.com/ANL-CEEESA/MIPLearn/",
+    "use_repository_button": True,
+    "extra_navbar": "",
+}
+# Title with the release number in a <small> tag (custom.css has a rule for `h1.site-logo small`).
+html_title = f"MIPLearn<br/><small>{release}</small>"
diff --git a/docs/css/custom.css b/docs/css/custom.css
deleted file mode 100644
index babb687..0000000
--- a/docs/css/custom.css
+++ /dev/null
@@ -1,32 +0,0 @@
-.navbar-default {
-  border-bottom: 0px;
-  background-color: #fff;
-  box-shadow: 0px 0px 15px rgba(0, 0, 0, 0.2);
-}
-
-a, .navbar-default a {
-  color: #06a !important;
-  font-weight: normal;
-}
-
-.disabled > a {
-  color: #999 !important;
-}
-
-.navbar-default a:hover,
-.navbar-default a:focus {
-  background-color: #f4f4f4 !important;
-}
-
-.navbar-default .active,
-.active > a {
-  background-color: #f0f0f0 !important;
-}
-
-.icon-bar {
-  background-color: #666 !important;
-}
-
-.navbar-collapse {
-  border-color: #fff !important;
-}
\ No newline at end of file
diff --git a/docs/customization.md b/docs/customization.md
index ef260f9..cc087fc 100644
--- a/docs/customization.md
+++ b/docs/customization.md
@@ -1,3 +1,11 @@
+```{sectnum}
+---
+start: 3
+depth: 2
+suffix: .
+---
+```
+
 # Customization
 
 ## Customizing solver parameters
diff --git a/docs/figures/benchmark_knapsack_a.png b/docs/figures/benchmark_knapsack_a.png
deleted file mode 120000
index a88935f..0000000
--- a/docs/figures/benchmark_knapsack_a.png
+++ /dev/null
@@ -1 +0,0 @@
-../../benchmark/knapsack/ChallengeA/performance.png
\ No newline at end of file
diff --git a/docs/figures/benchmark_stab_a.png b/docs/figures/benchmark_stab_a.png
deleted file mode 120000
index 073c5ec..0000000
--- a/docs/figures/benchmark_stab_a.png
+++ /dev/null
@@ -1 +0,0 @@
-../../benchmark/stab/ChallengeA/performance.png
\ No newline at end of file
diff --git a/docs/figures/benchmark_tsp_a.png b/docs/figures/benchmark_tsp_a.png
deleted file mode 120000
index 31a74c2..0000000
--- a/docs/figures/benchmark_tsp_a.png
+++ /dev/null
@@ -1 +0,0 @@
-../../benchmark/tsp/ChallengeA/performance.png
\ No newline at end of file
diff --git a/docs/index.md b/docs/index.md
index 6552324..04c428f 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -5,9 +5,9 @@
 Unlike pure ML methods, MIPLearn is not only able to find high-quality solutions to discrete optimization problems, but it can also prove the optimality and feasibility of these solutions.
 Unlike conventional MIP solvers, MIPLearn can take full advantage of very specific observations that happen to be true in a particular family of instances (such as the observation that a particular constraint is typically redundant, or that a particular variable typically assumes a certain value). 
 
-For certain classes of problems, this approach has been shown to provide significant performance benefits (see [benchmarks](problems.md) and [references](about.md)).
+For certain classes of problems, this approach has been shown to provide significant performance benefits (see [benchmarks](benchmark.md) and [references](about.md)).
 
-### Features
+## Features
 
 * **MIPLearn proposes a flexible problem specification format,** which allows users to describe their particular optimization problems to a Learning-Enhanced MIP solver, both from the MIP perspective and from the ML perspective, without making any assumptions on the problem being modeled, the mathematical formulation of the problem, or ML encoding. While the format is very flexible, some constraints are enforced to ensure that it is usable by an actual solver.
 
@@ -17,14 +17,19 @@ For certain classes of problems, this approach has been shown to provide signifi
 
 * **MIPLearn is customizable and extensible**. For MIP and ML researchers exploring new techniques to accelerate MIP performance based on historical data, each component of the reference solver can be individually replaced, extended or customized.
 
-### Documentation
+## Site contents
 
-* [Installation and typical usage](usage.md)
-* [Benchmark problems and results](problems.md)
-* [Customizing the solver](customization.md)
-* [License, authors, references and acknowledgments](about.md)
+```{toctree}
+---
+maxdepth: 2
+---
+usage.md
+benchmark.md
+customization.md
+about.md
+```
 
-### Source Code
+## Source Code
 
 * [https://github.com/ANL-CEEESA/MIPLearn](https://github.com/ANL-CEEESA/MIPLearn)
 
diff --git a/docs/js/mathjax.js b/docs/js/mathjax.js
deleted file mode 100644
index bfc06b8..0000000
--- a/docs/js/mathjax.js
+++ /dev/null
@@ -1,8 +0,0 @@
-MathJax.Hub.Config({
-  "tex2jax": { inlineMath: [ [ '$', '$' ] ] }
-});
-MathJax.Hub.Config({
-  config: ["MMLorHTML.js"],
-  jax: ["input/TeX", "output/HTML-CSS", "output/NativeMML"],
-  extensions: ["MathMenu.js", "MathZoom.js"]
-});
\ No newline at end of file
diff --git a/docs/usage.md b/docs/usage.md
index 93dab8c..9f761f4 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -1,6 +1,14 @@
-# Usage
+```{sectnum}
+---
+start: 1
+depth: 2
+suffix: .
+---
+```
+
+# Using MIPLearn
 
-## 1. Installation
+## Installation
 
 In these docs, we describe the Python/Pyomo version of the package, although a [Julia/JuMP version](https://github.com/ANL-CEEESA/MIPLearn.jl) is also available. A mixed-integer solver is also required and its Python bindings must be properly installed. Supported solvers are currently CPLEX, Gurobi and XPRESS.
 
@@ -17,7 +25,7 @@ as follows:
 import miplearn
 ```
 
-## 2. Using `LearningSolver`
+## Using `LearningSolver`
 
 The main class provided by this package is `LearningSolver`, a learning-enhanced MIP solver which uses information from previously solved instances to accelerate the solution of new instances. The following example shows its basic usage:
 
@@ -46,7 +54,7 @@ for instance in test_instances:
 In this example, we have two lists of user-provided instances: `training_instances` and `test_instances`. We start by solving all training instances. Since there is no historical information available at this point, the instances will be processed from scratch, with no ML acceleration. After solving each instance, the solver stores within each `instance` object the optimal solution, the optimal objective value, and other information that can be used to accelerate future solves. After all training instances are solved, we call `solver.fit(training_instances)`. This instructs the solver to train all its internal machine-learning models based on the solutions of the (solved) trained instances. Subsequent calls to `solver.solve(instance)` will automatically use the trained Machine Learning models to accelerate the solution process.
 
 
-## 3. Describing problem instances
+## Describing problem instances
 
 Instances to be solved by `LearningSolver` must derive from the abstract class `miplearn.Instance`. The following three abstract methods must be implemented:
 
@@ -61,13 +69,13 @@ An optional method which can be implemented is `instance.get_variable_category(v
 It is not necessary to have a one-to-one correspondence between features and problem instances. One important (and deliberate) limitation of MIPLearn, however, is that `get_instance_features()` must always return arrays of same length for all relevant instances of the problem. Similarly, `get_variable_features(var_name, index)` must also always return arrays of same length for all variables in each category. It is up to the user to decide how to encode variable-length characteristics of the problem into fixed-length vectors. In graph problems, for example, graph embeddings can be used to reduce the (variable-length) lists of nodes and edges into a fixed-length structure that still preserves some properties of the graph. Different instance encodings may have significant impact on performance.
 
 
-## 4. Describing lazy constraints
+## Describing lazy constraints
 
 For many MIP formulations, it is not desirable to add all constraints up-front, either because the total number of constraints is very large, or because some of the constraints, even in relatively small numbers, can still cause significant performance impact when added to the formulation. In these situations, it may be desirable to generate and add constraints incrementaly, during the solution process itself. Conventional MIP solvers typically start by solving the problem without any lazy constraints. Whenever a candidate solution is found, the solver finds all violated lazy constraints and adds them to the formulation. MIPLearn significantly accelerates this process by using ML to predict which lazy constraints should be enforced from the very beginning of the optimization process, even before a candidate solution is available.
 
 MIPLearn supports two types of lazy constraints: through constraint annotations and through callbacks.
 
-### 4.1 Adding lazy constraints through annotations
+### Adding lazy constraints through annotations
 
 The easiest way to create lazy constraints in MIPLearn is to add them to the model (just like any regular constraints), then annotate them as lazy, as described below. Just before the optimization starts, MIPLearn removes all lazy constraints from the model and places them in a lazy constraint pool. If any trained ML models are available, MIPLearn queries these models to decide which of these constraints should be moved back into the formulation. After this step, the optimization starts, and lazy constraints from the pool are added to the model in the conventional fashion.
 
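@@ -84,7 +92,8 @@ An additional method that can be implemented is `get_lazy_constraint_category(ci
-!!! warning
-    If two lazy constraints belong to the same category, their feature vectors should have the same length.
+```{warning}
+If two lazy constraints belong to the same category, their feature vectors should have the same length.
+```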
 
-### 4.2 Adding lazy constraints through callbacks
+### Adding lazy constraints through callbacks
 
 Although convenient, the method described in the previous subsection still requires the generation of all lazy constraints ahead of time, which can be prohibitively expensive. An alternative method is through a lazy constraint callbacks, described below. During the solution process, MIPLearn will repeatedly call a user-provided function to identify any violated lazy constraints. If violated constraints are identified, MIPLearn will additionally call another user-provided function to generate the constraint and add it to the formulation.
 
@@ -101,23 +109,24 @@ Assuming that trained ML models are available, immediately after calling `solver
 
 After the optimization process starts, MIPLearn will periodically call `find_violated_lazy_constraints` to verify if the current solution violates any lazy constraints. If any violated lazy constraints are found, MIPLearn will call the method `build_violated_lazy_constraints` and add the returned constraints to the formulation.
 
-!!! note
-    When implementing `find_violated_lazy_constraints(self, model)`, the current solution may be accessed through `self.solution[var_name][index]`.
-
+```{tip}
+When implementing `find_violated_lazy_constraints(self, model)`, the current solution may be accessed through `self.solution[var_name][index]`.
+```
 
-## 5. Obtaining heuristic solutions
+## Obtaining heuristic solutions
 
 By default, `LearningSolver` uses Machine Learning to accelerate the MIP solution process, while maintaining all optimality guarantees provided by the MIP solver. In the default mode of operation, for example, predicted optimal solutions are used only as MIP starts.
 
-For more significant performance benefits, `LearningSolver` can also be configured to place additional trust in the Machine Learning predictors, by using the `mode="heuristic"` constructor argument. When operating in this mode, if a ML model is statistically shown (through *stratified k-fold cross validation*) to have exceptionally high accuracy, the solver may decide to restrict the search space based on its predictions. The parts of the solution which the ML models cannot predict accurately will still be explored using traditional (branch-and-bound) methods.  For particular applications, this mode has been shown to quickly produce optimal or near-optimal solutions (see [references](about.md#references) and [benchmark results](problems.md)).
+For more significant performance benefits, `LearningSolver` can also be configured to place additional trust in the Machine Learning predictors, by using the `mode="heuristic"` constructor argument. When operating in this mode, if an ML model is statistically shown (through *stratified k-fold cross validation*) to have exceptionally high accuracy, the solver may decide to restrict the search space based on its predictions. The parts of the solution which the ML models cannot predict accurately will still be explored using traditional (branch-and-bound) methods. For particular applications, this mode has been shown to quickly produce optimal or near-optimal solutions (see [references](about.md#references) and [benchmark results](benchmark.md)).
 
 
-!!! danger
-    The `heuristic` mode provides no optimality guarantees, and therefore should only be used if the solver is first trained on a large and representative set of training instances. Training on a small or non-representative set of instances may produce low-quality solutions, or make the solver incorrectly classify new instances as infeasible.
+```{danger}
+The `heuristic` mode provides no optimality guarantees, and therefore should only be used if the solver is first trained on a large and representative set of training instances. Training on a small or non-representative set of instances may produce low-quality solutions, or make the solver incorrectly classify new instances as infeasible.
+```
 
-## 6. Scaling Up
+## Scaling Up
 
-### 6.1 Saving and loading solver state
+### Saving and loading solver state
 
 After solving a large number of training instances, it may be desirable to save the current state of `LearningSolver` to disk, so that the solver can still use the acquired knowledge after the application restarts. This can be accomplished by using the the utility functions `write_pickle_gz` and `read_pickle_gz`, as the following example illustrates:
 
@@ -148,7 +157,7 @@ for instance in test_instances:
 ```
 
 
-### 6.2 Solving instances in parallel
+### Solving instances in parallel
 
 In many situations, instances can be solved in parallel to accelerate the training process. `LearningSolver` provides the method `parallel_solve(instances)` to easily achieve this:
 
@@ -166,7 +175,7 @@ solver.parallel_solve(test_instances)
 ```
 
 
-### 6.3 Solving instances from the disk
+### Solving instances from the disk
 
 In all examples above, we have assumed that instances are available as Python objects, stored in memory. When problem instances are very large, or when there is a large number of problem instances, this approach may require an excessive amount of memory. To reduce memory requirements, MIPLearn can also operate on instances that are stored on disk, through the `PickleGzInstance` class, as the next example illustrates.
 
@@ -203,7 +212,7 @@ solver.parallel_solve(test_instances, n_jobs=4)
 
 By default, `solve` and `parallel_solve` modify files in place. That is, after the instances are loaded from disk and solved, MIPLearn writes them back to the disk, overwriting the original files. To discard the modifications instead, use `LearningSolver(..., discard_outputs=True)`. This can be useful, for example, during benchmarks.
 
-## 7. Running benchmarks
+## Running benchmarks
 
 MIPLearn provides the utility class `BenchmarkRunner`, which simplifies the task of comparing the performance of different solvers. The snippet below shows its basic usage:
 
@@ -232,6 +241,6 @@ benchmark.write_csv("results.csv")
 
 The method `fit` trains the ML models for each individual solver. The method `parallel_solve` solves the test instances in parallel, and collects solver statistics such as running time and optimal value. Finally, `write_csv` produces a table of results. The columns in the CSV file depend on the components added to the solver.
 
-## 8. Current Limitations
+## Current Limitations
 
 * Only binary and continuous decision variables are currently supported. General integer variables are not currently supported by some solver components.
diff --git a/mkdocs.yml b/mkdocs.yml
deleted file mode 100644
index df6410f..0000000
--- a/mkdocs.yml
+++ /dev/null
@@ -1,24 +0,0 @@
-site_name: MIPLearn
-theme: cinder
-copyright: "Copyright © 2020, UChicago Argonne, LLC. All Rights Reserved."
-repo_url: https://github.com/ANL-CEEESA/MIPLearn
-edit_uri: edit/dev/docs/
-nav:
-    - Home: index.md
-    - Usage: usage.md
-    - Problems: problems.md
-    - Customization: customization.md
-    - About: about.md
-    - API: api/miplearn/index.html
-plugins:
-  - search
-markdown_extensions:
-  - admonition  
-  - mdx_math  
-extra_javascript: 
-    - https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0/MathJax.js?config=TeX-AMS-MML_HTMLorMML
-    - js/mathjax.js
-docs_dir: docs
-site_dir: build/docs
-extra_css:
-    - "css/custom.css"
diff --git a/requirements.txt b/requirements.txt
index b7a3c3f..aefbcb6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1 @@
--e .[build]
+-e .[dev]
diff --git a/setup.py b/setup.py
index a5ff3b4..a25ffee 100644
--- a/setup.py
+++ b/setup.py
@@ -37,12 +37,13 @@ setup(
     extras_require={
         "dev": [
             "docopt>=0.6,<0.7",
-            "mkdocs>=1,<2",
-            "mkdocs-cinder>=1,<2",
             "black==20.8b1",
             "pre-commit>=2,<3",
             "pdoc3>=0.7,<0.8",
             "twine>=3,<4",
+            "Sphinx>=3,<4",
+            "sphinx-book-theme==0.1.0",
+            "myst-parser==0.14.0",
         ]
     },
 )