From bb263ee9dd43c3cacce93cd688647550492a107e Mon Sep 17 00:00:00 2001
From: Eli
Date: Thu, 15 Feb 2024 13:29:17 -0500
Subject: [PATCH 1/6] Add a simple PyTorch training example

---
 examples/svi_torch.py         | 108 ++++++++++++++++++++++++++++++++++
 tutorial/source/index.rst     |   4 +-
 tutorial/source/svi_torch.rst |  15 +++++
 3 files changed, 126 insertions(+), 1 deletion(-)
 create mode 100644 examples/svi_torch.py
 create mode 100644 tutorial/source/svi_torch.rst

diff --git a/examples/svi_torch.py b/examples/svi_torch.py
new file mode 100644
index 0000000000..fb05130105
--- /dev/null
+++ b/examples/svi_torch.py
@@ -0,0 +1,108 @@
+# Copyright Contributors to the Pyro project.
+# SPDX-License-Identifier: Apache-2.0
+
+# Using vanilla PyTorch to perform optimization in SVI.
+#
+# This tutorial demonstrates how to use standard PyTorch optimizers, dataloaders, and training loops
+# to perform optimization in SVI. This is useful when you want to use custom optimizers,
+# learning rate schedules, dataloaders, or other advanced training techniques,
+# or just to simplify integration with other elements of the PyTorch ecosystem.
+
+from typing import Callable
+import argparse
+
+import torch
+
+import pyro
+import pyro.distributions as dist
+from pyro.infer import Trace_ELBO
+from pyro.infer.autoguide import AutoNormal
+from pyro.nn import PyroModule
+
+
+# We define a model as usual. This model is data parallel and supports subsampling.
+class Model(PyroModule):
+    def __init__(self, size):
+        super().__init__()
+        self.size = size
+        # We register a buffer for a constant scalar tensor to represent zero.
+        # This is useful for making priors that do not depend on inputs
+        # or learnable parameters compatible with the Module.to() method
+        # for setting the device or dtype of a module and its parameters.
+        self.register_buffer("zero", torch.tensor(0.0))
+
+    def forward(self, covariates, data=None):
+        # Sample parameters from priors that make use of the zero buffer trick
+        coeff = pyro.sample("coeff", dist.Normal(self.zero, 1))
+        bias = pyro.sample("bias", dist.Normal(self.zero, 1))
+        scale = pyro.sample("scale", dist.LogNormal(self.zero, 1))
+
+        # Since we'll use a PyTorch dataloader during training, we need to
+        # manually pass minibatches of (covariates, data) that are smaller than
+        # the full self.size, rather than relying on pyro.plate to automatically subsample.
+        with pyro.plate("data", self.size, len(covariates)):
+            loc = bias + coeff * covariates
+            return pyro.sample("obs", dist.Normal(loc, scale), obs=data)
+
+
+def main(args):
+    # Make PyroModule parameters local (like ordinary torch.nn.Parameters),
+    # rather than shared by name through Pyro's global parameter store.
+    # This is highly recommended whenever models can be written without pyro.param().
+    pyro.settings.set(module_local_params=True)
+
+    # set seed for reproducibility
+    pyro.set_rng_seed(args.seed)
+
+    # Create a synthetic dataset from a randomly initialized model.
+    with torch.no_grad():
+        covariates = torch.randn(args.size)
+        data = Model(args.size)(covariates)
+
+    # Create a model and a guide, both as (Pyro)Modules.
+    model: torch.nn.Module = Model(args.size)
+    guide: torch.nn.Module = AutoNormal(model)
+
+    # Create a loss function as a Module that includes model and guide parameters.
+    # All Pyro ELBO estimators can be called with a (model, guide) pair as arguments
+    # to return a loss function Module that takes the same arguments as the model and guide
+    # and exposes all of their torch.nn.Parameters and pyro.nn.PyroParam parameters.
+    elbo: Callable[[torch.nn.Module, torch.nn.Module], torch.nn.Module] = Trace_ELBO()
+    loss_fn: torch.nn.Module = elbo(model, guide)
+
+    # Create a dataloader.
+    dataset = torch.utils.data.TensorDataset(covariates, data)
+    dataloader = torch.utils.data.DataLoader(dataset, batch_size=args.batch_size)
+
+    # All relevant parameters need to be initialized before an optimizer can be created.
+    # Since we used an AutoNormal guide, our parameters have not been initialized yet.
+    # Therefore we initialize the model and guide by running one mini-batch through the loss.
+    mini_batch = dataset[: args.batch_size]
+    loss_fn(*mini_batch)
+
+    # Create a PyTorch optimizer for the parameters of the model and guide in loss_fn.
+    optimizer = torch.optim.Adam(loss_fn.parameters(), lr=args.learning_rate)
+
+    # Run stochastic variational inference using PyTorch optimizers from torch.optim
+    for epoch in range(args.max_epochs):
+        for batch in dataloader:
+            optimizer.zero_grad()
+            loss = loss_fn(*batch)
+            loss.backward()
+            optimizer.step()
+        print(f"epoch {epoch} loss = {loss}")
+
+
+if __name__ == "__main__":
+    assert pyro.__version__.startswith("1.8.6")
+    parser = argparse.ArgumentParser(
+        description="Using vanilla PyTorch to perform optimization in SVI"
+    )
+    parser.add_argument("--size", default=10000, type=int)
+    parser.add_argument("--batch_size", default=100, type=int)
+    parser.add_argument("--learning_rate", default=0.01, type=float)
+    parser.add_argument("--seed", default=20200723, type=int)
+    # pl.Trainer arguments.
+    parser.add_argument("--max_epochs", default=10, type=int)
+    args = parser.parse_args()
+    main(args)
diff --git a/tutorial/source/index.rst b/tutorial/source/index.rst
index 442cb74878..1320c63283 100644
--- a/tutorial/source/index.rst
+++ b/tutorial/source/index.rst
@@ -22,7 +22,8 @@
 and look carefully through the series :ref:`practical-pyro-and-pytorch`,
 especially the :doc:`first Bayesian regression tutorial <bayesian_regression>`.
 This tutorial goes step-by-step through solving a simple Bayesian machine learning problem with Pyro,
 grounding the concepts from the introductory tutorials in runnable code.
-Industry users interested in serving predictions from a trained model in C++ should also read :doc:`the PyroModule tutorial <modules>`.
+Users interested in integrating with existing PyTorch training and serving infrastructure or serving predictions from a trained model in C++ should also read :doc:`the PyroModule tutorial <modules>`
+and look at the :doc:`SVI with PyTorch <svi_torch>` and :doc:`SVI with Lightning <svi_lightning>` examples.
 Most users who reach this point will also find our :doc:`guide to tensor shapes in Pyro <tensor_shapes>` essential reading.
 Pyro makes extensive use of the behavior of `"array broadcasting" <https://numpy.org/doc/stable/user/basics.broadcasting.html>`_
@@ -95,6 +96,7 @@ List of Tutorials
    workflow
    prior_predictive
    jit
+   svi_torch
    svi_horovod
    svi_lightning
    svi_flow_guide
diff --git a/tutorial/source/svi_torch.rst b/tutorial/source/svi_torch.rst
new file mode 100644
index 0000000000..f916d41e4e
--- /dev/null
+++ b/tutorial/source/svi_torch.rst
@@ -0,0 +1,15 @@
+Example: using vanilla PyTorch to perform optimization in SVI
+=============================================================
+
+This script uses argparse arguments to construct a PyTorch optimizer and dataloader, for example::
+
+    $ python examples/svi_torch.py --size 10000 --batch_size 100 --max_epochs 100
+
+`View svi_torch.py on github`__
+
+.. _github: https://github.com/pyro-ppl/pyro/blob/dev/examples/svi_torch.py
+
+__ github_
+
+.. literalinclude:: ../../examples/svi_torch.py
+   :language: python

From af4af909b4c8e1ef31962c8af5065449a98b3293 Mon Sep 17 00:00:00 2001
From: Eli
Date: Thu, 15 Feb 2024 13:33:40 -0500
Subject: [PATCH 2/6] isort

---
 examples/svi_torch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/svi_torch.py b/examples/svi_torch.py
index fb05130105..de756286ee 100644
--- a/examples/svi_torch.py
+++ b/examples/svi_torch.py
@@ -8,8 +8,8 @@
 # learning rate schedules, dataloaders, or other advanced training techniques,
 # or just to simplify integration with other elements of the PyTorch ecosystem.
 
-from typing import Callable
 import argparse
+from typing import Callable
 
 import torch

From 5839685253f9fd765aa2c897a9de34899b1dc92a Mon Sep 17 00:00:00 2001
From: Eli
Date: Thu, 15 Feb 2024 13:37:07 -0500
Subject: [PATCH 3/6] tweak intro

---
 tutorial/source/index.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tutorial/source/index.rst b/tutorial/source/index.rst
index 1320c63283..0c4f0153c2 100644
--- a/tutorial/source/index.rst
+++ b/tutorial/source/index.rst
@@ -22,7 +22,7 @@
 and look carefully through the series :ref:`practical-pyro-and-pytorch`,
 especially the :doc:`first Bayesian regression tutorial <bayesian_regression>`.
 This tutorial goes step-by-step through solving a simple Bayesian machine learning problem with Pyro,
 grounding the concepts from the introductory tutorials in runnable code.
-Users interested in integrating with existing PyTorch training and serving infrastructure or serving predictions from a trained model in C++ should also read :doc:`the PyroModule tutorial <modules>`
+Users interested in integrating with existing PyTorch training and serving infrastructure should also read :doc:`the PyroModule tutorial <modules>`
 and look at the :doc:`SVI with PyTorch <svi_torch>` and :doc:`SVI with Lightning <svi_lightning>` examples.
 Most users who reach this point will also find our :doc:`guide to tensor shapes in Pyro <tensor_shapes>` essential reading.

From 56bc6629518e23b5244f9d4964c773957098dca7 Mon Sep 17 00:00:00 2001
From: Eli
Date: Thu, 15 Feb 2024 13:45:56 -0500
Subject: [PATCH 4/6] add cuda and add to test_examples

---
 examples/svi_torch.py         | 7 ++++---
 tests/test_examples.py        | 2 ++
 tutorial/source/svi_torch.rst | 2 +-
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/examples/svi_torch.py b/examples/svi_torch.py
index de756286ee..f791dac24c 100644
--- a/examples/svi_torch.py
+++ b/examples/svi_torch.py
@@ -69,6 +69,7 @@
     # and exposes all of their torch.nn.Parameters and pyro.nn.PyroParam parameters.
     elbo: Callable[[torch.nn.Module, torch.nn.Module], torch.nn.Module] = Trace_ELBO()
     loss_fn: torch.nn.Module = elbo(model, guide)
+    loss_fn.to(device=torch.device("cuda" if args.cuda else "cpu"))
 
     # Create a dataloader.
     dataset = torch.utils.data.TensorDataset(covariates, data)
     dataloader = torch.utils.data.DataLoader(dataset, batch_size=args.batch_size)
@@ -84,7 +85,7 @@
     optimizer = torch.optim.Adam(loss_fn.parameters(), lr=args.learning_rate)
 
     # Run stochastic variational inference using PyTorch optimizers from torch.optim
-    for epoch in range(args.max_epochs):
+    for epoch in range(args.num_epochs):
         for batch in dataloader:
             optimizer.zero_grad()
             loss = loss_fn(*batch)
             loss.backward()
             optimizer.step()
@@ -102,7 +103,7 @@
     parser.add_argument("--batch_size", default=100, type=int)
     parser.add_argument("--learning_rate", default=0.01, type=float)
     parser.add_argument("--seed", default=20200723, type=int)
-    # pl.Trainer arguments.
-    parser.add_argument("--max_epochs", default=10, type=int)
+    parser.add_argument("--num_epochs", default=10, type=int)
+    parser.add_argument("--cuda", action="store_true", default=False)
     args = parser.parse_args()
     main(args)
diff --git a/tests/test_examples.py b/tests/test_examples.py
index 8e62a7f770..731665cd72 100644
--- a/tests/test_examples.py
+++ b/tests/test_examples.py
@@ -110,6 +110,7 @@
     "sparse_gamma_def.py --num-epochs=2 --eval-particles=2 --eval-frequency=1 --guide custom",
     "sparse_gamma_def.py --num-epochs=2 --eval-particles=2 --eval-frequency=1 --guide auto",
     "sparse_gamma_def.py --num-epochs=2 --eval-particles=2 --eval-frequency=1 --guide easy",
+    "svi_torch.py --num-epochs=2 --size=400",
     "svi_horovod.py --num-epochs=2 --size=400 --no-horovod",
     pytest.param(
         "svi_lightning.py --max_epochs=2 --size=400 --accelerator cpu --devices 1",
@@ -181,6 +182,7 @@
     "sir_hmc.py -t=2 -w=2 -n=4 -d=2 -m=1 --enum --cuda",
     "sir_hmc.py -t=2 -w=2 -n=4 -d=2 -p=10000 --sequential --cuda",
     "sir_hmc.py -t=2 -w=2 -n=4 -d=100 -p=10000 --cuda",
+    "svi_torch.py --num-epochs=2 --size=400 --cuda",
     "svi_horovod.py --num-epochs=2 --size=400 --cuda --no-horovod",
     pytest.param(
         "svi_lightning.py --max_epochs=2 --size=400 --accelerator gpu --devices 1",
diff --git a/tutorial/source/svi_torch.rst b/tutorial/source/svi_torch.rst
index f916d41e4e..559e75c24b 100644
--- a/tutorial/source/svi_torch.rst
+++ b/tutorial/source/svi_torch.rst
@@ -3,7 +3,7 @@ Example: using vanilla PyTorch to perform optimization in SVI
 
 This script uses argparse arguments to construct a PyTorch optimizer and dataloader, for example::
 
-    $ python examples/svi_torch.py --size 10000 --batch_size 100 --max_epochs 100
+    $ python examples/svi_torch.py --size 10000 --batch_size 100 --num-epochs 100
 
 `View svi_torch.py on github`__

From 1610e919185acd4bb485405a14972fb28e7031a6 Mon Sep 17 00:00:00 2001
From: Eli
Date: Thu, 15 Feb 2024 13:47:50 -0500
Subject: [PATCH 5/6] fix cuda

---
 examples/svi_torch.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/examples/svi_torch.py b/examples/svi_torch.py
index f791dac24c..2ef0143bd4 100644
--- a/examples/svi_torch.py
+++ b/examples/svi_torch.py
@@ -58,6 +58,8 @@
     with torch.no_grad():
         covariates = torch.randn(args.size)
         data = Model(args.size)(covariates)
+    covariates = covariates.to(device=torch.device("cuda" if args.cuda else "cpu"))
+    data = data.to(device=torch.device("cuda" if args.cuda else "cpu"))
 
     # Create a model and a guide, both as (Pyro)Modules.
     model: torch.nn.Module = Model(args.size)
     guide: torch.nn.Module = AutoNormal(model)

From 93c264a304a2344b6d9d59b8e73ed6911ba56712 Mon Sep 17 00:00:00 2001
From: Eli
Date: Thu, 15 Feb 2024 13:50:20 -0500
Subject: [PATCH 6/6] arg consistency

---
 examples/svi_torch.py         | 6 +++---
 tutorial/source/svi_torch.rst | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/svi_torch.py b/examples/svi_torch.py
index 2ef0143bd4..5cf17393aa 100644
--- a/examples/svi_torch.py
+++ b/examples/svi_torch.py
@@ -102,10 +102,10 @@
         description="Using vanilla PyTorch to perform optimization in SVI"
     )
     parser.add_argument("--size", default=10000, type=int)
-    parser.add_argument("--batch_size", default=100, type=int)
-    parser.add_argument("--learning_rate", default=0.01, type=float)
+    parser.add_argument("--batch-size", default=100, type=int)
+    parser.add_argument("--learning-rate", default=0.01, type=float)
     parser.add_argument("--seed", default=20200723, type=int)
-    parser.add_argument("--num_epochs", default=10, type=int)
+    parser.add_argument("--num-epochs", default=10, type=int)
     parser.add_argument("--cuda", action="store_true", default=False)
     args = parser.parse_args()
     main(args)
diff --git a/tutorial/source/svi_torch.rst b/tutorial/source/svi_torch.rst
index 559e75c24b..a0f74718fb 100644
--- a/tutorial/source/svi_torch.rst
+++ b/tutorial/source/svi_torch.rst
@@ -3,7 +3,7 @@ Example: using vanilla PyTorch to perform optimization in SVI
 
 This script uses argparse arguments to construct a PyTorch optimizer and dataloader, for example::
 
-    $ python examples/svi_torch.py --size 10000 --batch_size 100 --num-epochs 100
+    $ python examples/svi_torch.py --size 10000 --batch-size 100 --num-epochs 100
 
 `View svi_torch.py on github`__
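
The loop in examples/svi_torch.py uses a fixed learning rate, but because the ELBO loss is an ordinary torch.nn.Module and the optimizer is an ordinary torch.optim.Adam, any scheduler from torch.optim.lr_scheduler can be dropped in. A minimal sketch of this, assuming the Model class, dataset, dataloader, and args from examples/svi_torch.py above are in scope; the StepLR settings are illustrative, not part of the example::

    # Sketch: the same SVI training loop as examples/svi_torch.py, extended
    # with a PyTorch learning-rate scheduler. Model, dataset, dataloader, and
    # args are assumed to be defined exactly as in the example above.
    import torch
    import pyro
    from pyro.infer import Trace_ELBO
    from pyro.infer.autoguide import AutoNormal

    pyro.settings.set(module_local_params=True)
    model = Model(args.size)
    guide = AutoNormal(model)
    loss_fn = Trace_ELBO()(model, guide)  # a torch.nn.Module exposing all parameters
    loss_fn(*dataset[: args.batch_size])  # run one mini-batch to initialize parameters

    optimizer = torch.optim.Adam(loss_fn.parameters(), lr=args.learning_rate)
    # Illustrative choice: halve the learning rate every 5 epochs.
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

    for epoch in range(args.num_epochs):
        for batch in dataloader:
            optimizer.zero_grad()
            loss_fn(*batch).backward()
            optimizer.step()
        scheduler.step()  # advance the schedule once per epoch

Stepping the scheduler once per epoch, outside the inner minibatch loop, matches the usual usage of torch.optim.lr_scheduler classes.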