diff --git a/Manifest.toml b/Manifest.toml index e8b64ee..403d937 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -29,9 +29,9 @@ version = "0.5.1" [[CMake]] deps = ["BinDeps"] -git-tree-sha1 = "c67a8689dc5444adc5eb2be7d837100340ecba11" +git-tree-sha1 = "50a8b41d2c562fccd9ab841085fc7d1e2706da82" uuid = "631607c0-34d2-5d66-819e-eb0f9aa2061a" -version = "1.1.2" +version = "1.2.0" [[CMakeWrapper]] deps = ["BinDeps", "CMake", "Libdl", "Parameters", "Test"] @@ -65,9 +65,9 @@ version = "2.2.0" [[Conda]] deps = ["JSON", "VersionParsing"] -git-tree-sha1 = "9a11d428dcdc425072af4aea19ab1e8c3e01c032" +git-tree-sha1 = "7a58bb32ce5d85f8bf7559aa7c2842f9aecf52fc" uuid = "8f4d0f93-b110-5947-807f-2305c1781a2d" -version = "1.3.0" +version = "1.4.1" [[DataAPI]] git-tree-sha1 = "674b67f344687a88310213ddfa8a2b3c76cc4252" @@ -102,9 +102,9 @@ version = "1.0.2" [[DiffRules]] deps = ["NaNMath", "Random", "SpecialFunctions"] -git-tree-sha1 = "10dca52cf6d4a62d82528262921daf63b99704a2" +git-tree-sha1 = "eb0c34204c8410888844ada5359ac8b96292cfd1" uuid = "b552c78f-8df3-52c6-915a-8e097449b14b" -version = "1.0.0" +version = "1.0.1" [[Distributed]] deps = ["Random", "Serialization", "Sockets"] @@ -118,18 +118,18 @@ version = "0.22.4" [[FileIO]] deps = ["Pkg"] -git-tree-sha1 = "74585bf1f7ed7259e166011e89f49363d7fa89a6" +git-tree-sha1 = "2c84c57aced468fa21763c66d3bef33adcd09ec7" uuid = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549" -version = "1.2.1" +version = "1.2.2" [[FileWatching]] uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee" [[FillArrays]] deps = ["LinearAlgebra", "Random", "SparseArrays"] -git-tree-sha1 = "fec413d4fc547992eb62a5c544cedb6d7853c1f5" +git-tree-sha1 = "85c6b57e2680fa28d5c8adc798967377646fbf66" uuid = "1a297f60-69ca-5386-bcde-b61e274b549b" -version = "0.8.4" +version = "0.8.5" [[FixedPointNumbers]] git-tree-sha1 = "4aaea64dd0c30ad79037084f8ca2b94348e65eaa" @@ -162,7 +162,7 @@ version = "0.5.1" [[Gen]] deps = ["DataStructures", "Distributions", "FunctionalCollections", "JSON", 
"LinearAlgebra", "MacroTools", "Random", "ReverseDiff", "SpecialFunctions"] -git-tree-sha1 = "8b1686eca997088b31541e4a2b92d479bf2de24d" +git-tree-sha1 = "677426251655bfbf71f69842c2dc02784fea3683" repo-rev = "master" repo-url = "https://github.com/probcomp/Gen" uuid = "ea4f424c-a589-11e8-07c0-fd5c91b9da4a" @@ -357,9 +357,9 @@ version = "1.0.1" [[ReverseDiff]] deps = ["DiffResults", "DiffRules", "ForwardDiff", "FunctionWrappers", "LinearAlgebra", "NaNMath", "Random", "SpecialFunctions", "StaticArrays", "Statistics"] -git-tree-sha1 = "6a5aae862f93c577f89a607334f09630439ec119" +git-tree-sha1 = "5f7dafd314ff2ada3076797b1edfb71e52151cb9" uuid = "37e2e3b7-166d-5795-8a7a-e32c996b4267" -version = "1.0.0" +version = "1.1.0" [[Rmath]] deps = ["BinaryProvider", "Libdl", "Random", "Statistics"] @@ -420,9 +420,9 @@ version = "0.32.0" [[StatsFuns]] deps = ["Rmath", "SpecialFunctions"] -git-tree-sha1 = "79982835d2ff3970685cb704500909c94189bde9" +git-tree-sha1 = "f290ddd5fdedeadd10e961eb3f4d3340f09d030a" uuid = "4c63d2b9-4356-54db-8cca-17b64c39e42c" -version = "0.9.3" +version = "0.9.4" [[SuiteSparse]] deps = ["Libdl", "LinearAlgebra", "SparseArrays"] diff --git a/tutorials/Data-Driven Proposals in Gen.jl b/tutorials/Data-Driven Proposals in Gen.jl index f6d64b4..6d9040e 100644 --- a/tutorials/Data-Driven Proposals in Gen.jl +++ b/tutorials/Data-Driven Proposals in Gen.jl @@ -14,13 +14,43 @@ # # Tutorial: Data-Driven Proposals in Gen # -# This tutorial introduces you to an important inference programming feature in Gen --- using custom "data-driven" proposals to accelerate Monte Carlo inference. Data-driven proposals use information in the observed data set to choose the proposal distibution for latent variables in a generative model. Data-driven proposals can have trainable parameters that are trained data simulated from a generative model to improve their efficacy. This training process is sometimes called 'amortized inference' or 'inference compilation'. 
-# -# We focus on using data-driven proposals with importance sampling, which is one of the simpler classes of Monte Carlo inference algorithms. Data-driven proposals can also be used with Markov Chain Monte Carlo (MCMC) and sequential Monte Carlo (SMC), but these are not covered in this tutorial. -# -# This tutorial builds on a probabilistic model for the motion of an autonomous agent that was introduced in the [introduction to modeling tutorial](https://github.com/probcomp/gen-examples/tree/master/tutorial-modeling-intro). We show that we can improve the efficiency of inference in this model using two types of custom proposals for the destination of the agent: First, we write a hand-coded data-driven proposal with a single parameter that we tune using amortized inference. Second, we write a data-driven proposal based on a deep neural network, which we also train using amortized inference. We show an implementation of the neural-network based proposal using the built-in modeling language, and then an implementation using the TensorFlow modeling DSL. -# -# Data-driven proposals that are trained on simulated data can be seen from two perspectives: For practitioners of inference in generative models, they can be seen learning a proposal distribution that approximates the conditional distribution on latent variables given observations. Practitioners of supervised machine learning may view data-driven proposals as discriminative models trained on simulated data. By writing the data simulator in a probabilistic programming language, we can embed the probabilistic discriminative model in Monte Carlo algorithms, which can assist with robustness and accuracy. This is an active area of research. +# This tutorial introduces you to an important inference programming feature in +# Gen --- using custom "data-driven" proposals to accelerate Monte Carlo +# inference. 
Data-driven proposals use information in the observed data set to +# choose the proposal distibution for latent variables in a generative model. +# Data-driven proposals can have trainable parameters that are trained data +# simulated from a generative model to improve their efficacy. This training +# process is sometimes called 'amortized inference' or 'inference compilation'. +# +# We focus on using data-driven proposals with importance sampling, which is +# one of the simpler classes of Monte Carlo inference algorithms. Data-driven +# proposals can also be used with Markov Chain Monte Carlo (MCMC) and +# sequential Monte Carlo (SMC), but these are not covered in this tutorial. +# +# This +# tutorial +# builds on a probabilistic model for the motion of an autonomous agent that +# was introduced in +# the [introduction to modeling +# tutorial](https://github.com/probcomp/gen-examples/tree/master/tutorial-modeling-intro). +# We show that we can improve the efficiency of inference in this model using +# two types of custom proposals for the destination of the agent: First, we +# write a hand-coded data-driven proposal with a single parameter that we tune +# using amortized inference. Second, we write a data-driven proposal based on a +# deep neural network, which we also train using amortized inference. We show +# an implementation of the neural-network based proposal using the built-in +# modeling language, and then an implementation using the TensorFlow modeling +# DSL. +# +# Data-driven proposals that are trained on simulated data can be seen from two +# perspectives: For practitioners of inference in generative models, they can +# be seen learning a proposal distribution that approximates the conditional +# distribution on latent variables given observations. Practitioners of +# supervised machine learning may view data-driven proposals as discriminative +# models trained on simulated data. 
By writing the data simulator in a +# probabilistic programming language, we can embed the probabilistic +# discriminative model in Monte Carlo algorithms, which can assist with +# robustness and accuracy. This is an active area of research. # # ## Outline # @@ -44,9 +74,12 @@ using JLD viz_server = VizServer(8092) sleep(1) +# ##### Warning: do not execute the above cell twice (restart the kernel if you do). + # ## 1: Recap on the generative model of an autonomous agent -# We begin by loading the source files for the generative model of an autonomous agent that was introduced in a previous tutorial: +# We begin by loading the source files for the generative model of an +# autonomous agent that was introduced in a previous tutorial: include("../inverse-planning/geometric_primitives.jl"); include("../inverse-planning/scene.jl"); @@ -93,7 +126,9 @@ include("../inverse-planning/planning.jl"); return (planning_failed, maybe_path) end; -# And we redefine a function that converts a trace of this model into a value that is easily serializable to JSON, for use with the GenViz visualization framework: +# And we redefine a function that converts a trace of this model into a value +# that is easily serializable to JSON, for use with the GenViz visualization +# framework: function trace_to_dict(trace) args = Gen.get_args(trace) @@ -143,7 +178,10 @@ add_obstacle!(scene, make_line(horizontal, Point(0.60 - 0.15, 0.80), 0.15 + wall add_obstacle!(scene, make_line(horizontal, Point(0.20, 0.80), 0.15, wall_thickness)) add_obstacle!(scene, make_line(vertical, Point(0.20, 0.40), 0.40, wall_thickness)); -# We will assume the agent starts in the lower left-hand corner. And we will assume particular parameters for the planning algorithm that the agent uses. We will also assume that there are 10 measurements, separated `0.1` time units. +# We will assume the agent starts in the lower left-hand corner. 
And we will +# assume particular parameters for the planning algorithm that the agent uses. +# We will also assume that there are 10 measurements, separated `0.1` time +# units. start = Point(0.1, 0.1) dt = 0.1 @@ -164,12 +202,13 @@ measurements = [ Point(0.124597, 0.462056), Point(0.126227, 0.498338)]; -# We redefine the generic importance sampling algorithm that we used in the previous notebook: +# We redefine the generic importance sampling algorithm that we used in the +# previous notebook: function do_inference(scene::Scene, dt::Float64, num_ticks::Int, planner_params::PlannerParams, start::Point, measurements::Vector{Point}, amount_of_computation::Int) - # Create a choice map that maps model addresses (:y, i) + # Create a choice map that maps model addresses (i, :y) # to observed values ys[i]. We leave :slope and :intercept # unconstrained, because we want them to be inferred. observations = Gen.choicemap() @@ -188,7 +227,10 @@ function do_inference(scene::Scene, dt::Float64, num_ticks::Int, planner_params: return trace end; -# Below, we run this algorithm 1000 times, to generate 1000 approximate samples from the posterior distribution on the destination. The inferred destinations should appear as red dots on the map. First, we abstract this into a function. +# Below, we run this algorithm 1000 times, to generate 1000 approximate samples +# from the posterior distribution on the destination. The inferred destinations +# should appear as red dots on the map. First, we abstract this into a +# function. function visualize_inference(measurements, scene, start; computation_amt=50, samples=1000) info = Dict("measurements" => measurements, "scene" => scene, "start" => start) @@ -206,9 +248,17 @@ visualize_inference(measurements, scene, start, computation_amt=50, samples=1000 # ## 2. 
Writing a data-driven proposal as a generative function -# The inference algorithm above used a variant of [`Gen.importance_resampling`](https://probcomp.github.io/Gen/dev/ref/inference/#Gen.importance_resampling) that does not take a custom proposal distribution. It uses the default proposal distribution associated with the generative model. For generative functions defined using the built-in modeling DSL, the default proposal distribution is based on *ancestral sampling*, which involves sampling unconstrained random choices from the distributions specified in the generative mode. +# The inference algorithm above used a variant of +# [`Gen.importance_resampling`](https://probcomp.github.io/Gen/dev/ref/inference/#Gen.importance_resampling) +# that does not take a custom proposal distribution. It uses the default +# proposal distribution associated with the generative model. For generative +# functions defined using the built-in modeling DSL, the default proposal +# distribution is based on *ancestral sampling*, which involves sampling +# unconstrained random choices from the distributions specified in the +# generative mode. -# We can sample from the default proposal distribution using `Gen.initialize`. The cell below shows samples of the destination from this distribution. +# We can sample from the default proposal distribution using `Gen.initialize`. +# The cell below shows samples of the destination from this distribution. info = Dict("measurements" => measurements, "scene" => scene, "start" => start) viz = Viz(viz_server, joinpath(@__DIR__, "../inverse-planning/overlay-viz/dist"), info) @@ -218,20 +268,43 @@ for i=1:1000 end displayInNotebook(viz) -# Intuitively, if we see the data set above (where blue is the starting location), we might guess that the agent is more likely to be heading into the upper part of the scene. This is because we don't expect the agent to unecessarily backtrack on its route to its destnation. 
A simple heuristic for biasing the proposal distribution of the destination using just the first measurement and the last measurement might be: +# Intuitively, if we see the data set above (where blue is the starting +# location), we might guess that the agent is more likely to be heading into +# the upper part of the scene. This is because we don't expect the agent to +# unnecessarily backtrack on its route to its destination. A simple heuristic for +# biasing the proposal distribution of the destination using just the first +# measurement and the last measurement might be: # -# - If the x-coordinate of the last measurement is greater than the x-coordinate of the first measurement, we think the agent is probably headed generally to the right. Make values `:dest_x` that are greater than the x-coordinate of the last measurement more probable. +# - If the x-coordinate of the last measurement is greater than the +# x-coordinate of the first measurement, we think the agent is probably +# headed generally to the right. Make values `:dest_x` that are greater than +# the x-coordinate of the last measurement more probable. # -# - If the x-coordinate of the last measurment is less than the x-coordinate of the first measurement, we think the agent is probably headed generally to the left. Make values `:dest_x` that are smaller than the x-coordinate of the last measurement more probable. +# - If the x-coordinate of the last measurement is less than the x-coordinate of +# the first measurement, we think the agent is probably headed generally to +# the left. Make values `:dest_x` that are smaller than the x-coordinate of +# the last measurement more probable. # # We can apply the same heuristic separately for the y-coordinate. 
-# To implement this idea, we discretize the x-axis and y-axis of the scene into bins: +# To implement this idea, we discretize the x-axis and y-axis of the scene into +# bins: num_x_bins = 5 num_y_bins = 5; -# We will propose the x-coordinate of the destination from a [piecewise_uniform](https://probcomp.github.io/Gen/dev/ref/distributions/#Gen.piecewise_uniform) distribution, where we set higher probability for certain bins based on the heuristic described above and use a uniform continuous distribution for the coordinate within a bin. The `compute_bin_probs` function below computes the probability for each bin. The bounds of the scene are given by `min` and `max`. The coordinates of the first and last measured points respectively are given by `first` and `last`. We compute the probability by assigning a "score" to each bin based on the heuristic above --- if the bin should receive lower probability, it gets a score of 1., and if it should receive higher probability, it gets a bin of `score_high`, where `score_high` is some value greater than 1. +# We will propose the x-coordinate of the destination from a +# [piecewise_uniform](https://probcomp.github.io/Gen/dev/ref/distributions/#Gen.piecewise_uniform) +# distribution, where we set higher probability for certain bins based on the +# heuristic described above and use a uniform continuous distribution for the +# coordinate within a bin. The `compute_bin_probs` function below computes the +# probability for each bin. The bounds of the scene are given by `min` and +# `max`. The coordinates of the first and last measured points respectively are +# given by `first` and `last`. We compute the probability by assigning a +# "score" to each bin based on the heuristic above --- if the bin should +# receive lower probability, it gets a score of 1., and if it should receive +# higher probability, it gets a bin of `score_high`, where `score_high` is some +# value greater than 1. 
# + function compute_bin_prob(first::Float64, last::Float64, bin::Int, last_bin::Int, score_high) @@ -249,11 +322,16 @@ function compute_bin_probs(num_bins::Int, min::Float64, max::Float64, first::Flo end; # - -# We will see how to automatically tune the value of `score_high` shortly. For now, we will use a value of 5. Below, we see that for the data set of measurements, shown above the probabilities of higher bins are indeed 5x larger than those of lower bins, becuase the agent seems to be headed up. +# We will see how to automatically tune the value of `score_high` shortly. For +# now, we will use a value of 5. Below, we see that for the data set of +# measurements, shown above the probabilities of higher bins are indeed 5x +# larger than those of lower bins, becuase the agent seems to be headed up. compute_bin_probs(num_y_bins, scene.ymin, scene.ymax, measurements[1].y, measurements[end].y, 5.) -# Next, we write a generative function that applies this heuristic for both the x-coordinate and y-coordinate, and samples the destination coordinates at addresses `:dest_x` and `:dest_y`. +# Next, we write a generative function that applies this heuristic for both the +# x-coordinate and y-coordinate, and samples the destination coordinates at +# addresses `:dest_x` and `:dest_y`. @gen function custom_dest_proposal(measurements::Vector{Point}, scene::Scene) @@ -277,12 +355,22 @@ compute_bin_probs(num_y_bins, scene.ymin, scene.ymax, measurements[1].y, measure return nothing end; -# We can propose values of random choices from the proposal function using [`Gen.propose`](https://probcomp.github.io/Gen/dev/ref/gfi/#Gen.propose). This method returns the choices, as well as some other information, which we won't need for our purposes. For now, you can think of `Gen.propose` as similar to `Gen.initialize` except that it does not produce a full execution trace (only the choices), and it does not accept constraints. 
We can see the random choices for one run of the proposal on our data set: +# We can propose values of random choices from the proposal function using +# [`Gen.propose`](https://probcomp.github.io/Gen/dev/ref/gfi/#Gen.propose). +# This method returns the choices, as well as some other information, which we +# won't need for our purposes. For now, you can think of `Gen.propose` as +# similar to `Gen.initialize` except that it does not produce a full execution +# trace (only the choices), and it does not accept constraints. We can see the +# random choices for one run of the proposal on our data set: (proposed_choices, _, _) = Gen.propose(custom_dest_proposal, (measurements, scene)) println(proposed_choices) -# The function below runs the proposal 1000 times. For each run, it generates a trace of the model where the `:dest_x` and `:dest_y` choices are constrained to the proposed values, and then visualizes the resulting traces. We make the proposal a parameter of the function because we will be visualizing the output distribution of various proposals later in the notebook. +# The function below runs the proposal 1000 times. For each run, it generates a +# trace of the model where the `:dest_x` and `:dest_y` choices are constrained +# to the proposed values, and then visualizes the resulting traces. We make the +# proposal a parameter of the function because we will be visualizing the +# output distribution of various proposals later in the notebook. 
function visualize_custom_destination_proposal(measurements, start, dest_proposal; num_samples=100) info = Dict("measurements" => measurements, "scene" => scene, "start" => start) @@ -295,15 +383,29 @@ function visualize_custom_destination_proposal(measurements, start, dest_proposa displayInNotebook(viz) end; -# Let's visualize the output distribution of `custom_dest_proposal` for our data set: +# Let's visualize the output distribution of `custom_dest_proposal` for our +# data set: visualize_custom_destination_proposal(measurements, start, custom_dest_proposal, num_samples=1000) -# We see that the proposal distribution indeed samples destinations in the top half of the scene with higher probability than destinations in the bottom half. +# We see that the proposal distribution indeed samples destinations in the top +# half of the scene with higher probability than destinations in the bottom +# half. # ## 3. Using a data-driven proposal within importance sampling -# We now use our data-driven proposal within an inference algorithm. There is a second variant of [Gen.importance_resampling](https://probcomp.github.io/Gen/dev/ref/inference/#Gen.importance_resampling) that accepts a generative function representing a custom proposal. This proposal generative function makes traced random choices at the addresses of a subset of the unobserved random choices made by the generative model. In our case, these addresses are `:dest_x` and `:dest_y`. Below, we write an inference program that uses this second variant of importance resampling. Because we will experiment with different data-driven proposals, we make the proposal into an agument of our inference program. We assume that the proposal accepts arguments `(measurements, scene)`. +# We now use our data-driven proposal within an inference algorithm. 
There is a +# second variant of +# [Gen.importance_resampling](https://probcomp.github.io/Gen/dev/ref/inference/#Gen.importance_resampling) +# that accepts a generative function representing a custom proposal. This +# proposal generative function makes traced random choices at the addresses of +# a subset of the unobserved random choices made by the generative model. In +# our case, these addresses are `:dest_x` and `:dest_y`. +# Below, we write an inference program that uses this second variant of +# importance resampling. +# Because we will experiment with different data-driven proposals, we make the +# proposal into an argument of our inference program. We assume that the +# proposal accepts arguments `(measurements, scene)`. function do_inference_data_driven(dest_proposal::GenerativeFunction, scene::Scene, dt::Float64, @@ -325,7 +427,8 @@ function do_inference_data_driven(dest_proposal::GenerativeFunction, return trace end; -# We also write a function below that runs this algorithm a number of times, and visualizes the result: +# We also write a function below that runs this algorithm a number of times, +# and visualizes the result: function visualize_data_driven_inference(measurements, scene, start, proposal; amt_computation=50, samples=1000) info = Dict("measurements" => measurements, "scene" => scene, "start" => start) @@ -344,15 +447,27 @@ end; visualize_data_driven_inference(measurements, scene, start, custom_dest_proposal, amt_computation=5, samples=1000) -# We compare this to the original algorithm that used the default proposal, for the same "amount of computation" of 5. +# We compare this to the original algorithm that used the default proposal, for +# the same "amount of computation" of 5. visualize_inference(measurements, scene, start, computation_amt=5, samples=1000) -# We see that the results are somewhat more accurate using the data-driven proposal. 
In particular, there is less probability mass in the lower left corner when using the data-driven proposal. +# We should see that the results are somewhat more accurate using the +# data-driven proposal. In particular, there is less probability mass in the +# lower left corner when using the data-driven proposal. # ## 4. Training the parameters of a data-driven proposal -# Our choice of the `score_high` value of 5. was somewhat arbitrary. To use more informed value, we can make `score_high` into a [*trainable parameter*](https://probcomp.github.io/Gen/dev/ref/modeling/#Trainable-parameters-1) of the generative function. Below, we write a new version of the proposal function that makes `score_high` trainable. However, the optimization algorithms we will use for training work best with *unconstrained* parameters (parameters that can take any value on the real line), but `score_high` must be positive. Therefore, we introduce an unconstrained trainable parameter mamed `log_score_high`, and use `exp()` to ensure that `score_high` is positive: +# Our choice of the `score_high` value of 5. was somewhat arbitrary. To use +# more informed value, we can make `score_high` into a [*trainable +# parameter*](https://probcomp.github.io/Gen/dev/ref/modeling/#Trainable-parameters-1) +# of the generative function. Below, we write a new version of the proposal +# function that makes `score_high` trainable. However, the optimization +# algorithms we will use for training work best with *unconstrained* parameters +# (parameters that can take any value on the real line), but `score_high` must +# be positive. 
Therefore, we introduce an unconstrained trainable parameter +# named `log_score_high`, and use `exp()` to ensure that `score_high` is +# positive: @gen function custom_dest_proposal_trainable(measurements::Vector{Point}, scene::Scene) @@ -376,15 +491,32 @@ visualize_inference(measurements, scene, start, computation_amt=5, samples=1000) return nothing end; -# We initialize the value of `score_high` to 1. For this value, our custom proposal gives a uniform distribution, and is the same as the default proposal. +# We initialize the value of `score_high` to 1. For this value, our custom +# proposal gives a uniform distribution, and is the same as the default +# proposal. Gen.init_param!(custom_dest_proposal_trainable, :log_score_high, 0.); -# Let's visualize the proposed distribution prior to training to confirm that it is a uniform distribution. +# Let's visualize the proposed distribution prior to training to confirm that +# it is a uniform distribution. visualize_custom_destination_proposal(measurements, start, custom_dest_proposal_trainable, num_samples=1000) -# Now, we train the generative function. First, we will require a data-generator that generates the training data. The data-generator is a function of no arguments that returns a tuple of the form `(inputs, constraints)`. The `inputs` are the arguments to the generative function being trained, and the `constraints` contains the desired values of random choices made by the function for those arguments. For the training distribution, we will use the distribution induced by the generative model (`agent_model`), restricted to cases where planning actually succeeded. When planning failed, the agent just stays at the same location for all time, and we won't worry about tuning our proposal for that case. The training procedure will attempt to maximize the expected conditional log probablity (density) that the proposal function generates the constrained values, when run on the arguments. 
Note that this is an *average case* objective function --- the resulting proposal distribution may perform better on some data sets than others. +# Now, we train the generative function. First, we will require a +# data-generator that generates the training data. The data-generator is a +# function of no arguments that returns a tuple of the form `(inputs, +# constraints)`. The `inputs` are the arguments to the generative function +# being trained, and the `constraints` contains the desired values of random +# choices made by the function for those arguments. For the training +# distribution, we will use the distribution induced by the generative model +# (`agent_model`), restricted to cases where planning actually succeeded. When +# planning failed, the agent just stays at the same location for all time, and +# we won't worry about tuning our proposal for that case. The training +# procedure will attempt to maximize the expected conditional log probablity +# (density) that the proposal function generates the constrained values, when +# run on the arguments. Note that this is an *average case* objective function +# --- the resulting proposal distribution may perform better on some data sets +# than others. function data_generator() @@ -413,15 +545,29 @@ function data_generator() return (inputs, constraints) end; -# Next, we choose type of optimization algorithm we will use for training. Gen supports a set of gradient-based optimization algorithms (see [Optimizing Trainable Parameters](https://probcomp.github.io/Gen/dev/ref/parameter_optimization/#Optimizing-Trainable-Parameters-1)). Here we will use gradient descent with a fixed step size of 0.001. +# Next, we choose type of optimization algorithm we will use for training. Gen +# supports a set of gradient-based optimization algorithms (see [Optimizing +# Trainable +# Parameters](https://probcomp.github.io/Gen/dev/ref/parameter_optimization/#Optimizing-Trainable-Parameters-1)). 
+# Here we will use gradient descent with a fixed step size of 0.001. update = Gen.ParamUpdate(Gen.FixedStepGradientDescent(0.001), custom_dest_proposal_trainable); -# Finally, we use the [`Gen.train!`](https://probcomp.github.io/Gen/dev/ref/inference/#Gen.train!) method to actually do the training. +# Finally, we use the +# [`Gen.train!`](https://probcomp.github.io/Gen/dev/ref/inference/#Gen.train!) +# method to actually do the training. # -# For each epoch, `Gen.train!` makes `epoch_size` calls to the data-generator to construct a batch of training data for that epoch. Then, it iteratively selects `num_minibatch` subsets of the epoch training data, each of size `100`, and applies the update once per minibatch. At the end of the epoch, it generates another batch of evaluation data (of size `evaluation_size`) which it uses to estimate the objective function (the expected conditional log likelihood under the data-generating distribution). +# For each epoch, `Gen.train!` makes `epoch_size` calls to the data-generator +# to construct a batch of training data for that epoch. Then, it iteratively +# selects `num_minibatch` subsets of the epoch training data, each of size +# `100`, and applies the update once per minibatch. At the end of the epoch, it +# generates another batch of evaluation data (of size `evaluation_size`) which +# it uses to estimate the objective function (the expected conditional log +# likelihood under the data-generating distribution). # -# Here, we are running 200 gradient-descent updates, where each update is using a gradient estimate obtained from 100 training examples. The method prints the estimate of the objective function after each epoch. +# Here, we are running 200 gradient-descent updates, where each update is using +# a gradient estimate obtained from 100 training examples. The method prints +# the estimate of the objective function after each epoch. 
@time scores = Gen.train!(custom_dest_proposal_trainable, data_generator, update, num_epoch=200, epoch_size=100, num_minibatch=1, minibatch_size=100, evaluation_size=100, verbose=true); @@ -434,7 +580,9 @@ ylabel("Estimate of expected conditional log probability density"); println(exp(Gen.get_param(custom_dest_proposal_trainable, :log_score_high))) -# We see that the optimal value of the parameter is indeed larger than our initial guess. This validates that the heuristic is indeed a useful one. We visualize the proposal distribution below: +# We see that the optimal value of the parameter is indeed larger than our +# initial guess. This validates that the heuristic is indeed a useful one. We +# visualize the proposal distribution below: visualize_custom_destination_proposal(measurements, start, custom_dest_proposal_trainable, num_samples=1000) @@ -446,7 +594,9 @@ visualize_data_driven_inference(measurements, scene, start, custom_dest_proposal # ------------ # ### Exercise # -# Can you devise a data-driven proposal for the speed of the agent? Do you expect it to work equally well on all data sets? You do not need to implement it. +# Can you devise a data-driven proposal for the speed of the agent? Do you +# expect it to work equally well on all data sets? You do not need to implement +# it. # ### Solution @@ -454,13 +604,21 @@ visualize_data_driven_inference(measurements, scene, start, custom_dest_proposal # ## 5. Writing and training a deep learning based data-driven proposal -# The heuristic data-driven proposal above gave some improvement in efficiency, but it was very simple. One way of constructing complex data-driven proposals is to parametrize the proposal with a deep neural network or use another class of high-capacity machine learning model (e.g. random forest). Here, we will will write a data-driven proposal for the destination of the agent that uses deep neural networks. 
+# The heuristic data-driven proposal above gave some improvement in efficiency, +# but it was very simple. One way of constructing complex data-driven proposals +# is to parametrize the proposal with a deep neural network or use another +# class of high-capacity machine learning model (e.g. random forest). Here, we +# will write a data-driven proposal for the destination of the agent that +# uses deep neural networks. # First, we define a sigmoid function for the nonlinearity in our networks. nonlinearity(x) = 1.7159 * tanh.(x * 0.66666); -# We will use a deep neural network with two hidden layers that takes as input x- and y- coordinates of the first and last measurement (4 values) and produces as output a vector of un-normalized probabilities, one for each bin of the x-dimension. We will later sample `:dest_x` from this distribution. +# We will use a deep neural network with two hidden layers that takes as input +# x- and y- coordinates of the first and last measurement (4 values) and +# produces as output a vector of un-normalized probabilities, one for each bin +# of the x-dimension. We will later sample `:dest_x` from this distribution. function dest_x_neural_net(nn_params, x_first::Real, y_first::Real, x_last::Real, y_last::Real) (W1, b1, W2, b2, W3, b3) = nn_params @@ -471,7 +629,11 @@ function dest_x_neural_net(nn_params, x_first::Real, y_first::Real, x_last::Real return output_layer end; -# After sampling the value of `:dest_x`, we will use a second deep neural network that takes as input the same four values extracted from the measurements, as well as the sampled value of `:dest_x` (a total of 5 inputs), and produces a vector of un-normalized probabilities, one for each bin of the y-dimension. We will sample `:dest_y` from this distribution. 
+# After sampling the value of `:dest_x`, we will use a second deep neural +# network that takes as input the same four values extracted from the +# measurements, as well as the sampled value of `:dest_x` (a total of 5 +# inputs), and produces a vector of un-normalized probabilities, one for each +# bin of the y-dimension. We will sample `:dest_y` from this distribution. function dest_y_neural_net(nn_params, x_first::Real, y_first::Real, x_last::Real, y_last::Real)#, dest_x::Real) (W1, b1, W2, b2, W3, b3) = nn_params @@ -482,7 +644,8 @@ function dest_y_neural_net(nn_params, x_first::Real, y_first::Real, x_last::Real return output_layer end; -# Now that we have defined our neural networks, we define our new proposal. This generative function has a number of parameters. +# Now that we have defined our neural networks, we define our new proposal. +# This generative function has a number of parameters. # + scale_coord(coord, min, max) = (coord / (max - min)) - 0.5 @@ -568,13 +731,20 @@ init_param!(custom_dest_proposal_neural, :y_b3, zeros(num_y_bins)); visualize_custom_destination_proposal(measurements, start, custom_dest_proposal_neural, num_samples=1000) -# It looks like the initial distribution is roughly uniform, like the default proposal. +# It looks like the initial distribution is roughly uniform, like the default +# proposal. -# Now we train the network stochastic gradient descent with a fixed step size of 0.001 that is shared among all of the trainable parameters. +# Now we train the network stochastic gradient descent with a fixed step size +# of 0.001 that is shared among all of the trainable parameters. update = Gen.ParamUpdate(Gen.FixedStepGradientDescent(0.001), custom_dest_proposal_neural); -# We use 100 epochs of training. In each epoch, we generate 1000 training examples, and we apply 100 gradient updates, where each update is based on the gradient estimate obtained from a random set of 100 of the trainable examples. 
At the end of each epoch, we estimate the objective function value using 10000 freshly sampled examples. This process takes about 10 minutes to run, so we have precomputed the results for you +# We use 100 epochs of training. In each epoch, we generate 1000 training +# examples, and we apply 100 gradient updates, where each update is based on +# the gradient estimate obtained from a random set of 100 of the training +# examples. At the end of each epoch, we estimate the objective function value +# using 10000 freshly sampled examples. This process takes about 10 minutes to +# run, so we have precomputed the results for you. # # ```julia # @time scores = Gen.train!(custom_dest_proposal_neural, data_generator, update, @@ -614,18 +784,23 @@ visualize_custom_destination_proposal(measurements, start, custom_dest_proposal_ visualize_data_driven_inference(measurements, scene, start, custom_dest_proposal_neural, amt_computation=5, samples=1000) -# As we increase the amount of computation, the effect of the proposal's bias is reduced: +# As we increase the amount of computation, the effect of the proposal's bias +# is reduced: visualize_data_driven_inference(measurements, scene, start, custom_dest_proposal_neural, amt_computation=50, samples=1000) -# In the limit of infinite computation, the distribution converges to the posterior: +# In the limit of infinite computation, the distribution converges to the +# posterior: # ------ # # ### Exercise # -# Visualize the trained proposal distribution `custom_dest_proposal_neural` on some other data sets, either generated by simulating from the model, or constructed manually. Note that the proposal only reads from the first and last of the measurements. Discuss the results. +# Visualize the trained proposal distribution `custom_dest_proposal_neural` on +# some other data sets, either generated by simulating from the model, or +# constructed manually. Note that the proposal only reads from the first and +# last of the measurements. 
Discuss the results. # ### Solution @@ -634,7 +809,10 @@ visualize_data_driven_inference(measurements, scene, start, custom_dest_proposal # --------- # ### Exercise # -# What is the objective value for the default proposal, which is the uniform distribution on the entire scene area? Recall that the objective function is the expected conditional log probability density of the proposal evaluated on destination points drawn by simulating from the model. +# What is the objective value for the default proposal, which is the uniform +# distribution on the entire scene area? Recall that the objective function is +# the expected conditional log probability density of the proposal evaluated on +# destination points drawn by simulating from the model. # # Hint: The log probability density of the uniform distribution is a constant. @@ -644,7 +822,12 @@ visualize_data_driven_inference(measurements, scene, start, custom_dest_proposal # ## 6. Writing a data-driven proposal that uses TensorFlow # -# The data-driven neural proposal above used Julia code to implement the neural network. It is also possible to implement the neural networks in TensorFlow using the [GenTF](https://github.com/probcomp/GenTF) package that was introduced in a previous tutorial. We load GenTF, [PyCall](https://github.com/JuliaPy/PyCall.jl), and we import the TensorFlow Python module: +# The data-driven neural proposal above used Julia code to implement the neural +# network. It is also possible to implement the neural networks in TensorFlow +# using the [GenTF](https://github.com/probcomp/GenTF) package that was +# introduced in a previous tutorial. 
We load GenTF, +# [PyCall](https://github.com/JuliaPy/PyCall.jl), and we import the TensorFlow +# Python module: using GenTF using PyCall @@ -755,19 +938,23 @@ saver.restore(GenTF.get_session(y_nn), "params/y_nn.ckpt") visualize_custom_destination_proposal(measurements, start, custom_dest_proposal_tf, num_samples=1000) # -------- -# # ### Exercise # # The training procedure for the neural networks above was not vectorized across training examples. For fast training on a GPU it is important to vectorize the evaluation of gradients across multiple training examples. Write a vectorized version of the `custom_dest_proposal_tf` that takes a scene and a vector of data sets (measurement vectors), and samples a destination point for each of the input data sets. Train it and visualize the proposal distribution, and the results of importance resampling inference using the proposal for some amount of computation, for our example data set. # # Hint: # -# - Construct a vectorized version of each of the neural networks that operate on an extra 'training example' dimension. +# - Construct a vectorized version of each of the neural networks that operate +# on an extra 'training example' dimension. # -# - Construct a vectorized version of the proposal. It should accept a vector of measurement vectors as one of its arguments. This vectorized propsal should make 2N random choices where N is the batch size. +# - Construct a vectorized version of the proposal. It should accept a vector +# of measurement vectors as one of its arguments. This vectorized proposal +# should make 2N random choices where N is the batch size. # -# - Construct a vectorized version of the data generator. It should generate constraints for all random choices of the vectorized proposal. +# - Construct a vectorized version of the data generator. It should generate +# constraints for all random choices of the vectorized proposal. 
# -# - Construct a non-vectorized version of the proposal that invokes the vectorized neural networks on a single data set. +# - Construct a non-vectorized version of the proposal that invokes the +# vectorized neural networks on a single data set. # ### Solution diff --git a/tutorials/Introduction to Modeling in Gen.jl b/tutorials/Introduction to Modeling in Gen.jl index 0cb1292..03d7af1 100644 --- a/tutorials/Introduction to Modeling in Gen.jl +++ b/tutorials/Introduction to Modeling in Gen.jl @@ -14,23 +14,52 @@ # # Tutorial: Introduction to Modeling in Gen -# Gen is a multi-paradigm platform for probabilistic modeling and inference. Gen supports multiple modeling and inference workflows, including: +# Gen is a multi-paradigm platform for probabilistic modeling and inference. +# Gen supports multiple modeling and inference workflows, including: # -# - Unsupervised learning and posterior inference in generative models using Monte Carlo, variational, EM, and stochastic gradient techniques. +# - Unsupervised learning and posterior inference in generative models using +# Monte Carlo, variational, EM, and stochastic gradient techniques. # -# - Supervised learning of conditional inference models (e.g. supervised classification and regression). +# - Supervised learning of conditional inference models (e.g. supervised +# classification and regression). # -# - Hybrid approaches including amortized inference / inference compilation, variational autoencoders, and semi-supervised learning. +# - Hybrid approaches including amortized inference / inference compilation, +# variational autoencoders, and semi-supervised learning. # -# In Gen, probabilistic models (both generative models and conditional inference models) are represented as _generative functions_. Gen provides a built-in modeling language for defining generative functions (Gen can also be extended to support other modeling languages, but this is not covered in this tutorial). 
This tutorial introduces the basics of Gen's built-in modeling language, and illustrates a few types of modeling flexibility afforded by the language, including: +# In Gen, probabilistic models (both generative models and conditional +# inference models) are represented as _generative functions_. Gen provides a +# built-in modeling language for defining generative functions (Gen can also be +# extended to support other modeling languages, but this is not covered in this +# tutorial +# ). This +# tutorial +# introduces the basics of Gen's built-in modeling language, and illustrates a +# few types of modeling flexibility afforded by the language, including: # -# - Using a stochastic branching and function abstraction to express uncertainty about which of multiple models is appropriate. +# - Using a stochastic branching and function abstraction to express +# uncertainty about which of multiple models is appropriate. # -# - Representing models with an unbounded number of parameters (a 'Bayesian non-parametric' model). +# - Representing models with an unbounded number of parameters (a 'Bayesian +# non-parametric' model). # -# This notebook uses a simple generic inference algorithm for posterior inference, and shows some examples of inference being applied to simple models. The notebook also introduces a technique for validating a model and inference algorithm by predicting new data from inferred parameters, and comparing this data to the observed data set. +# This notebook uses a simple generic inference algorithm for posterior +# inference, and shows some examples of inference being applied to simple +# models. The notebook also introduces a technique for validating a model and +# inference algorithm by predicting new data from inferred parameters, and +# comparing this data to the observed data set. 
# -# This tutorial does not cover *custom inference programming*, which is a key capability of Gen in which users implement inference algorithms that are specialized to their probabilistic model. Inference programming is important for getting accurate posterior inferences efficiently, and will be covered in later tutorials. Also, this tutorial does not exhaustively cover all features of the modeling language -- there are also features and extensions that provide improved performance that are not covered here. +# This +# tutorial +# does not cover *custom inference programming*, which is a key capability of +# Gen in which users implement inference algorithms that are specialized to +# their probabilistic model. Inference programming is important for getting +# accurate posterior inferences efficiently, and will be covered in later +# tutorials. +# Also, this +# tutorial +# does not exhaustively cover all features of the modeling language -- there +# are also features and extensions that provide improved performance that are +# not covered here. # # ## Outline # @@ -52,19 +81,36 @@ using Gen -# Gen programs typically consist of a combination of (i) probabilistic models written in modeling languages and (ii) inference programs written in regular Julia code. Gen provides a built-in modeling language that is itself based on Julia. +# This cell will take a few seconds to run. Note that in Jupyter, a cell that +# is running is marked with **`In [*]`** on the left of the cell. When the cell +# above ran, it should read **`In [1]`** (assuming it was the first cell that +# was executed). +# +# Gen programs typically consist of a combination of (i) probabilistic models +# written in modeling languages and (ii) inference programs written in regular +# Julia code. Gen provides a built-in modeling language that is itself based on +# Julia. -# This tutorial uses a Jupyter notebook. All cells in the notebook are regular Julia cells. 
In Julia, semicolons are optional at the end of statements; we will use them at the end of some cells so that the value of the cell is not printed. +# This +# tutorial +# uses a Jupyter notebook. All cells in the notebook are regular Julia cells. +# In Julia, semicolons are optional at the end of statements; we will use them +# at the end of some cells so that the value of the cell is not printed. a = 1 + 1 +# Output will not be printed: + a = 1 + 1; -# This notebook uses the [PyPlot](https://github.com/JuliaPy/PyPlot.jl) Julia package for plotting. PyPlot wraps the matplotlib Python package. +# This notebook uses the [PyPlot](https://github.com/JuliaPy/PyPlot.jl) Julia +# package for plotting. PyPlot wraps the matplotlib Python package. This cell +# might take a few seconds to run. using PyPlot -# This notebook will make use of Julia symbols. Note that a Julia symbol is different from a Julia string: +# This notebook will make use of Julia symbols. Note that a Julia symbol is +# different from a Julia string: typeof(:foo) @@ -73,12 +119,60 @@ typeof("foo") # ## 2. Writing a probabilistic model as a generative function # Probabilistic models are represented in Gen as *generative functions*. -# The simplest way to construct a generative function is by using the [built-in modeling DSL](https://probcomp.github.io/Gen/dev/ref/modeling/). Generative functions written in the built-in modeling DSL are based on Julia function definition syntax, but are prefixed with the `@gen` keyword. The function represents the data-generating process we are modeling: each random choice it makes can be thought of as a random variable in the model. -# The generative function below represents a probabilistic model of a linear relationship in the x-y plane. Given a set of $x$ coordinates, it randomly chooses a line in the plane and generates corresponding $y$ coordinates so that each $(x, y)$ is near the line. 
We might think of this function as modeling house prices as a function of square footage, or the measured volume of a gas as a function of its measured temperature. +# Generative functions are used to represent a variety of different types of +# probabilistic computations including generative models, inference models, +# custom proposal distributions, and variational approximations (see the [Gen +# documentation](https://probcomp.github.io/Gen/dev/ref/gfi/) and see the [Tech +# report on +# Gen](https://dspace.mit.edu/bitstream/handle/1721.1/119255/MIT-CSAIL-TR-2018-020.pdf?sequence=3)). +# +# +# +# The simplest way to construct a generative function is by using the [built-in +# modeling DSL](https://probcomp.github.io/Gen/dev/ref/modeling/). Generative +# functions written in the built-in modeling DSL are based on Julia function +# definition syntax, but are prefixed with the `@gen` keyword. The function +# represents the data-generating process we are modeling: each random choice it +# makes can be thought of as a random variable in the model. + +# In probabilistic programming we treat probabilistic models as code and record random +# choices. In Gen, we decorate random choices with the trace keyword. There are +# two ways to do this: +# +# - using the `@trace` Gen keyword without an address: `@trace()` +# +# - using the `@trace` Gen keyword with an address: `@trace(, )` +# +# In the above, `` refers to an invocation of a generative function. A +# simple example of such an invocation is a normal distribution parametrized +# with mean 0 and standard deviation 1: +# ```julia +# my_variable = @trace(normal(0, 1)) +# ``` +# You can also supply a label (`` in the above) to the random choice (see +# section 5 below for details and use cases for this capability): +# ```julia +# my_variable = @trace(normal(0, 1) :my_variable_name) +# ``` +# `:my_variable_name` is a symbol in the Julia language. 
+# +# The following will _not_ work because the code is trying to trace the +# expression `sin(x)` which is an invocation of an ordinary Julia function, not +# a generative function. (It is possible to convert `sin` to a generative +# function, but so far we have not found any need to do so.) +# ```Julia +# my_variable = @trace(sin(1)) +# ``` +# + +# The generative function below represents a probabilistic model of a linear +# relationship in the x-y plane. Given a set of $x$ coordinates, it randomly +# chooses a line in the plane and generates corresponding $y$ coordinates so +# that each $(x, y)$ is near the line. We might think of this function as +# modeling house prices as a function of square footage, or the measured volume +# of a gas as a function of its measured temperature. @gen function line_model(xs::Vector{Float64}) - n = length(xs) - # We begin by sampling a slope and intercept for the line. # Before we have seen the data, we don't know the values of # these parameters, so we treat them as random choices. The @@ -87,96 +181,132 @@ typeof("foo") # intercept will be more than a couple points away from 0. slope = @trace(normal(0, 1), :slope) intercept = @trace(normal(0, 2), :intercept) - + # Given the slope and intercept, we can sample y coordinates # for each of the x coordinates in our input vector. for (i, x) in enumerate(xs) @trace(normal(slope * x + intercept, 0.1), (:y, i)) end - - # The return value of the model is often not particularly important, - # Here, we simply return n, the number of points. - return n + + # The return value of this model is not particularly important. + return :dummy_return_value end; -# The generative function takes as an argument a vector of x-coordinates. We create one below: +# The generative function takes as an argument a vector of x-coordinates. 
We +# create one below: xs = [-5., -4., -3., -2., -1., 0., 1., 2., 3., 4., 5.]; -# Given this vector, the generative function samples a random choice representing the slope of a line from a normal distribution with mean 0 and standard deviation 1, and a random choice representing the intercept of a line from a normal distribution with mean 0 and standard deviation 2. In Bayesian statistics terms, these distributions are the *prior distributions* of the slope and intercept respectively. Then, the function samples values for the y-coordinates corresponding to each of the provided x-coordinates. - -# This generative function returns the number of data points. We can run the function like we run a regular Julia function: - -n = line_model(xs) -println(n) - -# More interesting than `n` are the values of the random choices that `line_model` makes. **Crucially, each random choice is annotated with a unique *address*.** A random choice is assigned an address using the `@trace` keyword. Addresses can be any Julia value. In this program, there are two types of addresses used -- Julia symbols and tuples of symbols and integers. Note that within the `for` loop, the same line of code is executed multiple times, but each time, the random choice it makes is given a distinct address. - -# Although the random choices are not included in the return value, they *are* included in the *execution trace* of the generative function. We can run the generative function and obtain its trace using the [` -# simulate`](https://probcomp.github.io/Gen/dev/ref/gfi/#Gen.simulate) method from the Gen API: +# Given this vector, the generative function samples a random choice +# representing the slope of a line from a normal distribution with mean 0 and +# standard deviation 1, and a random choice representing the intercept of a +# line from a normal distribution with mean 0 and standard deviation 2. 
In +# Bayesian statistics terms, these distributions are the *prior distributions* +# of the slope and intercept respectively. Then, the function samples values +# for the y-coordinates corresponding to each of the provided x-coordinates. + +# This generative function returns the number of data points. We can run the +# function like we run a regular Julia function: + +retval = line_model(xs) +println(retval) + +# More interesting than `retval` are the values of the random choices that +# `line_model` makes. **Crucially, each random choice is annotated with a +# unique *address*.** A random choice is assigned an address using the `@trace` +# keyword. Addresses can be any Julia value. In this program, there are two +# types of addresses used -- Julia symbols and tuples of symbols and integers. +# Note that within the `for` loop, the same line of code is executed multiple +# times, but each time, the random choice it makes is given a distinct address. + +# Although the random choices are not included in the return value, they *are* +# included in the *execution trace* of the generative function. We can run the +# generative function and obtain its trace using the [` +# simulate`](https://probcomp.github.io/Gen/dev/ref/gfi/#Gen.simulate) method +# from the Gen API: trace = Gen.simulate(line_model, (xs,)); -# This method takes the function to be executed, and a tuple of arguments to the function, and returns a trace and a second value that we will not be using in this tutorial. When we print the trace, we see that it is a complex data structure. +# This method takes the function to be executed, and a tuple of arguments to +# the function, and returns a trace. When we print the trace, we see that it is +# a complex data structure. println(trace) -# A trace of a generative function contains various information about an execution of the function. 
For example, it contains the arguments on which the function was run, which are available with the API method `get_args`: +# A trace of a generative function contains various information about an +# execution of the function. For example, it contains the arguments on which +# the function was run, which are available with the API method `get_args`: Gen.get_args(trace) -# The trace also contains the value of the random choices, stored in map from address to value called a *choice map*. This map is available through the API method [`get_choices`](): +# The trace also contains the value of the random choices, stored in map from +# address to value called a *choice map*. This map is available through the API +# method [`get_choices`](): println(Gen.get_choices(trace)) -# We can pull out individual values from this map using Julia's subscripting syntax `[...]`: +# We can pull out individual values from this map using Julia's subscripting +# syntax `[...]`: choices = Gen.get_choices(trace) println(choices[:slope]) -# We can also read the value of a random choice directly from the trace, without having to use `get_choices` first: +# We can also read the value of a random choice directly from the trace, +# without having to use `get_choices` first: println(trace[:slope]) -# The return value is also recorded in the trace, and is accessible with the `get_retval` API method: +# The return value is also recorded in the trace, and is accessible with the +# `get_retval` API method: println(Gen.get_retval(trace)); -# Or we can access the return value directly from the trace via the syntactic sugar `trace[]`: +# Or we can access the return value directly from the trace via the syntactic +# sugar `trace[]`: println(trace[]) -# In order to understand the probabilistic behavior of a generative function, it is helpful to be able to visualize its traces. Below, we define a function that uses PyPlot to render a trace of the generative function above. 
The rendering shows the x-y data points and the line that is represented by the slope and intercept choices. +# In order to understand the probabilistic behavior of a generative function, +# it is helpful to be able to visualize its traces. Below, we define a function +# that uses PyPlot to render a trace of the generative function above. The +# rendering shows the x-y data points and the line that is represented by the +# slope and intercept choices. + +function render_trace(trace; show_data=true, limit_y=true) -function render_trace(trace; show_data=true) - # Pull out xs from the trace xs = get_args(trace)[1] - + xmin = minimum(xs) xmax = maximum(xs) if show_data ys = [trace[(:y, i)] for i=1:length(xs)] - + # Plot the data set scatter(xs, ys, c="black") end - + # Pull out slope and intercept from the trace slope = trace[:slope] intercept = trace[:intercept] - + # Draw the line plot([xmin, xmax], slope * [xmin, xmax] .+ intercept, color="black", alpha=0.5) ax = gca() ax.set_xlim((xmin, xmax)) - ax.set_ylim((xmin, xmax)) + if limit_y + ax.set_ylim((xmin, xmax)) + end end; figure(figsize=(3,3)) render_trace(trace); +xlabel("X"); +ylabel("Y"); +title("Simulating a line and Y values\nfor a given vector (X)") -# Because a generative function is stochastic, we need to visualize many runs in order to understand its behavior. The cell below renders a grid of traces. +# Because a generative function is stochastic, we need to visualize many runs +# in order to understand its behavior. The cell below renders a grid of traces. function grid(renderer::Function, traces; ncols=6, nrows=3) figure(figsize=(16, 8)) @@ -194,72 +324,122 @@ grid(render_trace, traces) # ------------------------------------------------- # ### Exercise # -# Write a generative function that uses the same address twice. Run it to see what happens. +# Write a generative function that uses the same address twice. Run it to see +# what happens. 
# ### Solution -# ------------------------------- +# ------------------------------------------------- # ### Exercise # -# Write a model that generates a sine wave with random phase, period and amplitude, and then generates y-coordinates from a given vector of x-coordinates by adding noise to the value of the wave at each x-coordinate. -# Use a `gamma(5, 1)` prior distribution for the period, and a `gamma(1, 1)` prior distribution on the amplitude (see [`Gen.gamma`](https://probcomp.github.io/Gen/dev/ref/distributions/#Gen.gamma)). Use a uniform distribution for the phase (see [`Gen.uniform`](https://probcomp.github.io/Gen/dev/ref/distributions/#Gen.uniform)). Write a function that renders the trace by showing the data set and the sine wave. Visualize a grid of traces and discuss the distribution. Try tweaking the parameters of each of the prior distributions and seeing how the behavior changes. +# Write a model that generates a sine wave with random phase, period and +# amplitude, and then generates y-coordinates from a given vector of +# x-coordinates by adding noise to the value of the wave at each x-coordinate. +# Use a `gamma(5, 1)` prior distribution for the period, and a `gamma(1, 1)` +# prior distribution on the amplitude (see +# [`Gen.gamma`](https://probcomp.github.io/Gen/dev/ref/distributions/#Gen.gamma)). +# Sampling from a Gamma distribution will ensure we get positive real values. +# Use a uniform distribution between 0 and $2\pi$ for the phase (see +# [`Gen.uniform`](https://probcomp.github.io/Gen/dev/ref/distributions/#Gen.uniform)). +# +# The sine wave should implement: +# +# $ y(x) = a \sin(2\pi \frac{1}{p} x + \varphi)$, +# +# where $a$ is the amplitude, $p$ is the period and $\varphi$ is the phase. In +# Julia the constant $\pi$ can be expressed as either `pi` or `π` (unicode). + +@assert π == pi; +π + +# Write a function that renders the trace by showing the data set and the sine +# wave. 
Visualize a grid of traces and discuss the distribution. Try tweaking +# the parameters of each of the prior distributions and seeing how the behavior +# changes. # We have provided you with some starter code for the sine wave model: # ### Solution @gen function sine_model(xs::Vector{Float64}) - n = length(xs) - # < your code here > - + for (i, x) in enumerate(xs) @trace(normal(0., 0.1), (:y, i)) # < edit this line > end - return n + return nothing # change this return value if you want end; -function render_sine_trace(trace; show_data=true) +function render_sine_trace(trace; show_data=true, limit_y=true) xs = get_args(trace)[1] - xmin = minimum(xs) - xmax = maximum(xs) if show_data ys = [trace[(:y, i)] for i=1:length(xs)] scatter(xs, ys, c="black") end - + + xmin = minimum(xs) + xmax = maximum(xs) + # < your code here > - + ax = gca() ax.set_xlim((xmin, xmax)) - ax.set_ylim((xmin, xmax)) + if limit_y + ax.set_ylim((xmin, xmax)) + end end; -traces = [Gen.simulate(sine_model, (xs,)) for _=1:12]; - -figure(figsize=(16, 8)) -for (i, trace) in enumerate(traces) - subplot(3, 6, i) - render_sine_trace(trace) -end +# You can invoke the plotting function by running (the following is **not** a +# code cell, it is a Markdown cell containing a code snippet; it won't run): +# ```julia +# traces = [Gen.simulate(sine_model, (xs,)) for _=1:12]; +# figure(figsize=(16, 8)) +# for (i, trace) in enumerate(traces) +# subplot(3, 6, i) +# render_sine_trace(trace) +# end +# ``` + +# ### Plot your results. +# +# Plotting samples from a generative model implemented as probabilistic +# programming is an important and helpful tool for debugging a model. Use the +# function `render_sine_trace` (below) to plot samples from your answer above +# to check whether you implemented the model correctly. # ## 3. Doing Posterior inference # -# We now will provide a data set of y-coordinates and try to draw inferences about the process that generated the data. 
We begin with the following data set: +# We now will provide a data set of y-coordinates and try to draw inferences +# about the process that generated the data. We begin with the following data +# set: ys = [6.75003, 6.1568, 4.26414, 1.84894, 3.09686, 1.94026, 1.36411, -0.83959, -0.976, -1.93363, -2.91303]; figure(figsize=(3,3)) scatter(xs, ys, color="black"); +xlabel("X"); +ylabel("Y"); +title("Oberved data (linear)"); -# We will assume that the line model was responsible for generating the data, and infer values of the slope and intercept that explain the data. +# We will assume that the line model was responsible for generating the data, +# and infer values of the slope and intercept that explain the data. # -# To do this, we write a simple *inference program* that takes the model we are assuming generated our data, the data set, and the amount of computation to perform, and returns a trace of the function that is approximately sampled from the _posterior distribution_ on traces of the function, given the observed data. That is, the inference program will try to find a trace that well explains the dataset we created above. We can inspect that trace to find estimates of the slope and intercept of a line that fits the data. +# To do this, we write a simple *inference program* that takes the model we are +# assuming generated our data, the data set, and the amount of computation to +# perform, and returns a trace of the function that is approximately sampled +# from the _posterior distribution_ on traces of the function, given the +# observed data. That is, the inference program will try to find a trace that +# well explains the dataset we created above. We can inspect that trace to find +# estimates of the slope and intercept of a line that fits the data. # -# Functions like `importance_resampling` expect us to provide a _model_ and also an _choice map_ representing our data set and relating it to the model. 
A choice map maps random choice addresses from the model to values from our data set. Here, we want to tie model addresses like `(:y, 4)` to data set values like `ys[4]`: +# Functions like `importance_resampling` expect us to provide a _model_ and +# also a _choice map_ representing our data set and relating it to the model. +# A choice map maps random choice addresses from the model to values from our +# data set. Here, we want to tie model addresses like `(:y, 4)` to data set +# values like `ys[4]`: function do_inference(model, xs, ys, amount_of_computation) - + # Create a choice map that maps model addresses (:y, i) # to observed values ys[i]. We leave :slope and :intercept # unconstrained, because we want them to be inferred. @@ -267,42 +447,59 @@ function do_inference(model, xs, ys, amount_of_computation) for (i, y) in enumerate(ys) observations[(:y, i)] = y end - + # Call importance_resampling to obtain a likely trace consistent # with our observations. (trace, _) = Gen.importance_resampling(model, (xs,), observations, amount_of_computation); return trace end; -# We can run the inference program to obtain a trace, and then visualize the result: +# We can run the inference program to obtain a trace, and then visualize the +# result: trace = do_inference(line_model, xs, ys, 100) figure(figsize=(3,3)) render_trace(trace); +xlabel("X"); +ylabel("Y"); +title("Oberved data and inferred line\n(one sample from the posterior)"); -# We see that `importance_resampling` found a reasonable slope and intercept to explain the data. We can also visualize many samples in a grid: +# We see that `importance_resampling` found a reasonable slope and intercept to +# explain the data. We can also visualize many samples in a grid: traces = [do_inference(line_model, xs, ys, 100) for _=1:10]; grid(render_trace, traces) -# We can see here that there is some uncertainty: with our limited data, we can't be 100% sure exactly where the line is. 
We can get a better sense for the variability in the posterior distribution by visualizing all the traces in one plot, rather than in a grid. Each trace is going to have the same observed data points, so we only plot those once, based on the values in the first trace: +# We can see here that there is some uncertainty: with our limited data, we +# can't be 100% sure exactly where the line is. We can get a better sense for +# the variability in the posterior distribution by visualizing all the traces +# in one plot, rather than in a grid. Each trace is going to have the same +# observed data points, so we only plot those once, based on the values in the +# first trace: function overlay(renderer, traces; same_data=true, args...) - renderer(traces[1], show_data=true, args...) - for i=2:length(traces) - renderer(traces[i], show_data=!same_data, args...) + if !isempty(traces) + renderer(traces[1], show_data=true, args...) + for i=2:length(traces) + renderer(traces[i], show_data=!same_data, args...) + end end end; traces = [do_inference(line_model, xs, ys, 100) for _=1:10]; figure(figsize=(3,3)) overlay(render_trace, traces); +xlabel("X"); +ylabel("Y"); +title("Oberved data and posterior samples"); -# -------------- +# ------------------------------- # # ### Exercise # -# The results above were obtained for `amount_of_computation = 100`. Run the algorithm with this value set to `1`, `10`, and `1000`, etc. Which value seems like a good tradeoff between accuracy and running time? Discuss. +# The results above were obtained for `amount_of_computation = 100`. Run the +# algorithm with this value set to `1`, `10`, and `1000`, etc. Which value +# seems like a good tradeoff between accuracy and running time? Discuss. 
# ### Solution @@ -315,14 +512,32 @@ ys_sine = [2.89, 2.22, -0.612, -0.522, -2.65, -0.133, 2.70, 2.77, 0.425, -2.11, figure(figsize=(3, 3)); scatter(xs, ys_sine, color="black"); +xlabel("X"); +ylabel("Y"); +title("Oberved data (sinusoidal)"); -# Write an inference program that generates traces of `sine_model` that explain this data set. Visualize the resulting distribution of traces. Temporarily change the prior distribution on the period to be `gamma(1, 1)` (by changing and re-running the cell that defines `sine_model` from a previous exercise). Can you explain the difference in inference results when using `gamma(1, 1)` vs `gamma(5, 1)` prior on the period? How much computation did you need to get good results? +# Write an inference program that generates traces of `sine_model` that explain +# this data set. Visualize the resulting distribution of traces. +# +# Temporarily change the prior distribution on the period to be `gamma(1, 1)` +# (by changing and re-running the cell that defines `sine_model` from a +# previous exercise). +# Can you explain the difference in inference results when using `gamma(1, 1)` +# vs `gamma(5, 1)` prior on the period? How much computation did you need to +# get good results? # ### Solution # ## 4. Predicting new data # -# Using the API method [`generate`](https://probcomp.github.io/Gen/dev/ref/gfi/#Gen.generate), we can generate a trace of a generative function in which the values of certain random choices are constrained to given values. The constraints are a choice map that maps the addresses of the constrained random choices to their desired values. +# What if we'd want to predict `ys` given `xs`? +# +# Using the API method +# [`generate`](https://probcomp.github.io/Gen/dev/ref/gfi/#Gen.generate), we +# can generate a trace of a generative function in which the values of certain +# random choices are constrained to given values. 
The constraints are a choice +# map that maps the addresses of the constrained random choices to their +# desired values. # # For example: @@ -332,30 +547,50 @@ constraints[:intercept] = 0. (trace, _) = Gen.generate(line_model, (xs,), constraints) figure(figsize=(3,3)) render_trace(trace); - -# Note that the random choices corresponding to the y-coordinates are still made randomly. Run the cell above a few times to verify this. - -# We will use the ability to run constrained executions of a generative function to predict the value of the y-coordinates at new x-coordinates by running new executions of the model generative function in which the random choices corresponding to the parameters have been constrained to their inferred values. We have provided a function below (`predict_new_data`) that takes a trace, and a vector of new x-coordinates, and returns a vector of predicted y-coordinates corresponding to the x-coordinates in `new_xs`. We have designed this function to work with multiple models, so the set of parameter addresses is an argument (`param_addrs`): +xlabel("X"); +ylabel("Y"); +title("Predictions for the vector xs\ngiven slope = 0 and intercept = 0") + +# Note that the random choices corresponding to the y-coordinates are still +# made randomly. Run the cell above a few times to verify this. + +# We will use the ability to run constrained executions of a generative +# function to predict the value of the y-coordinates at new x-coordinates by +# running new executions of the model generative function in which the random +# choices corresponding to the parameters have been constrained to their +# inferred values. We have provided a function below (`predict_new_data`) that +# takes a trace, and a vector of new x-coordinates, and returns a vector of +# predicted y-coordinates corresponding to the x-coordinates in `new_xs`. 
We +# have designed this function to work with multiple models, so the set of +# parameter addresses is an argument (`param_addrs`): function predict_new_data(model, trace, new_xs::Vector{Float64}, param_addrs) - + # Copy parameter values from the inferred trace (`trace`) # into a fresh set of constraints. constraints = Gen.choicemap() for addr in param_addrs constraints[addr] = trace[addr] end - + # Run the model with new x coordinates, and with parameters # fixed to be the inferred values (new_trace, _) = Gen.generate(model, (new_xs,), constraints) - + # Pull out the y-values and return them ys = [new_trace[(:y, i)] for i=1:length(new_xs)] return ys end; -# The cell below defines a function that first performs inference on an observed data set `(xs, ys)`, and then runs `predict_new_data` to generate predicted y-coordinates. It repeats this process `num_traces` times, and returns a vector of the resulting y-coordinate vectors. +# To illustrate, we call the function above given the previous trace (which +# constrained slope and intercept to be zero). + +predict_new_data(line_model, trace, [1., 2., 3.], [:slope, :intercept]) + +# The cell below defines a function that first performs inference on an +# observed data set `(xs, ys)`, and then runs `predict_new_data` to generate +# predicted y-coordinates. It repeats this process `num_traces` times, and +# returns a vector of the resulting y-coordinate vectors. function infer_and_predict(model, xs, ys, new_xs, param_addrs, num_traces, amount_of_computation) pred_ys = [] @@ -366,7 +601,13 @@ function infer_and_predict(model, xs, ys, new_xs, param_addrs, num_traces, amoun pred_ys end; -# Finally, we define a cell that plots the observed data set `(xs, ys)` as red dots, and the predicted data as small black dots. +# To illustrate, we generate predictions at `[1., 2., 3.]` given one posterior +# trace. 
+ +pred_ys = infer_and_predict(line_model, xs, ys, [1., 2., 3.], [:slope, :intercept], 1, 1000) + +# Finally, we define a cell that plots the observed data set `(xs, ys)` as red +# dots, and the predicted data as small black dots. function plot_predictions(xs, ys, new_xs, pred_ys) scatter(xs, ys, color="red") @@ -375,12 +616,24 @@ function plot_predictions(xs, ys, new_xs, pred_ys) end end; -# Recall the original dataset for the line model. The x-coordinates span the interval -5 to 5. +# Recall the original dataset for the line model. The x-coordinates span the +# interval -5 to 5. figure(figsize=(3,3)) scatter(xs, ys, color="red"); - -# We will use the inferred values of the parameters to predict y-coordinates for x-coordinates in the interval 5 to 10 from which data was not observed. We will also predict new data within the interval -5 to 5, and we will compare this data to the original observed data. Predicting new data from inferred parameters, and comparing this new data to the observed data is the core idea behind *posterior predictive checking*. This tutorial does not intend to give a rigorous overview behind techniques for checking the quality of a model, but intends to give high-level intuition. +xlabel("X"); +ylabel("Y"); +title("Oberved data"); + +# We will use the inferred values of the parameters to predict y-coordinates +# for x-coordinates in the interval 5 to 10 from which data was not observed. +# We will also predict new data within the interval -5 to 5, and we will +# compare this data to the original observed data. Predicting new data from +# inferred parameters, and comparing this new data to the observed data is the +# core idea behind *posterior predictive checking*. This +# tutorial +# does not intend to give a rigorous overview behind techniques for checking +# the quality of a model, but intends to give high-level intuition. 
new_xs = collect(range(-5, stop=10, length=100)); @@ -389,23 +642,37 @@ new_xs = collect(range(-5, stop=10, length=100)); pred_ys = infer_and_predict(line_model, xs, ys, new_xs, [:slope, :intercept], 20, 1000) figure(figsize=(3,3)) plot_predictions(xs, ys, new_xs, pred_ys) +title("Oberved data (red)\nand predictions (black)"); +xlabel("X"); +ylabel("Y"); -# The results look reasonable, both within the interval of observed data and in the extrapolated predictions on the right. +# The results look reasonable, both within the interval of observed data and in +# the extrapolated predictions on the right. -# Now consider the same experiment run with following data set, which has significantly more noise. +# Now consider the same experiment run with following data set, which has +# significantly more noise. ys_noisy = [5.092, 4.781, 2.46815, 1.23047, 0.903318, 1.11819, 2.10808, 1.09198, 0.0203789, -2.05068, 2.66031]; pred_ys = infer_and_predict(line_model, xs, ys_noisy, new_xs, [:slope, :intercept], 20, 1000) figure(figsize=(3,3)) plot_predictions(xs, ys_noisy, new_xs, pred_ys) - -# It looks like the generated data is less noisy than the observed data in the regime where data was observed, and it looks like the forecasted data is too overconfident. This is a sign that our model is mis-specified. In our case, this is because we have assumed that the noise has value 0.1. However, the actual noise in the data appears to be much larger. We can correct this by making the noise a random choice as well and inferring its value along with the other parameters. - -# We first write a new version of the line model that samples a random choice for the noise from a `gamma(1, 1)` prior distribution. 
- -@gen function line_model_2(xs::Vector{Float64}) - n = length(xs) +title("Oberved data (red)\nand predictions (black)"); +xlabel("X"); +ylabel("Y"); + +# It looks like the generated data is less noisy than the observed data in the +# regime where data was observed, and it looks like the forecasted data is too +# overconfident. This is a sign that our model is mis-specified. In our case, +# this is because we have assumed that the noise has value 0.1. However, the +# actual noise in the data appears to be much larger. We can correct this by +# making the noise a random choice as well and inferring its value along with +# the other parameters. + +# We first write a new version of the line model that samples a random choice +# for the noise from a `gamma(1, 1)` prior distribution. + +@gen function line_model_fancy(xs::Vector{Float64}) slope = @trace(normal(0, 1), :slope) intercept = @trace(normal(0, 2), :intercept) noise = @trace(gamma(1, 1), :noise) @@ -415,7 +682,8 @@ plot_predictions(xs, ys_noisy, new_xs, pred_ys) return nothing end; -# Then, we compare the predictions using inference the unmodified and modified model on the `ys` data set: +# Then, we compare the predictions using inference the unmodified and modified +# model on the `ys` data set: # + figure(figsize=(6,3)) @@ -425,15 +693,17 @@ subplot(1, 2, 1) title("Fixed noise level") plot_predictions(xs, ys, new_xs, pred_ys) -pred_ys = infer_and_predict(line_model_2, xs, ys, new_xs, [:slope, :intercept, :noise], 20, 10000) +pred_ys = infer_and_predict(line_model_fancy, xs, ys, new_xs, [:slope, :intercept, :noise], 20, 10000) subplot(1, 2, 2) title("Inferred noise level") plot_predictions(xs, ys, new_xs, pred_ys) # - -# Notice that there is more uncertainty in the predictions made using the modified model. +# Notice that there is more uncertainty in the predictions made using the +# modified model. 
# -# We also compare the predictions using inference the unmodified and modified model on the `ys_noisy` data set: +# We also compare the predictions using inference in the unmodified and +# modified model on the `ys_noisy` data set: # + figure(figsize=(6,3)) @@ -443,7 +713,7 @@ subplot(1, 2, 1) title("Fixed noise level") plot_predictions(xs, ys_noisy, new_xs, pred_ys) -pred_ys = infer_and_predict(line_model_2, xs, ys_noisy, new_xs, [:slope, :intercept, :noise], 20, 10000) +pred_ys = infer_and_predict(line_model_fancy, xs, ys_noisy, new_xs, [:slope, :intercept, :noise], 20, 10000) subplot(1, 2, 2) title("Inferred noise level") plot_predictions(xs, ys_noisy, new_xs, pred_ys) @@ -454,21 +724,25 @@ plot_predictions(xs, ys_noisy, new_xs, pred_ys) # ------------------------- # ### Exercise # -# Write a modified version the sine model that makes noise into a random choice. Compare the predicted data with the observed data `infer_and_predict` and `plot_predictions` for the unmodified and modified model, and for the `ys_sine` and `ys_noisy` datasets. Discuss the results. Experiment with the amount of inference computation used. The amount of inference computation will need to be higher for the model with the noise random choice. +# Write a modified version of the sine model that makes noise into a random +# choice. Compare the predicted data with the observed data using `infer_and_predict` +# and `plot_predictions` for the unmodified and modified model, and for the +# `ys_sine` and `ys_noisy` datasets. Discuss the results. Experiment with the +# amount of inference computation used. The amount of inference computation +# will need to be higher for the model with the noise random choice. 
# # We have provided you with starter code: # ### Solution -@gen function sine_model_2(xs::Vector{Float64}) - n = length(xs) - +@gen function sine_model_fancy(xs::Vector{Float64}) + # < your code here > - + for (i, x) in enumerate(xs) @trace(normal(0., 0.1), (:y, i)) # < edit this line > end - return n + return nothing end; # + @@ -482,7 +756,7 @@ title("Fixed noise level") plot_predictions(xs, ys_sine, new_xs, pred_ys) # Modify the line below> -pred_ys = infer_and_predict(sine_model_2, xs, ys_sine, new_xs, [], 20, 1) +pred_ys = infer_and_predict(sine_model_fancy, xs, ys_sine, new_xs, [], 20, 1) subplot(1, 2, 2) title("Inferred noise level") @@ -499,7 +773,7 @@ title("Fixed noise level") plot_predictions(xs, ys_noisy, new_xs, pred_ys) # Modify the line below> -pred_ys = infer_and_predict(sine_model_2, xs, ys_noisy, new_xs, [], 20, 1) +pred_ys = infer_and_predict(sine_model_fancy, xs, ys_noisy, new_xs, [], 20, 1) subplot(1, 2, 2) title("Inferred noise level") @@ -508,7 +782,11 @@ plot_predictions(xs, ys_noisy, new_xs, pred_ys) # ## 5. Calling other generative functions # -# In addition to making random choices, generative functions can invoke other generative functions. To illustrate this, we will write a probabilistic model that combines the line model and the sine model. This model is able to explain data using either model, and which model is chosen will depend on the data. This is called *model selection*. +# In addition to making random choices, generative functions can invoke other +# generative functions. To illustrate this, we will write a probabilistic model +# that combines the line model and the sine model. This model is able to +# explain data using either model, and which model is chosen will depend on the +# data. This is called *model selection*. 
# A generative function can invoke another generative function in three ways: # @@ -518,7 +796,12 @@ plot_predictions(xs, ys_noisy, new_xs, pred_ys) # # - using the `@trace` Gen keyword without an address: `@trace()` # -# When invoking using regular function call syntax, the random choices made by the callee function are not traced. When invoking using `@trace` without an address, the random choices of the callee function are placed in the same address namespace as the caller's random choices. When using `@trace(, )`, the random choices of the callee are placed under the namespace ``. +# When invoking using regular function call syntax, the random choices made by +# the callee function are not traced. When invoking using `@trace` without an +# address, the random choices of the callee function are placed in the same +# address namespace as the caller's random choices. When using `@trace(, +# )`, the random choices of the callee are placed under the namespace +# ``. # + @gen function foo() @@ -546,17 +829,22 @@ println(Gen.get_choices(trace)) trace = Gen.simulate(bar_using_namespace, ()) println(Gen.get_choices(trace)) -# Using `@trace` with a namespace can help avoid address collisions for complex models. +# Using `@trace` with a namespace can help avoid address collisions for complex +# models. 
-# A hierarchical address is represented as a Julia `Pair`, where the first element of the pair is the first element of the address and the second element of the pair is the rest of the address: +# A hierarchical address is represented as a Julia `Pair`, where the first +# element of the pair is the first element of the address and the second +# element of the pair is the rest of the address: trace[Pair(:z, :y)] -# Julia uses the `=>` operator as a shorthand for the `Pair` constructor, so we can access choices at hierarchical addresses like: +# Julia uses the `=>` operator as a shorthand for the `Pair` constructor, so we +# can access choices at hierarchical addresses like: trace[:z => :y] -# If we have a hierarchical address with more than two elements, we can construct the address by chaining the `=>` operator: +# If we have a hierarchical address with more than two elements, we can +# construct the address by chaining the `=>` operator: # + @gen function baz() @@ -572,13 +860,15 @@ trace[:a => :z => :y] trace[Pair(:a, Pair(:z, :y))] -# Now, we write a generative function that combines the line and sine models. It makes a Bernoulli random choice (e.g. a coin flip that returns true or false) that determines which of the two models will generate the data. +# Now, we write a generative function that combines the line and sine models. +# It makes a Bernoulli random choice (e.g. a coin flip that returns true or +# false) that determines which of the two models will generate the data. @gen function combined_model(xs::Vector{Float64}) if @trace(bernoulli(0.5), :is_line) - @trace(line_model_2(xs)) + @trace(line_model_fancy(xs)) else - @trace(sine_model_2(xs)) + @trace(sine_model_fancy(xs)) end end; @@ -592,39 +882,61 @@ function render_combined(trace; show_data=true) end end; -# We visualize some traces, and see that sometimes it samples linear data and other times sinusoidal data. 
+# We visualize some traces, and see that sometimes it samples linear data and +# other times sinusoidal data. traces = [Gen.simulate(combined_model, (xs,)) for _=1:12]; grid(render_combined, traces) -# We run inference using this combined model on the `ys` data set and the `ys_sine` data set. +# We run inference using this combined model on the `ys` data set and the +# `ys_sine` data set. figure(figsize=(6,3)) subplot(1, 2, 1) traces = [do_inference(combined_model, xs, ys, 10000) for _=1:10]; overlay(render_combined, traces) +title("Posterior given\nlinear observations"); +xlabel("X"); +ylabel("Y"); subplot(1, 2, 2) traces = [do_inference(combined_model, xs, ys_sine, 10000) for _=1:10]; overlay(render_combined, traces) +title("Posterior given\nsinusoidal observations"); +xlabel("X"); +ylabel("Y"); -# The results should show that the line model was inferred for the `ys` data set, and the sine wave model was inferred for the `ys_sine` data set. +# The results should show that the line model was inferred for the `ys` data +# set, and the sine wave model was inferred for the `ys_sine` data set. # ------- # # ### Exercise # -# Construct a data set for which it is ambiguous whether the line or sine wave model is best. Visualize the inferred traces using `render_combined` to illustrate the ambiguity. Write a program that takes the data set and returns an estimate of the posterior probability that the data was generated by the sine wave model, and run it on your data set. +# Construct a data set for which it is ambiguous whether the line or sine wave +# model is best. Visualize the inferred traces using `render_combined` to +# illustrate the ambiguity. Write a program that takes the data set and returns +# an estimate of the posterior probability that the data was generated by the +# sine wave model, and run it on your data set. 
# -# Hint: To estimate the posterior probability that the data was generated by the sine wave model, run the inference program many times to compute a large number of traces, and then compute the fraction of those traces in which `:is_line` is false. +# Hint: To estimate the posterior probability that the data was generated by +# the sine wave model, run the inference program many times to compute a large +# number of traces, and then compute the fraction of those traces in which +# `:is_line` is false. # ### Solution # ------ # ### Exercise # -# There is code that is duplicated between `line_model_2` and `sine_model_2`. Refactor the model to reduce code duplication and improve the readability of the code. Re-run the experiment above and confirm that the results are qualitatively the same. You may need to write a new rendering function. Try to avoid introducing code duplication between the model and the rendering code. +# There is code that is duplicated between `line_model_fancy` and +# `sine_model_fancy`. Refactor the model to reduce code duplication and +# improve the readability of the code. Re-run the experiment above and confirm +# that the results are qualitatively the same. You may need to write a new +# rendering function. Try to avoid introducing code duplication between the +# model and the rendering code. # -# Hint: To avoid introducing code duplication between the model and the rendering code, use the return value of the generative function. +# Hint: To avoid introducing code duplication between the model and the +# rendering code, use the return value of the generative function. # ### Solution @@ -650,7 +962,7 @@ function render_combined_refactored(trace; show_data=true) end # < your code here > - + ax = gca() ax.set_xlim((xmin, xmax)) ax.set_ylim((xmin, xmax)) @@ -667,13 +979,18 @@ overlay(render_combined_refactored, traces) # ## 6. 
Modeling with an unbounded number of parameters -# Gen's built-in modeling language can be used to express models that use an unbounded number of parameters. This section walks you through development of a model of data that does not a-priori specify an upper bound on the complexity of the model, but instead infers the complexity of the model as well as the parameters. This is a simple example of a *Bayesian nonparametric* model. +# Gen's built-in modeling language can be used to express models that use an +# unbounded number of parameters. This section walks you through development of +# a model of data that does not a-priori specify an upper bound on the +# complexity of the model, but instead infers the complexity of the model as +# well as the parameters. This is a simple example of a *Bayesian +# nonparametric* model. # We will consider two data sets: xs_dense = collect(range(-5, stop=5, length=50)) ys_simple = fill(1., length(xs_dense)) .+ randn(length(xs_dense)) * 0.1 -ys_complex = [Int(floor(abs(x/3))) % 2 == 0 ? 2 : 0 for x in xs_dense] .+ randn(length(xs_dense)) * 0.1; +ys_complex = [Int(floor(abs(x/3))) % 2 == 0 ? 2 : 0 for x in xs_dense] .+ randn(length(xs_dense)) * 0.05; # + figure(figsize=(6,3)) @@ -689,7 +1006,13 @@ scatter(xs_dense, ys_complex, color="black", s=10) gca().set_ylim((-1, 3)) # - -# The data set on the left appears to be best explained as a contant function with some noise. The data set on the right appears to include two changepoints, with a constant function in between the changepoints. We want a model that does not a-priori choose the number of changepoints in the data. To do this, we will recursively partition the interval into regions. We define a Julia data structure that represents a binary tree of intervals; each leaf node represents a region in which the function is constant. +# The data set on the left appears to be best explained as a contant function +# with some noise. 
The data set on the right appears to include two +# changepoints, with a constant function in between the changepoints. We want a +# model that does not a-priori choose the number of changepoints in the data. +# To do this, we will recursively partition the interval into regions. We +# define a Julia data structure that represents a binary tree of intervals; +# each leaf node represents a region in which the function is constant. struct Interval l::Float64 @@ -698,7 +1021,7 @@ end # + abstract type Node end - + struct InternalNode <: Node left::Node right::Node @@ -711,7 +1034,11 @@ struct LeafNode <: Node end # - -# We now write a generative function that randomly creates such a tree. Note the use of recursion in this function to create arbitrarily large trees representing arbitrarily many changepoints. Also note that we assign the address namespaces `:left` and `:right` to the calls made for the two recursive calls to `generate_segments`. +# We now write a generative function that randomly creates such a tree. Note +# the use of recursion in this function to create arbitrarily large trees +# representing arbitrarily many changepoints. Also note that we assign the +# address namespaces `:left` and `:right` to the calls made for the two +# recursive calls to `generate_segments`. @gen function generate_segments(l::Float64, u::Float64) interval = Interval(l, u) @@ -727,7 +1054,8 @@ end end end; -# We also define some helper functions to visualize traces of the `generate_segments` function. +# We also define some helper functions to visualize traces of the +# `generate_segments` function. # + function render_node(node::LeafNode) @@ -748,14 +1076,23 @@ function render_segments_trace(trace) ax.set_ylim((-3, 3)) end; -# We generate 12 traces from this function and visualize them below. We plot the piecewise constant function that was sampled by each run of the generative function. Different constant segments are shown in different colors. 
Run the cell a few times to get a better sense of the distribution on functions that is represented by the generative function. +# We generate 12 traces from this function and visualize them below. We plot +# the piecewise constant function that was sampled by each run of the +# generative function. Different constant segments are shown in different +# colors. Run the cell a few times to get a better sense of the distribution on +# functions that is represented by the generative function. traces = [Gen.simulate(generate_segments, (0., 1.)) for i=1:12] grid(render_segments_trace, traces) +suptitle("Traces simulated from the prior"); -# Because we only sub-divide an interval with 30% probability, most of these sampled traces have only one segment. +# Because we only sub-divide an interval with 30% probability, most of these +# sampled traces have only one segment. -# Now that we have generative function that generates a random piecewise-constant function, we write a model that adds noise to the resulting constant functions to generate a data set of y-coordinates. The noise level will be a random choice. +# Now that we have generative function that generates a random +# piecewise-constant function, we write a model that adds noise to the +# resulting constant functions to generate a data set of y-coordinates. The +# noise level will be a random choice. 
# + # get_value_at searches a binary tree for @@ -777,7 +1114,7 @@ end # Out full model @gen function changepoint_model(xs::Vector{Float64}) node = @trace(generate_segments(minimum(xs), maximum(xs)), :tree) - noise = @trace(gamma(1, 1), :noise) + noise = @trace(gamma(0.5, 0.5), :noise) for (i, x) in enumerate(xs) @trace(normal(get_value_at(x, node), noise), (:y, i)) end @@ -800,21 +1137,36 @@ function render_changepoint_model_trace(trace; show_data=true) ax.set_ylim((-3, 3)) end; -# Finally, we generate some simulated data sets and visualize them on top of the underlying piecewise constant function from which they were generated: +# Finally, we generate some simulated data sets and visualize them on top of +# the underlying piecewise constant function from which they were generated: traces = [Gen.simulate(changepoint_model, (xs_dense,)) for i=1:12] grid(render_changepoint_model_trace, traces) -# Notice that the amount of variability around the piecewise constant mean function differs from trace to trace. +# Notice that the amount of variability around the piecewise constant mean +# function differs from trace to trace. # Now we perform inference for the simple data set: traces = [do_inference(changepoint_model, xs_dense, ys_simple, 10000) for _=1:12]; grid(render_changepoint_model_trace, traces) -# We see that we inferred that the mean function that explains the data is a constant with very high probability. - -# For inference about the complex data set, we use more computation. You can experiment with different amounts of computation to see how the quality of the inferences degrade with less computation. Note that we are using a very simple generic inference algorithm in this tutorial, which really isn't suited for this more complex task. In later tutorials, we will learn how to write more efficient algorithms, so that accurate results can be obtained with significantly less computation. 
We will also see ways of annotating the model for better performance, no matter the inference algorithm. +# We see that we inferred that the mean function that explains the data is a +# constant with very high probability. + +# For inference about the complex data set, we use more computation. You can +# experiment with different amounts of computation to see how the quality of +# the inferences degrade with less computation. Note that we are using a very +# simple generic inference algorithm in this +# tutorial, +# which really isn't suited for this more complex task. In later +# tutorials, +# we will learn how to write more efficient algorithms, so that accurate +# results can be obtained with significantly less computation. We will also see +# ways of annotating the model for better performance, no matter the inference +# algorithm. +# +# ##### Caveat: the following cell will run for 2-3 minutes. traces = [do_inference(changepoint_model, xs_dense, ys_complex, 100000) for _=1:12]; grid(render_changepoint_model_trace, traces) @@ -823,18 +1175,22 @@ grid(render_changepoint_model_trace, traces) # ------ # ### Exercise -# Write a function that takes a data set of x- and y-coordinates and plots the histogram of the probability distribution on the number of changepoints. +# Write a function that takes a data set of x- and y-coordinates and plots the +# histogram of the probability distribution on the number of changepoints. # Show the results for the `ys_simple` and `ys_complex` data sets. # -# Hint: The return value of `changepoint_model` is the tree of `Node` values. Walk this tree. +# Hint: The return value of `changepoint_model` is the tree of `Node` values. +# Walk this tree. # ### Solution # ------- # # ### Exercise -# Write a new version of `changepoint_model` that uses `@trace` without an address (e.g. `@trace()`) to make the recursive calls. +# Write a new version of `changepoint_model` that uses `@trace` without an +# address (e.g. 
`@trace()`) to make the recursive calls. # -# Hint: You will need to guarantee that all addresses are unique. How can you label each node in a binary tree using an integer? +# Hint: You will need to guarantee that all addresses are unique. How can you +# label each node in a binary tree using an integer? # ### Solution diff --git a/tutorials/Iterative inference in Gen.jl b/tutorials/Iterative inference in Gen.jl index 3cea46a..9efbde9 100644 --- a/tutorials/Iterative inference in Gen.jl +++ b/tutorials/Iterative inference in Gen.jl @@ -15,7 +15,10 @@ # # Tutorial: Basics of Iterative Inference Programming in Gen -# This tutorial introduces the basics of inference programming in Gen using iterative inference programs, which include Markov chain Monte Carlo algorithms. +# This +# tutorial +# introduces the basics of inference programming in Gen using iterative +# inference programs, which include Markov chain Monte Carlo algorithms. # ## The task: curve-fitting with outliers @@ -48,11 +51,14 @@ import Random # ## 1. Writing the model: a first attempt -# We begin, as usual, by writing a model: a Julia function responsible (conceptually) for simulating a fake dataset. +# We begin, as usual, by writing a model: a Julia function responsible +# (conceptually) for simulating a synthetic dataset. # -# Our model will take as input a vector of `x` coordinates, and produce as output corresponding `y` coordinates. A simple approach to writing this model might look something like this: +# Our model will take as input a vector of `x` coordinates, and produce as +# output corresponding `y` coordinates. A simple approach to writing this model +# might look something like this: -@gen function model(xs::Vector{Float64}) +@gen function regression_with_outliers(xs::Vector{Float64}) # First, generate some parameters of the model. We make these # random choices, because later, we will want to infer them # from data. 
The distributions we use here express our assumptions @@ -83,7 +89,15 @@ import Random ys end; -# This model does what we want: it samples several parameters of the data-generating process, then generates data accordingly. +# This model does what we want: it samples several parameters of the +# data-generating process, then generates data accordingly. +# And for now, it will do. + +using PyPlot # will take a few seconds to run. + +# #### Observed data +# +# Will generate observations (both inliers and outliers) synthetically. true_inlier_noise = 0.5 true_outlier_noise = 10. @@ -102,10 +116,18 @@ for (i, x) in enumerate(xs) end ys[end-3] = 14 ys[end-5] = 13 +scatter(xs, ys, color="black") +xlabel("X") +ylabel("Y") +title("Observations - regular data and outliers"); # ## 2. What our model is doing: visualizing the prior -# Let's visualize what our model is doing by drawing some samples from the prior. First, we'll need to write a function that serializes a trace for use by the GenViz library. Here, we make available the slope, intercept, noise level, and outlier classifications for each point, as these are the things that vary from trace to trace while doing inference. +# Let's visualize what our model is doing by drawing some samples from the +# prior. First, we'll need to write a function that serializes a trace for use +# by the GenViz library. Here, we make available the slope, intercept, noise +# level, and outlier classifications for each point, as these are the things +# that vary from trace to trace while doing inference. # + using GenViz @@ -125,6 +147,8 @@ end; server = VizServer(8091); +# ##### Warning: do not execute the above cell twice (restart the kernel if you do). 
+ # Finally, we generate some data and draw it: # + @@ -134,7 +158,7 @@ viz = Viz(server, joinpath(@__DIR__, "regression-viz/dist"), [xs]) # Generate ten traces and draw them into the visualization for i=1:10 - (trace, _) = Gen.generate(model, (xs,)) + (trace, _) = Gen.generate(regression_with_outliers, (xs,)) ys = Gen.get_retval(trace) putTrace!(viz, "t$(i)", serialize_trace(trace)) end @@ -143,11 +167,22 @@ end displayInNotebook(viz) # - -# Note that an outlier can occur anywhere — including close to the line — and that our model is capable of generating datasets in which the vast majority of points are outliers. +# ##### Legend: +# +# * red points: outliers; +# * blue points: inliers (i.e. regular data); +# * dark grey shading: noise associated with inliers; and +# * light grey shading: noise associated with outliers. +# +# Note that an outlier can occur anywhere — including close to the line — and +# that our model is capable of generating datasets in which the vast majority +# of points are outliers. # ## 3. The problem with generic importance sampling -# To motivate the need for more complex inference algorithms, let's begin by using the simple importance sampling method from the previous tutorial, and thinking about where it fails. +# To motivate the need for more complex inference algorithms, let's begin by +# using the simple importance sampling method from the previous tutorial, and +# thinking about where it fails. # # First, let us create a synthetic dataset to do inference _about_. @@ -173,9 +208,18 @@ function make_synthetic_dataset(n) end (xs, ys) = make_synthetic_dataset(20); +scatter(xs, ys, color="black") +xlabel("X") +ylabel("Y") +title("Observations - regular data and outliers"); # - -# In Gen, we express our _observations_ as an _Assignment_ that constrains the values of certain random choices to equal their observed values. 
Here, we want to constrain the values of the choices with address `:data => i => :y` (that is, the sampled $y$ coordinates) to equal the observed $y$ values. Let's write a helper function that takes in a vector of $y$ values and creates an Assignment that we can use to constrain our model: +# In Gen, we express our _observations_ as a `ChoiceMap` that constrains the +# values of certain random choices to equal their observed values. Here, we +# want to constrain the values of the choices with address `:data => i => :y` +# (that is, the sampled $y$ coordinates) to equal the observed $y$ values. +# Let's write a helper function that takes in a vector of $y$ values and +# creates a `ChoiceMap` that we can use to constrain our model: function make_constraints(ys::Vector{Float64}) constraints = Gen.choicemap() @@ -185,11 +229,13 @@ function make_constraints(ys::Vector{Float64}) constraints end; -# We can apply it to our dataset's vector of `ys` to make a set of constraints for doing inference: +# We can apply it to our dataset's vector of `ys` to make a set of constraints +# for doing inference: observations = make_constraints(ys); -# Now, we use the library function `importance_resampling` to draw approximate posterior samples given those observations: +# Now, we use the library function `importance_resampling` to draw approximate +# posterior samples given those observations: function logmeanexp(scores) logsumexp(scores) - log(length(scores)) @@ -198,45 +244,83 @@ end viz = Viz(server, joinpath(@__DIR__, "regression-viz/dist"), [xs]) log_probs = Vector{Float64}(undef, 10) for i=1:10 - (tr, _) = Gen.importance_resampling(model, (xs,), observations, 2000) + (tr, _) = Gen.importance_resampling(regression_with_outliers, (xs,), observations, 2000) putTrace!(viz, "t$(i)", serialize_trace(tr)) log_probs[i] = Gen.get_score(tr) end displayInNotebook(viz) println("Average log probability: $(logmeanexp(log_probs))") -# We see here that importance sampling hasn't completely 
failed: it generally finds a reasonable position for the line. But the details are off: there is little logic to the outlier classification, and the inferred noise around the line is too wide. The problem is that there are just too many variables to get right, and so sampling everything in one go is highly unlikely to produce a perfect hit. +# We see here that importance sampling hasn't completely failed: it generally +# finds a reasonable position for the line. But the details are off: there is +# little logic to the outlier classification, and the inferred noise around the +# line is too wide. The problem is that there are just too many variables to +# get right, and so sampling everything in one go is highly unlikely to produce +# a perfect hit. # -# In the remainder of this notebook, we'll explore techniques for finding the right solution _iteratively_, beginning with an initial guess and making many small changes, until we achieve a reasonable posterior sample. +# In the remainder of this notebook, we'll explore techniques for finding the +# right solution _iteratively_, beginning with an initial guess and making many +# small changes, until we achieve a reasonable posterior sample. # ## 4. MCMC Inference Part 1: Block Resimulation # ### What is MCMC? -# _Markov Chain Monte Carlo_ ("MCMC") methods are a powerful family of algorithms for iteratively producing approximate samples from a distribution (when applied to Bayesian inference problems, the posterior distribution of unknown (hidden) model variables given data). -# -# There is a rich theory behind MCMC methods, but we focus on applying MCMC in Gen and introducing theoretical ideas only when necessary for understanding. As we will see, Gen provides abstractions that hide and automate much of the math necessary for implementing MCMC algorithms correctly. -# -# The general shape of an MCMC algorithm is as follows. 
We begin by sampling an intial setting of all unobserved variables; in Gen, we produce an initial _trace_ consistent with (but not necessarily _probable_ given) our observations. Then, in a long-running loop, we make small, stochastic changes to the trace; in order for the algorithm to be asymptotically correct, these stochastic updates must satisfy certain probabilistic properties. -# -# One common way of ensuring that the updates do satisfy those properties is to compute a _Metropolis-Hastings acceptance ratio_. Essentially, after proposing a change to a trace, we add an "accept or reject" step that stochastically decides whether to commit the update or to revert it. This is an over-simplification, but generally speaking, this step ensures we are more likely to accept changes that make our trace fit the observed data better, and to reject ones that make our current trace worse. The algorithm also tries not to go down dead ends: it is more likely to take an exploratory step into a low-probability region if it knows it can easily get back to where it came from. -# -# Gen's `metropolis_hastings` function _automatically_ adds this "accept/reject" check (including the correct computation of the probability of acceptance or rejection), so that as inference programmers, we need only think about what sorts of updates might be useful to propose. Starting in this section, we'll look at several design patterns for MCMC updates, and how to apply them in Gen. +# _Markov Chain Monte Carlo_ ("MCMC") methods are a powerful family of +# algorithms for iteratively producing approximate samples from a distribution +# (when applied to Bayesian inference problems, the posterior distribution of +# unknown (hidden) model variables given data). +# +# There is a rich theory behind MCMC methods, but we focus on applying MCMC in +# Gen and introducing theoretical ideas only when necessary for understanding. 
+# As we will see, Gen provides abstractions that hide and automate much of the +# math necessary for implementing MCMC algorithms correctly. +# +# The general shape of an MCMC algorithm is as follows. We begin by sampling an +# initial setting of all unobserved variables; in Gen, we produce an initial +# _trace_ consistent with (but not necessarily _probable_ given) our +# observations. Then, in a long-running loop, we make small, stochastic changes +# to the trace; in order for the algorithm to be asymptotically correct, these +# stochastic updates must satisfy certain probabilistic properties. +# +# One common way of ensuring that the updates do satisfy those properties is to +# compute a _Metropolis-Hastings acceptance ratio_. Essentially, after +# proposing a change to a trace, we add an "accept or reject" step that +# stochastically decides whether to commit the update or to revert it. This is +# an over-simplification, but generally speaking, this step ensures we are more +# likely to accept changes that make our trace fit the observed data better, +# and to reject ones that make our current trace worse. The algorithm also +# tries not to go down dead ends: it is more likely to take an exploratory step +# into a low-probability region if it knows it can easily get back to where it +# came from. +# +# Gen's `metropolis_hastings` function _automatically_ adds this +# "accept/reject" check (including the correct computation of the probability +# of acceptance or rejection), so that as inference programmers, we need only +# think about what sorts of updates might be useful to propose. Starting in +# this section, we'll look at several design patterns for MCMC updates, and how +# to apply them in Gen. # ### Block Resimulation -# One of the simplest strategies we can use is called Resimulation MH, and it works as follows. +# One of the simplest strategies we can use is called Resimulation MH, and it +# works as follows. 
# -# We begin, as in most iterative inference algorithms, by sampling an initial trace from our model, fixing the observed choices to their observed values. +# We begin, as in most iterative inference algorithms, by sampling an initial +# trace from our model, fixing the observed choices to their observed values. # # ```julia -# # Gen's `initialize` function accepts a model, a tuple of arguments to the model, -# # and an Assignment representing observations (or constraints to satisfy). It returns +# # Gen's `generate` function accepts a model, a tuple of arguments to the model, +# # and a `ChoiceMap` representing observations (or constraints to satisfy). It returns # # a complete trace consistent with the observations, and an importance weight. -# (tr, _) = initialize(model, (xs,), observations) +# (tr, _) = generate(regression_with_outliers, (xs,), observations) # ``` -# Then, in each iteration of our program, we propose changes to all our model's variables in "blocks," by erasing a set of variables from our current trace and _resimulating_ them from the model. After resimulating each block of choices, we perform an accept/reject step, deciding whether the proposed changes are worth making. +# Then, in each iteration of our program, we propose changes to all our model's +# variables in "blocks," by erasing a set of variables from our current trace +# and _resimulating_ them from the model. After resimulating each block of +# choices, we perform an accept/reject step, deciding whether the proposed +# changes are worth making. # # ```julia # # Pseudocode @@ -248,17 +332,41 @@ println("Average log probability: $(logmeanexp(log_probs))") # end # ``` # -# The main design choice in designing a Block Resimulation MH algorithm is how to block the choices together for resimulation. At one extreme, we could put each random choice the model makes in its own block. 
At the other, we could put all variables into a single block (a strategy sometimes called "independent" MH, and which bears a strong similarity to importance resampling, as it involves repeatedly generating completely new traces and deciding whether to keep them or not). Usually, the right thing to do is somewhere in between. +# The main design choice in designing a Block Resimulation MH algorithm is how +# to block the choices together for resimulation. At one extreme, we could put +# each random choice the model makes in its own block. At the other, we could +# put all variables into a single block (a strategy sometimes called +# "independent" MH, and which bears a strong similarity to importance +# resampling, as it involves repeatedly generating completely new traces and +# deciding whether to keep them or not). Usually, the right thing to do is +# somewhere in between. # # For the regression problem, here is one possible blocking of choices: # -# **Block 1: `slope`, `intercept`, and `noise`.** These parameters determine the linear relationship; resimulating them is like picking a new line. We know from our importance sampling experiment above that before too long, we're bound to sample something close to the right line. -# -# **Blocks 2 through N+1: Each `is_outlier`, in its own block.** One problem we saw with importance sampling in this problem was that it tried to sample _every_ outlier classification at once, when in reality the chances of a single sample that correctly classifies all the points are very low. Here, we can choose to resimulate each `is_outlier` choice separately, and for each one, decide whether to use the resimulated value or not. -# -# **Block N+2: `prob_outlier`.** Finally, we can propose a new `prob_outlier` value; in general, we can expect to accept the proposal when it is line with the current hypothesized proportion of `is_outlier` choices that are set to `true`. 
-# -# Resimulating a block of variables is the simplest form of update that Gen's `metropolis_hastings` operator (or `mh` for short) supports. When supplied with a _current trace_ and a _selection_ of trace addresses to resimulate, `mh` performs the resimulation and the appropriate accept/reject check, then returns a possibly updated trace. A selection is created using the `select` method. So a single update of the scheme we proposed above would look like this: +# The main design choice in designing a Block Resimulation MH algorithm is how +# to block the choices together for resimulation. At one extreme, we could put +# each random choice the model makes in its own block. At the other, we could +# put all variables into a single block (a strategy sometimes called +# "independent" MH, and which bears a strong similarity to importance +# resampling, as it involves repeatedly generating completely new traces and +# deciding whether to keep them or not). Usually, the right thing to do is +# somewhere in between. +# +# For the regression problem, here is one possible blocking of choices: +# +# **Block 1: `slope`, `intercept`, and `noise`.** These parameters determine +# the linear relationship; resimulating them is like picking a new line. We +# know from our importance sampling experiment above that before too long, +# we're bound to sample something close to the right line. +# +# **Blocks 2 through N+1: Each `is_outlier`, in its own block.** One problem we +# saw with importance sampling in this problem was that it tried to sample +# _every_ outlier classification at once, when in reality the chances of a +# single sample that correctly classifies all the points are very low. Here, we +# can choose to resimulate each `is_outlier` choice separately, and for each +# one, decide whether to use the resimulated value or not. +# +# **Block N+2: `prob_outlier`.** Finally, we can propose a new `prob_outlier` +# value; in general, we can expect to accept the proposal when it is in line with +# the current hypothesized proportion of `is_outlier` choices that are set to +# `true`. +# +# Resimulating a block of variables is the simplest form of update that Gen's +# `metropolis_hastings` operator (or `mh` for short) supports. When supplied +# with a _current trace_ and a _selection_ of trace addresses to resimulate, +# `mh` performs the resimulation and the appropriate accept/reject check, then +# returns a possibly updated trace. A selection is created using the `select` +# method. 
So a single update of the scheme we proposed above would look like +# this: # Perform a single block resimulation update of a trace. function block_resimulation_update(tr) @@ -280,11 +388,12 @@ function block_resimulation_update(tr) tr end; -# All that's left is to (a) obtain an initial trace, and then (b) run that update in a loop for as long as we'd like: +# All that's left is to (a) obtain an initial trace, and then (b) run that +# update in a loop for as long as we'd like: function block_resimulation_inference(xs, ys) observations = make_constraints(ys) - (tr, _) = generate(model, (xs,), observations) + (tr, _) = generate(regression_with_outliers, (xs,), observations) for iter=1:500 tr = block_resimulation_update(tr) end @@ -300,23 +409,29 @@ for i=1:10 end println("Log probability: ", logmeanexp(scores)) -# We note that this is significantly better than importance sampling, even if we run importance sampling for about the same amount of (wall-clock) time per sample: +# We note that this is significantly better than importance sampling, even if +# we run importance sampling for about the same amount of (wall-clock) time per +# sample: scores = Vector{Float64}(undef, 10) for i=1:10 - @time (tr, _) = importance_resampling(model, (xs,), observations, 17000) + @time (tr, _) = importance_resampling(regression_with_outliers, (xs,), observations, 17000) scores[i] = get_score(tr) end println("Log probability: ", logmeanexp(scores)) -# It's one thing to see a log probability increase; it's better to understand what the inference algorithm is actually doing, and to see _why_ it's doing better. +# It's one thing to see a log probability increase; it's better to understand +# what the inference algorithm is actually doing, and to see _why_ it's doing +# better. # -# A great tool for debugging and improving MCMC algorithms is visualization. We can use GenViz's `displayInNotebook(viz) do ... 
end` syntax to produce an animated visualization: +# A great tool for debugging and improving MCMC algorithms is visualization. We +# can use GenViz's `displayInNotebook(viz) do ... end` syntax to produce an +# animated visualization: viz = Viz(server, joinpath(@__DIR__, "regression-viz/dist"), [xs, ys]) Random.seed!(2) displayInNotebook(viz) do - (tr, _) = generate(model, (xs,), observations) + (tr, _) = generate(regression_with_outliers, (xs,), observations) putTrace!(viz, "t", serialize_trace(tr)) for iter = 1:500 tr = block_resimulation_update(tr) @@ -335,17 +450,26 @@ end # (tr, did_accept) = mh(tr, select(:address1, :address2, ...)) # ``` # -# This update is incremental in that it only proposes changes to part of a trace (the selected addresses). But when computing _what_ changes to propose, it ignores the current state completely and resimulates all-new values from the model. +# This update is incremental in that it only proposes changes to part of a +# trace (the selected addresses). But when computing _what_ changes to propose, +# it ignores the current state completely and resimulates all-new values from +# the model. # -# That wholesale resimulation of values is often not the best way to search for improvements. To that end, Gen also offers a more general flavor of MH: +# That wholesale resimulation of values is often not the best way to search for +# improvements. To that end, Gen also offers a more general flavor of MH: # # ```julia # (tr, did_accept) = mh(tr, custom_proposal, custom_proposal_args) # ``` # -# A "custom proposal" is just what it sounds like: whereas before, we were using the _default resimulation proposal_ to come up with new values for the selected addresses, we can now pass in a generative function that samples proposed values however it wants. 
+# A "custom proposal" is just what it sounds like: whereas before, we were +# using the _default resimulation proposal_ to come up with new values for the +# selected addresses, we can now pass in a generative function that samples +# proposed values however it wants. # -# For example, here is a custom proposal that takes in a current trace, and proposes a new slope and intercept by randomly perturbing the existing values: +# For example, here is a custom proposal that takes in a current trace, and +# proposes a new slope and intercept by randomly perturbing the existing +# values: @gen function line_proposal(trace) choices = get_choices(trace) @@ -362,8 +486,14 @@ end; # ``` # # Two things to note: -# 1. We no longer need to pass a selection of addresses. Instead, Gen assumes that whichever addresses are sampled by the proposal (in this case, `:slope` and `:intercept`) are being proposed to. -# 2. The argument list to the proposal is an empty tuple, `()`. The `line_proposal` generative function does expect an argument, the previous trace, but this is supplied automatically to all MH custom proposals. +# +# 1. We no longer need to pass a selection of addresses. Instead, Gen assumes +# that whichever addresses are sampled by the proposal (in this case, +# `:slope` and `:intercept`) are being proposed to. +# +# 2. The argument list to the proposal is an empty tuple, `()`. The +# `line_proposal` generative function does expect an argument, the previous +# trace, but this is supplied automatically to all MH custom proposals. 
# Let's swap it into our update: @@ -384,7 +514,8 @@ function gaussian_drift_update(tr) tr end; -# If we compare the Gaussian Drift proposal visually with our old algorithm, we can see how the new behavior helps: +# If we compare the Gaussian Drift proposal visually with our old algorithm, we +# can see how the new behavior helps: # + viz = Viz(server, joinpath(@__DIR__, "regression-viz/dist"), [xs, ys]) @@ -395,7 +526,7 @@ Random.seed!(35) # Create the animation displayInNotebook(viz) do # Get an initial trace - (tr1, _) = generate(model, (xs,), observations) + (tr1, _) = generate(regression_with_outliers, (xs,), observations) tr2 = tr1 # Visualize the initial trace twice @@ -418,26 +549,27 @@ displayInNotebook(viz) do end # - -#
+# ----------- # ### Exercise: Analyzing the algorithms -# Run the cell above several times with different random seeds. Compare the two algorithms with respect to the following: +# Run the cell above several times with different random seeds. Compare the two +# algorithms with respect to the following: # # - How fast do they find a relatively good line? # -# - Does one of them tend to get stuck more than the other? Under what conditions? Why? +# - Does one of them tend to get stuck more than the other? Under what +# conditions? Why? -# *Your answer here* +# ----------- -#
- -# A more quantitative comparison demonstrates that our change has definitely improved our inference quality: +# A more quantitative comparison demonstrates that our change has definitely +# improved our inference quality: # + Random.seed!(1) function gaussian_drift_inference() - (tr, _) = generate(model, (xs,), observations) + (tr, _) = generate(regression_with_outliers, (xs,), observations) for iter=1:500 tr = gaussian_drift_update(tr) end @@ -454,13 +586,18 @@ println("Log probability: ", logmeanexp(scores)) # ## 6. MCMC Inference Part 3: Heuristics to guide the process -# In this section, we'll look at another strategy for improving MCMC inference: using arbitrary heuristics to make smarter proposals. In particular, we'll use a method called "Random Sample Consensus" (or RANSAC) to quickly find promising settings of the slope and intercept parameters. +# In this section, we'll look at another strategy for improving MCMC inference: +# using arbitrary heuristics to make smarter proposals. In particular, we'll +# use a method called "Random Sample Consensus" (or RANSAC) to quickly find +# promising settings of the slope and intercept parameters. # # RANSAC works as follows: # 1. We repeatedly choose a small random subset of the points, say, of size 3. -# 2. We do least-squares linear regression to find a line of best fit for those points. +# 2. We do least-squares linear regression to find a line of best fit for those +# points. # 3. We count how many points (from the entire set) are near the line we found. -# 4. After a suitable number of iterations (say, 10), we return the line that had the highest score. +# 4. After a suitable number of iterations (say, 10), we return the line that +# had the highest score. 
# # Here's our implementation of the algorithm in Julia: @@ -468,13 +605,13 @@ println("Log probability: ", logmeanexp(scores)) import StatsBase struct RANSACParams - # the number of random subsets to try + """the number of random subsets to try""" iters::Int - # the number of points to use to construct a hypothesis + """the number of points to use to construct a hypothesis""" subset_size::Int - # the error threshold below which a datum is considered an inlier + """the error threshold below which a datum is considered an inlier""" eps::Float64 function RANSACParams(iters, subset_size, eps) @@ -517,7 +654,8 @@ function ransac(xs::Vector{Float64}, ys::Vector{Float64}, params::RANSACParams) end; # - -# We can now wrap it in a Gen proposal that calls out to RANSAC, then samples a slope and intercept near the one it proposed. +# We can now wrap it in a Gen proposal that calls out to RANSAC, then samples a +# slope and intercept near the one it proposed. @gen function ransac_proposal(prev_trace, xs, ys) (slope, intercept) = ransac(xs, ys, RANSACParams(10, 3, 1.)) @@ -525,7 +663,14 @@ end; @trace(normal(intercept, 1.0), :intercept) end; -# (Notice that although `ransac` makes random choices, they are not addressed (and they happen outside of a Gen generative function), so Gen cannot reason about them. This is OK (see [Using probabilistic programs as proposals](https://arxiv.org/abs/1801.03612)). Writing proposals that have traced internal randomness (i.e., that make traced random choices that are not directly used in the proposal) can lead to better inference, but requires the use of a more complex version of Gen's `mh` operator, which is beyond the scope of this tutorial.) +# (Notice that although `ransac` makes random choices, they are not addressed +# (and they happen outside of a Gen generative function), so Gen cannot reason +# about them. This is OK (see [Using probabilistic programs as +# proposals](https://arxiv.org/abs/1801.03612)). 
Writing proposals that have +# traced internal randomness (i.e., that make traced random choices that are +# not directly used in the proposal) can lead to better inference, but requires +# the use of a more complex version of Gen's `mh` operator, which is beyond the +# scope of this tutorial.) # One iteration of our update algorithm will now look like this: @@ -549,7 +694,10 @@ function ransac_update(tr) tr end -# We can now run our main loop for just 5 iterations, and achieve pretty good results. (Of course, since we do 20 inner loop iterations in `ransac_update`, this is really closer to 100 iterations.) The running time is significantly lower than before, without a real dip in quality: +# We can now run our main loop for just 5 iterations, and achieve pretty good +# results. (Of course, since we do 20 inner loop iterations in `ransac_update`, +# this is really closer to 100 iterations.) The running time is significantly +# lower than before, without a real dip in quality: # + function ransac_inference() @@ -557,7 +705,7 @@ function ransac_inference() slope_intercept_init = choicemap() slope_intercept_init[:slope] = slope slope_intercept_init[:intercept] = intercept - (tr, _) = generate(model, (xs,), merge(observations, slope_intercept_init)) + (tr, _) = generate(regression_with_outliers, (xs,), merge(observations, slope_intercept_init)) for iter=1:5 tr = ransac_update(tr) end @@ -580,7 +728,7 @@ displayInNotebook(viz) do slope_intercept_init = choicemap() slope_intercept_init[:slope] = slope slope_intercept_init[:intercept] = intercept - (tr, _) = generate(model, (xs,), merge(observations, slope_intercept_init)) + (tr, _) = generate(regression_with_outliers, (xs,), merge(observations, slope_intercept_init)) putTrace!(viz, "t", serialize_trace(tr)) for iter = 1:5 (tr, _) = mh(tr, ransac_proposal, (xs, ys)) @@ -602,29 +750,49 @@ displayInNotebook(viz) do end end -#
+# ----------- # # ### Exercise: Improving the heuristic -# Currently, the RANSAC heuristic does not use the current trace's information at all. Try changing it to use the current state as follows: +# Currently, the RANSAC heuristic does not use the current trace's information +# at all. Try changing it to use the current state as follows: # -# Instead of a constant `eps` parameter that controls whether a point is considered an inlier, make this decision based on the currently hypothesized `noise` level. +# Instead of a constant `eps` parameter that controls whether a point is +# considered an inlier, make this decision based on the currently hypothesized +# `noise` level. # # Does this improve the inference? Do you have a guess as to why? -#
+ +# ----------- # # ### Exercise: A different heuristic -# Implement a heuristic-based proposal that selects the points that are currently classified as _inliers_, finds the line of best fit, and adds some noise. -#
+# Implement a heuristic-based proposal that selects the points that are +# currently classified as _inliers_, finds the line of best fit, and adds some +# noise. +# ----------- # ### Exercise: Initialization # -# In our inference program above, when generating an initial trace on which to iterate, we initialize the slope and intercept to values proposed by RANSAC. If we don't do this, the performance decreases sharply, despite the fact that we still propose new slope/intercept pairs from RANSAC once the loop starts. Why is this? +# In our inference program above, when generating an initial trace on which to +# iterate, we initialize the slope and intercept to values proposed by RANSAC. +# If we don't do this, the performance decreases sharply, despite the fact that +# we still propose new slope/intercept pairs from RANSAC once the loop starts. +# Why is this? # ## 7. MAP Optimization -# Everything we've done so far has been within the MCMC framework. But sometimes you're not interested in getting posterior samples—sometimes you just want a single likely explanation for your data. Gen also provides tools for _maximum a posteriori_ estimation ("MAP estimation"), the problem of finding a trace that maximizes the posterior probability under the model given observations. +# Everything we've done so far has been within the MCMC framework. But +# sometimes you're not interested in getting posterior samples—sometimes you +# just want a single likely explanation for your data. Gen also provides tools +# for _maximum a posteriori_ estimation ("MAP estimation"), the problem of +# finding a trace that maximizes the posterior probability under the model +# given observations. -# For example, let's say we wanted to take a trace and assign each point's `is_outlier` score to the most likely possibility. We can do this by iterating over both possible traces, scoring them, and choosing the one with the higher score. 
We can do this using Gen's [`update`](https://probcomp.github.io/Gen/dev/ref/gfi/#Update-1) function, which allows us to manually update a trace to satisfy some constraints: +# For example, let's say we wanted to take a trace and assign each point's +# `is_outlier` score to the most likely possibility. We can do this by +# iterating over both possible traces, scoring them, and choosing the one with +# the higher score. We can do this using Gen's +# [`update`](https://probcomp.github.io/Gen/dev/ref/gfi/#Update-1) function, +# which allows us to manually update a trace to satisfy some constraints: function is_outlier_map_update(tr) (xs,) = get_args(tr) @@ -654,7 +822,7 @@ ransac_score, final_score = displayInNotebook(viz) do slope_intercept_init = choicemap() slope_intercept_init[:slope] = slope slope_intercept_init[:intercept] = intercept - (tr,) = generate(model, (xs,), merge(observations, slope_intercept_init)) + (tr,) = generate(regression_with_outliers, (xs,), merge(observations, slope_intercept_init)) sleep(1) putTrace!(viz, "t", serialize_trace(tr)) for iter=1:5 @@ -697,7 +865,7 @@ function map_inference() slope_intercept_init = choicemap() slope_intercept_init[:slope] = slope slope_intercept_init[:intercept] = intercept - (tr, _) = generate(model, (xs,), merge(observations, slope_intercept_init)) + (tr, _) = generate(regression_with_outliers, (xs,), merge(observations, slope_intercept_init)) for iter=1:5 tr = ransac_update(tr) end @@ -727,15 +895,24 @@ end println(logmeanexp(scores)) # - -# This doesn't necessarily mean that it's "better," though. It finds the most probable explanation of the data, which is a different problem from the one we tackled with MCMC inference. There, the goal was to sample from the posterior, which allows us to better characterize our uncertainty. 
Using MCMC, there might be a borderline point that is sometimes classified as an outlier and sometimes not, reflecting our uncertainty; with MAP optimization, we will always be shown the most probable answer. +# This doesn't necessarily mean that it's "better," though. It finds the most +# probable explanation of the data, which is a different problem from the one +# we tackled with MCMC inference. There, the goal was to sample from the +# posterior, which allows us to better characterize our uncertainty. Using +# MCMC, there might be a borderline point that is sometimes classified as an +# outlier and sometimes not, reflecting our uncertainty; with MAP optimization, +# we will always be shown the most probable answer. -#
-# +# --------- # ## Exercise: Bimodal posterior -# Generate a dataset for which there are two distinct possible explanations under our model. Then, for each algorithm discussed above, consider # -# 1. Whether the algorithm will be able to generate high-probability samples at all; +# Generate a dataset for which there are two distinct possible explanations +# under our model `regression_with_outliers`. Then, for each algorithm +# discussed above, consider +# +# 1. Whether the algorithm will be able to generate high-probability samples at +# all; # 2. Whether the algorithm, across runs, will generate samples from both modes; # 3. Whether the algorithm, within a single run, will explore both modes. # -#
+# --------- diff --git a/tutorials/Modeling with Black-Box Julia Code.jl b/tutorials/Modeling with Black-Box Julia Code.jl index 0f83cbf..01d9cbf 100644 --- a/tutorials/Modeling with Black-Box Julia Code.jl +++ b/tutorials/Modeling with Black-Box Julia Code.jl @@ -14,9 +14,15 @@ # # Modeling with black-box Julia code -# This notebook shows how "black-box" code like algorithms and simulators can be included in probabilistic models that are expressed as generative functions. +# This notebook shows how "black-box" code like algorithms and simulators can +# be included in probabilistic models that are expressed as generative +# functions. -# We will write a generative probabilistic model of the motion of an intelligent agent that is navigating a two-dimensional scene. The model will be *algorithmic* --- it will invoke a path planning algorithm implemented in regular Julia code to generate the agent's motion plan from its destination its map of the scene. +# We will write a generative probabilistic model of the motion of an +# intelligent agent that is navigating a two-dimensional scene. The model will +# be *algorithmic* --- it will invoke a path planning algorithm implemented in +# regular Julia code to generate the agent's motion plan from its destination +# its map of the scene. using Gen @@ -29,7 +35,11 @@ using GenViz viz_server = VizServer(8090) sleep(1) -# First, we load some basic geometric primitives for a two-dimensional scene in the following file: +# ##### Warning -- do not execute the above cell twice. +# +# +# First, we load some basic geometric primitives for a two-dimensional scene in +# the following file: include("../inverse-planning/geometric_primitives.jl"); @@ -39,15 +49,19 @@ point = Point(1.0, 2.0) println(point.x) println(point.y) -# The file also defines an `Obstacle` data type, which represents a polygonal obstacle in a two-dimensional scene, that is constructed from a list of vertices. 
Here, we construct a square: +# The file also defines an `Obstacle` data type, which represents a polygonal +# obstacle in a two-dimensional scene, that is constructed from a list of +# vertices. Here, we construct a square: obstacle = Obstacle([Point(0.0, 0.0), Point(1.0, 0.0), Point(0.0, 1.0), Point(1.0, 1.0)]); -# Next, we load the definition of a `Scene` data type that represents a two-dimensional scene. +# Next, we load the definition of a `Scene` data type that represents a +# two-dimensional scene. include("../inverse-planning/scene.jl"); -# The scene spans a rectangle of on the two-dimensional x-y plane, and contains a list of obstacles, which is initially empty: +# The scene spans a rectangle on the two-dimensional x-y plane, and contains +# a list of obstacles, which is initially empty: # # `scene = Scene(xmin::Real, xmax::Real, ymin::Real, ymax::real)` @@ -59,11 +73,15 @@ add_obstacle!(scene, obstacle); # The file also defines functions that simplify the construction of obstacles: # -# `make_square(center::Point, size::Float64)` constructs a square-shaped obstacle centered at the given point with the given side length: +# `make_square(center::Point, size::Float64)` constructs a square-shaped +# obstacle centered at the given point with the given side length: obstacle = make_square(Point(0.30, 0.20), 0.1) -# `make_line(vertical::Bool, start::Point, length::Float64, thickness::Float64)` constructs an axis-aligned line (either vertical or horizontal) with given thickness that extends from a given strating point for a certain length: +# `make_line(vertical::Bool, start::Point, length::Float64, +# thickness::Float64)` constructs an axis-aligned line (either vertical or +# horizontal) with given thickness that extends from a given starting point for +# a certain length: obstacle = make_line(false, Point(0.20, 0.40), 0.40, 0.02) @@ -82,37 +100,55 @@ add_obstacle!(scene, make_line(horizontal, Point(0.60 - 0.15, 0.80), 0.15 + wall add_obstacle!(scene,
make_line(horizontal, Point(0.20, 0.80), 0.15, wall_thickness)) add_obstacle!(scene, make_line(vertical, Point(0.20, 0.40), 0.40, wall_thickness)); -# We visualize the scene below. Don't worry about the details of this cell. It should display a two-dimensional map of the scene. +# We visualize the scene below. Don't worry about the details of this cell. It +# should display a two-dimensional map of the scene. info = Dict("scene" => scene) viz = Viz(viz_server, joinpath(@__DIR__, "../inverse-planning/overlay-viz/dist"), info) displayInNotebook(viz) -# Next, we load a file that defines a `Path` data type (a sequence of `Points`), and a `plan_path` method, which uses a path planning algorithm based on rapidly exploring random tree (RRT, [1]) to find a sequence of `Point`s beginning with `start` and ending in `dest` such that the line segment between each consecutive pair of points does nt intersect any obstacles in the scene. The planning algorithm may fail to find a valid path, in which case it will return a value of type `Nothing`. +# Next, we load a file that defines a `Path` data type (a sequence of +# `Points`), and a `plan_path` method, which uses a path planning algorithm +# based on rapidly exploring random tree (RRT, [1]) to find a sequence of +# `Point`s beginning with `start` and ending in `dest` such that the line +# segment between each consecutive pair of points does not intersect any +# obstacles in the scene. The planning algorithm may fail to find a valid path, +# in which case it will return a value of type `Nothing`. # -# `path::Union{Path,Nothing} = plan_path(start::Point, dest::Point, scene::Scene, planner_params::PlannerParams)` +# `path::Union{Path,Nothing} = plan_path(start::Point, dest::Point, +# scene::Scene, planner_params::PlannerParams)` # -# [1] Rapidly-exploring random trees: A new tool for path planning. S. M. LaValle.
TR 98-11, Computer Science Dept., Iowa State University, October 1998, +# [1] Rapidly-exploring random trees: A new tool for path planning. S. M. +# LaValle. TR 98-11, Computer Science Dept., Iowa State University, October +# 1998, include("../inverse-planning/planning.jl"); -# Let's use `plan_path` to plan a path from the lower-left corner of the scene into the interior of the box. +# Let's use `plan_path` to plan a path from the lower-left corner of the scene +# into the interior of the box. start = Point(0.1, 0.1) dest = Point(0.5, 0.5) planner_params = PlannerParams(300, 3.0, 2000, 1.) path = plan_path(start, dest, scene, planner_params) -# We visualize the path below. The start location is shown in blue, the destination in red, and the path in orange. Run the cell above followed by the cell below a few times to see the variability in the paths generated by `plan_path` for these inputs. +# We visualize the path below. The start location is shown in blue, the +# destination in red, and the path in orange. Run the cell above followed by +# the cell below a few times to see the variability in the paths generated by +# `plan_path` for these inputs. info = Dict("start"=> start, "dest" => dest, "scene" => scene, "path_edges" => get_edges(path)) viz = Viz(viz_server, joinpath(@__DIR__, "../inverse-planning/overlay-viz/dist"), info) displayInNotebook(viz) # We also need a model for how the agent moves along its path. -# We will assume that the agent moves along its path a constant speed. The file loaded above also defines a method (`walk_path`) that computes the locations of the agent at a set of timepoints (sampled at time intervals of `dt` starting at time `0.`), given the path and the speed of the agent: +# We will assume that the agent moves along its path a constant speed. 
The file +# loaded above also defines a method (`walk_path`) that computes the locations +# of the agent at a set of timepoints (sampled at time intervals of `dt` +# starting at time `0.`), given the path and the speed of the agent: # -# `locations::Vector{Point} = walk_path(path::Path, speed::Float64, dt::Float64, num_ticks::Int)` +# `locations::Vector{Point} = walk_path(path::Path, speed::Float64, +# dt::Float64, num_ticks::Int)` speed = 1. dt = 0.1 @@ -161,14 +197,20 @@ println(locations) return (planning_failed, maybe_path) end; -# We can now perform a traced execution of `agent_model` and print out the random choices it made: +# We can now perform a traced execution of `agent_model` and print out the +# random choices it made: planner_params = PlannerParams(300, 3.0, 2000, 1.) (trace, _) = Gen.generate(agent_model, (scene, dt, num_ticks, planner_params)); choices = Gen.get_choices(trace) println(choices) -# Next we explore the assumptions of the model by sampling many traces from the generative function and visualizing them. We have created a visualization specialized for this generative function for use with the `GenViz` package, in the directory `../inverse-planning/grid-viz/dist`. We have also defined a `trace_to_dict` method to convert the trace into a value that can be automatically serialized into a JSON string for use by the visualization: +# Next we explore the assumptions of the model by sampling many traces from the +# generative function and visualizing them. We have created a visualization +# specialized for this generative function for use with the `GenViz` package, +# in the directory `../inverse-planning/grid-viz/dist`. 
We have also defined a +# `trace_to_dict` method to convert the trace into a value that can be +# automatically serialized into a JSON string for use by the visualization: function trace_to_dict(trace) args = Gen.get_args(trace) @@ -202,7 +244,8 @@ function trace_to_dict(trace) return d end; -# Let's visualize some traces of the function, with the start location fixed to a point in the lower-left corner: +# Let's visualize some traces of the function, with the start location fixed to +# a point in the lower-left corner: # + constraints = Gen.choicemap() @@ -217,23 +260,34 @@ end displayInNotebook(viz) # - -# In this visualization, the start location is represented by a blue dot, and the destination is represented by a red dot. The measured coordinates at each time point are represented by black dots. The path, if path planning was succesfull, is shown as a gray line fro the start point to the destination point. Notice that the speed of the agent is different in each case. +# In this visualization, the start location is represented by a blue dot, and +# the destination is represented by a red dot. The measured coordinates at each +# time point are represented by black dots. The path, if path planning was +# successful, is shown as a gray line from the start point to the destination +# point. Notice that the speed of the agent is different in each case. # ### Exercise # -# Constrain the start and destination points to two particular locations in the scene, and visualize the distribution on paths. Find a start point and destination point such that the distributions on paths is multimodal (i.e.
+# there are two spatially separated paths that the agent may take). Describe +# the variability in the planned paths. # ### Solution # # -# We now write a simple algorithm for inferring the destination of an agent given (i) the scene, (ii) the start location of the agent, and (iii) a sequence of measured locations of the agent for each tick. +# We now write a simple algorithm for inferring the destination of an agent +# given (i) the scene, (ii) the start location of the agent, and (iii) a +# sequence of measured locations of the agent for each tick. # # We will assume the agent starts in the lower left-hand corner. start = Point(0.1, 0.1); -# We will infer the destination of the agent for the given sequence of observed locations: +# We will infer the destination of the agent for the given sequence of observed +# locations: measurements = [ Point(0.0980245, 0.104775), @@ -258,7 +312,7 @@ displayInNotebook(viz) function do_inference_agent_model(scene::Scene, dt::Float64, num_ticks::Int, planner_params::PlannerParams, start::Point, measurements::Vector{Point}, amount_of_computation::Int) - # Create an "Assignment" that maps model addresses (:y, i) + # Create a ChoiceMap that maps model addresses (:y, i) # to observed values ys[i]. We leave :slope and :intercept # unconstrained, because we want them to be inferred. observations = Gen.choicemap() @@ -289,7 +343,13 @@ end displayInNotebook(viz) # ### Exercise -# The first argument to `PlannerParams` is the number of iterations of the RRT algorithm to use. The third argument to `PlannerParams` is the number of iterations of path refinement. These parameters affect the distribution on paths of the agent. Visualize traces of the `agent_model` for with a couple different settings of these two parameters to the path planning algorithm for fixed starting point and destination point. Try setting them to smaller values. Discuss. 
+# The first argument to `PlannerParams` is the number of iterations of the RRT +# algorithm to use. The third argument to `PlannerParams` is the number of +# iterations of path refinement. These parameters affect the distribution on +# paths of the agent. Visualize traces of the `agent_model` with a couple of +# different settings of these two parameters to the path planning algorithm for +# fixed starting point and destination point. Try setting them to smaller +# values. Discuss. # ### Solution # diff --git a/tutorials/Modeling with TensorFlow Code.jl b/tutorials/Modeling with TensorFlow Code.jl index de19292..e8ab023 100644 --- a/tutorials/Modeling with TensorFlow Code.jl +++ b/tutorials/Modeling with TensorFlow Code.jl @@ -14,30 +14,60 @@ # # Modeling with TensorFlow code # -# So far, we have seen generative functions that are defined only using the built-in modeling language, which uses the `@gen` keyword. However, Gen can also be extended with other modeling languages, as long as they produce generative functions that implement the [Generative Function Interface](https://probcomp.github.io/Gen/dev/ref/gfi/). The [GenTF](https://github.com/probcomp/GenTF) Julia package provides one such modeling language which allow generative functions to be constructed from user-defined TensorFlow computation graphs. Generative functions written in the built-in language can invoke generative functions defined using the GenTF language. +# So far, we have seen generative functions that are defined only using the +# built-in modeling language, which uses the `@gen` keyword. However, Gen can +# also be extended with other modeling languages, as long as they produce +# generative functions that implement the [Generative Function +# Interface](https://probcomp.github.io/Gen/dev/ref/gfi/). 
The +# [GenTF](https://github.com/probcomp/GenTF) Julia package provides one such +# modeling language which allow generative functions to be constructed from +# user-defined TensorFlow computation graphs. Generative functions written in +# the built-in language can invoke generative functions defined using the GenTF +# language. # -# This notebook shows how to write a generative function in the GenTF language, how to invoke a GenTF generative function from a `@gen` function, and how to perform basic supervised training of a generative function. Specifically, we will train a softmax regression conditional inference model to generate the label of an MNIST digit given the pixels. Later tutorials will show how to use deep learning and TensorFlow to accelerate inference in generative models, using ideas from "amortized inference". - -# NOTE: Only attempt to run this notebook if you have a working installation of TensorFlow and GenTF (see the [GenTF installation instructions](https://probcomp.github.io/GenTF/dev/#Installation-1)). +# This notebook shows how to write a generative function in the GenTF language, +# how to invoke a GenTF generative function from a `@gen` function, and how to +# perform basic supervised training of a generative function. Specifically, we +# will train a softmax regression conditional inference model to generate the +# label of an MNIST digit given the pixels. Later +# tutorials +# will show how to use deep learning and TensorFlow to accelerate inference in +# generative models, using ideas from "amortized inference". + +# NOTE: Only attempt to run this notebook if you have a working installation of +# TensorFlow and GenTF (see the [GenTF installation +# instructions](https://probcomp.github.io/GenTF/dev/#Installation-1)). using Gen using PyPlot -# First, we load the GenTF package and the PyCall package. 
The PyCall package is used because TensorFlow computation graphs are constructed using the TensorFlow Python API, and the PyCall package allows Python code to be run from Julia. +# First, we load the GenTF package and the PyCall package. The PyCall package +# is used because TensorFlow computation graphs are constructed using the +# TensorFlow Python API, and the PyCall package allows Python code to be run +# from Julia. using GenTF using PyCall -# We text load the TensorFlow and TensorFlow.nn Python modules into our scope. The `@pyimport` macro is defined by PyCall. +# We next load the TensorFlow and TensorFlow.nn Python modules into our scope. +# The `@pyimport` macro is defined by PyCall. tf = pyimport("tensorflow") tf.compat.v1.disable_eager_execution() nn = tf.nn -# Next, we define a TensorFlow computation graph. The graph will have placeholders for an N x 784 matrix of pixel values, where N is the number of images that will be processed in batch, and 784 is the number of pixels in an MNIST image (28x28). There are 10 possible digit classes. The `probs` Tensor is an N x 10 matrix, where each row of the matrix is the vector of normalized probabilities of each digit class for a single input image. Note that this code is largely identical to the corresponding Python code. We provide initial values for the weight and bias parameters that are computed in Julia (it is also possible to use TensorFlow initializers for this purpose). +# Next, we define a TensorFlow computation graph. The graph will have +# placeholders for an N x 784 matrix of pixel values, where N is the number of +# images that will be processed in batch, and 784 is the number of pixels in an +# MNIST image (28x28). There are 10 possible digit classes. The `probs` Tensor +# is an N x 10 matrix, where each row of the matrix is the vector of normalized +# probabilities of each digit class for a single input image. Note that this +# code is largely identical to the corresponding Python code.
We provide +# initial values for the weight and bias parameters that are computed in Julia +# (it is also possible to use TensorFlow initializers for this purpose). # + -# input images, shape (N, 784) +# A set of input images as a matrix, shape (N, 784) xs = tf.compat.v1.placeholder(tf.float64, shape=(nothing, 784)) # weight matrix parameter for soft-max regression, shape (784, 10) @@ -54,27 +84,45 @@ b = tf.compat.v1.Variable(init_b) probs = nn.softmax(tf.add(tf.matmul(xs, W), b), axis=1); # - -# Next, we construct the generative function from this graph. The GenTF package provides a `TFFunction` type that implements the generative function interface. The `TFFunction` constructor takes: +# Next, we construct the generative function from this graph. The GenTF package +# provides a `TFFunction` type that implements the generative function +# interface. The `TFFunction` constructor takes: # -# (i) A vector of Tensor objects that will be the trainable parameters of the generative function (`[W, b]`). These should be TensorFlow variables. +# (i) A vector of Tensor objects that will be the trainable parameters of the +# generative function (`[W, b]`). These should be TensorFlow variables. # -# (ii) A vector of Tensor object that are the inputs to the generative function (`[xs]`). These should be TensorFlow placeholders. +# (ii) A vector of Tensor object that are the inputs to the generative +# function (`[xs]`). These should be TensorFlow placeholders. # -# (iii) The Tensor object that is the return value of the generative function (`probs`). +# (iii) The Tensor object that is the return value of the generative function +# (`probs`). tf_softmax_model = TFFunction([W, b], [xs], probs); -# The `TFFunction` constructor creates a new TensorFlow session that will be used to execute all TensorFlow code for this generative function. It is also TensorFlow possible to supply a session explicitly to the constructor. 
See the [GenTF documentation](https://probcomp.github.io/GenTF/dev/) for more details. +# The `TFFunction` constructor creates a new TensorFlow session that will be +# used to execute all TensorFlow code for this generative function. It is also +# possible to supply a session explicitly to the constructor. See +# the [GenTF documentation](https://probcomp.github.io/GenTF/dev/) for more +# details. + +# To test `tf_softmax_model` we generate 5 synthetic images with random pixel values: + +synthetic_xs = rand(5, 784) + +# A single synthetic image can be accessed with: -# We can run the resulting generative function on some fake input data. This causes the TensorFlow to execute code in the TensorFlow session associated with `tf_softmax_model`: +synthetic_xs[1, :] -fake_xs = rand(5, 784) -probs = tf_softmax_model(fake_xs) +# We can run the resulting generative function on this synthetic input data. +# This causes the TensorFlow to execute code in the TensorFlow session +# associated with `tf_softmax_model`: + +probs = tf_softmax_model(synthetic_xs) println(size(probs)) -# We can also use `Gen.initialize` to obtain a trace of this generative function. +# We can also use `Gen.generate` to obtain a trace of this generative function. -(trace, _) = Gen.generate(tf_softmax_model, (fake_xs,)); +(trace, _) = Gen.generate(tf_softmax_model, (synthetic_xs,)); # Note that generative functions constructed using GenTF do not make random choices: @@ -84,7 +132,9 @@ println(Gen.get_choices(trace)) println(size(Gen.get_retval(trace))) -# Finally, we write a generative function using the built-in modeling DSL that invokes the TFFunction generative function we just defined. Note that we wrap the call to `tf_softmax_model` in an `@addr` statement. +# Finally, we write a generative function using the built-in modeling DSL that +# invokes the TFFunction generative function we just defined. Note that we wrap +# the call to `tf_softmax_model` in a `@trace` statement.
@gen function digit_model(xs::Matrix{Float64}) @@ -101,26 +151,45 @@ println(size(Gen.get_retval(trace))) end end; -# Let's obtain a trace of `digit_model` on the fake tiny input: +# Let's obtain a trace of `digit_model` on the synthetic tiny input: -(trace, _) = Gen.generate(digit_model, (fake_xs,)); +(trace, _) = Gen.generate(digit_model, (synthetic_xs,)); -# We see that the `net` generative function does not make any random choices. The only random choices are the digit labels for each input input: +# We see that the `net` generative function does not make any random choices. +# The only random choices are the digit labels for each input: println(Gen.get_choices(trace)) -# Before the `digit_model` will be useful for anything, it needs to be trained. We load some code for loading batches of MNIST training data. +# Before the `digit_model` will be useful for anything, it needs to be trained. +# We load some code for loading batches of MNIST training data. include("mnist.jl") training_data_loader = MNISTTrainDataLoader(); -# Now, we train the trainable parameters of the `tf_softmax_model` generative function (`W` and `b`) on the MNIST traing data. Note that these parameters are stored as the state of the TensorFlow variables. We will use the [`Gen.train!`](https://probcomp.github.io/Gen/dev/ref/inference/#Gen.train!) method, which supports supervised training of generative functions using stochastic gradient opimization methods. In particular, this method takes the generative function to be trained (`digit_model`), a Julia function of no arguments that generates a batch of training data, and the update to apply to the trainable parameters. - -# The `ParamUpdate` constructor takes the type of update to perform (in this case a gradient descent update with step size 0.00001), and a specification of which trainable parameters should be updated). 
Here, we request that the `W` and `b` trainable parameters of the `tf_softmax_model` generative function should be trained. +# Now, we train the trainable parameters of the `tf_softmax_model` generative +# function (`W` and `b`) on the MNIST training data. Note that these parameters +# are stored as the state of the TensorFlow variables. We will use the +# [`Gen.train!`](https://probcomp.github.io/Gen/dev/ref/inference/#Gen.train!) +# method, which supports supervised training of generative functions using +# stochastic gradient optimization methods. In particular, this method takes the +# generative function to be trained (`digit_model`), a Julia function of no +# arguments that generates a batch of training data, and the update to apply to +# the trainable parameters. + +# The `ParamUpdate` constructor takes the type of update to perform (in this +# case a gradient descent update with step size 0.00001), and a specification +# of which trainable parameters should be updated. Here, we request that the +# `W` and `b` trainable parameters of the `tf_softmax_model` generative +# function should be trained. update = Gen.ParamUpdate(Gen.FixedStepGradientDescent(0.00001), tf_softmax_model => [W, b]); -# For the data generator, we obtain a batch of 100 MNIST training images. The data generator must return a tuple, where the first element is a set of arguments to the generative function being trained (`(xs,)`) and the second element contains the values of random choices. +# For the data generator, we obtain a batch of 100 MNIST training images. The +# data generator must return a tuple, where the first element is a set of +# arguments to the generative function being trained (`(xs,)`) and the second +# element contains the values of random choices. 
`train!` attempts to maximize +# the expected log probability of these random choices given their +# corresponding input values. function data_generator() (xs, ys) = next_batch(training_data_loader, 100) @@ -134,7 +203,9 @@ function data_generator() ((xs,), constraints) end; -# We run 10000 iterations of stochastic gradient descent, where each iteration uses a batch of 100 images to get a noisy gradient estimate. This might take one or two minutes. +# We run 10000 iterations of stochastic gradient descent, where each iteration +# uses a batch of 100 images to get a noisy gradient estimate. This might take +# one or two minutes. @time scores = Gen.train!(digit_model, data_generator, update; num_epoch=10000, epoch_size=1, num_minibatch=1, minibatch_size=1, verbose=false); @@ -147,7 +218,15 @@ ylabel("Estimate of expected conditional log likelihood"); # ### Exercise # -# It is common to "vectorize" deep learning code so that it runs on multiple inputs at a time. This is important for making efficient use of GPU resources when training. The TensorFlow code above is vectorized across images. Construct a new TFFunction that only runs on one image at a time, and use a Julia for loop over multiple invocations of this new TFFunction, one for each image. Run the training procedure for 100 iterations. Comment on the performance difference. +# It is common to "vectorize" deep learning code so that it runs on multiple +# inputs at a time. This is important for making efficient use of GPU resources +# when training. The TensorFlow code above is vectorized across images. +# Construct a new `TFFunction` that only runs on one image at a time, and use a +# Julia `for` loop over multiple invocations of this new `TFFunction`, one for +# each image. +# +# Run the training procedure for 100 iterations. Comment on the performance +# difference. 
# ### Solution # @@ -169,11 +248,13 @@ b_unvec = tf.compat.v1.Variable(init_b) probs_unvec = tf.squeeze(nn.softmax(tf.add(tf.matmul(tf.expand_dims(x, axis=0), W_unvec), b_unvec), axis=1), axis=0); # - -# We also provide you with the definition of the new TFFunction based on this computation graph: +# We also provide you with the definition of the new TFFunction based on this +# computation graph: tf_softmax_model_single = TFFunction([W_unvec, b_unvec], [x], probs_unvec); -# We will use an update that modifies the parameters of `tf_softmax_model_single`: +# We will use an update that modifies the parameters of +# `tf_softmax_model_single`: update_unvec = Gen.ParamUpdate(Gen.FixedStepGradientDescent(0.00001), tf_softmax_model_single => [W_unvec, b_unvec]); @@ -203,7 +284,8 @@ end; return nothing end; -# After you have filled in the cells above, try running `digit_model_unvectorized` on some input to help debug your solution. +# After you have filled in the cells above, try running +# `digit_model_unvectorized` on some input to help debug your solution. # Next, fill in the data generator that you will use to train the new model: diff --git a/tutorials/Particle Filtering in Gen.jl b/tutorials/Particle Filtering in Gen.jl index 21d30f9..b504255 100644 --- a/tutorials/Particle Filtering in Gen.jl +++ b/tutorials/Particle Filtering in Gen.jl @@ -17,11 +17,23 @@ # # Sequential Monte Carlo (SMC) methods such as particle filtering iteratively solve a *sequence of inference problems* using techniques based on importance sampling and in some cases MCMC. The solution to each problem in the sequence is represented as a collection of samples or *particles*. The particles for each problem are based on extending or adjusting the particles for the previous problem in the sequence. # -# The sequence of inference problems that are solved often arise naturally from observations that arrive incrementally, as in particle filtering. 
The problems can also be constructed instrumentally to facilitate inference, as in annealed importance sampling [3]. This tutorial shows how to use Gen to implement a particle filter for a tracking problem that uses "rejuvenation" MCMC moves. Specifically, we will address the "bearings only tracking" problem described in [4]. +# The sequence of inference problems that are solved often arise naturally from +# observations that arrive incrementally, as in particle filtering. The +# problems can also be constructed instrumentally to facilitate inference, as +# in annealed importance sampling [3]. This +# tutorial +# shows how to use Gen to implement a particle filter for a tracking problem +# that uses "rejuvenation" MCMC moves. Specifically, we will address the +# "bearings only tracking" problem described in [4]. +# +# **If you are not familiar with Particle Filtering, we recommend first reading +# [1] and/or [2] before +# working through this tutorial.** +# +# # [1] Doucet, Arnaud, Nando De Freitas, and Neil Gordon. "An introduction to sequential Monte Carlo methods." Sequential Monte Carlo methods in practice. Springer, New York, NY, 2001. 3-14. # -# [2] Del Moral, Pierre, Arnaud Doucet, and Ajay Jasra. "Sequential monte carlo samplers." Journal of the Royal Statistical Society: Series B (Statistical Methodology) 68.3 (2006): 411-436. +# [2] Del Moral, Pierre, Arnaud Doucet, and Ajay Jasra. "Sequential Monte Carlo samplers." Journal of the Royal Statistical Society: Series B (Statistical Methodology) 68.3 (2006): 411-436. # # [3] Neal, Radford M. "Annealed importance sampling." Statistics and computing 11.2 (2001): 125-139. # @@ -42,17 +54,26 @@ using PyPlot # ## 1. Implementing the generative model # -# We will implement a generative model for the movement of a point in the x-y plane and bearing measurements of the location of this point relative to the origin over time. 
+# We will implement a generative model for the movement of a point in the x-y +# plane and bearing measurements of the location of this point relative to the +# origin over time. # -# We assume that we know the approximate initial position and velocity of the point. We assume the point's x- and y- velocity are subject to random perturbations drawn from some normal distribution with a known variance. Each bearing measurement consists of the angle of the point being tracked relative to the positive x-axis. +# We assume that we know the approximate initial position and velocity of the +# point. We assume the point's x- and y- velocity are subject to random +# perturbations drawn from some normal distribution with a known variance. Each +# bearing measurement consists of the angle of the point being tracked relative +# to the positive x-axis. # -# We write the generative model as a generative function below. The function first samples the initial state of the point from a prior distribution, and then generates `T` successive states in a `for` loop. The argument to the model is the number of states not including the initial state. +# We write the generative model as a generative function below. The function +# first samples the initial state of the point from a prior distribution, and +# then generates `T` successive states in a `for` loop. The argument to the +# model is the number of states not including the initial state. 
# + bearing(x, y) = atan(y, x) @gen function model(T::Int) - + measurement_noise = 0.005 velocity_var = (1.0/1e6) @@ -61,32 +82,32 @@ bearing(x, y) = atan(y, x) # prior on initial x-coordinate x = @trace(normal(0.01, 0.01), :x0) - + # prior on initial y-coordinate y = @trace(normal(0.95, 0.01), :y0) - + # prior on x-component of initial velocity vx = @trace(normal(0.002, 0.01), :vx0) - + # prior on y-component of initial velocity vy = @trace(normal(-0.013, 0.01), :vy0) - + # initial bearing measurement @trace(normal(bearing(x, y), measurement_noise), :z0) # record position xs[1] = x ys[1] = y - + # generate successive states and measurements for t=1:T - + # update the state of the point vx = @trace(normal(vx, sqrt(velocity_var)), (:vx, t)) vy = @trace(normal(vy, sqrt(velocity_var)), (:vy, t)) x += vx y += vy - + # bearing measurement @trace(normal(bearing(x, y), measurement_noise), (:z, t)) @@ -94,13 +115,14 @@ bearing(x, y) = atan(y, x) xs[t+1] = x ys[t+1] = y end - + # return the sequence of positions return (xs, ys) end; # - -# We generate a data set of positions, and observed bearings, by sampling from this model, with `T=50`: +# We generate a data set of positions, and observed bearings, by sampling from +# this model, with `T=50`: # + import Random @@ -118,9 +140,19 @@ zs[1] = choices[:z0] for t=1:T zs[t+1] = choices[(:z, t)] end +# Plot observed data. +for z in zs + dx = cos(z) * 0.5 + dy = sin(z) * 0.5 + plot([0., dx], [0., dy], color="red", alpha=0.3) +end +xlabel("X"); +ylabel("Y"); +title("Observed bearings (red)"); # - -# We next write a visualization for traces of this model below. It shows the positions and dots and the observed bearings as lines from the origin: +# We next write a visualization for traces of this model below. 
It shows the +# positions as dots and the observed bearings as lines from the origin: function render(trace; show_data=true, max_T=get_args(trace)[1]) (T,) = Gen.get_args(trace) @@ -144,98 +176,174 @@ end; # We visualize the synthetic trace below: render(trace) +xlabel("X"); +ylabel("Y"); +title("Observed bearings (red) and positions (blue)"); # ## 2. Implementing a basic particle filter # -# In Gen, a **particle is represented as a trace** and the particle filter state contains a weighted collection of traces. Below we define an inference program that runs a particle filter on an observed data set of bearings (`zs`). We use `num_particles` particles internally, and then we return a sample of `num_samples` traces from the weighted collection that the particle filter produces. +# In Gen, a **particle is represented as a trace** and the particle filter +# state contains a weighted collection of traces. Below we define an inference +# program that runs a particle filter on an observed data set of bearings +# (`zs`). We use `num_particles` particles internally, and then we return a +# sample of `num_samples` traces from the weighted collection that the particle +# filter produces. # -# Gen provides methods for initializing and updating the state of a particle filter, documented in [Particle Filtering](https://probcomp.github.io/Gen/dev/ref/inference/#Particle-Filtering-1). +# Gen provides methods for initializing and updating the state of a particle +# filter, documented in [Particle +# Filtering](https://probcomp.github.io/Gen/dev/ref/inference/#Particle-Filtering-1). # # - `Gen.initialize_particle_filter` # # - `Gen.particle_filter_step!` # -# Both of these methods can used either with the default proposal or a custom proposal. In this tutorial, we will use the default proposal. There is also a method that resamples particles based on their weights, which serves to redistribute the particles to more promising parts of the latent space. 
+# Both of these methods can used either with the default proposal or a custom +# proposal. In this +# tutorial, +# we will use the default proposal. There is also a method that resamples +# particles based on their weights, which serves to redistribute the particles +# to more promising parts of the latent space. # # - `Gen.maybe_resample!` # -# Gen also provides a method for sampling a collection of unweighted traces from the current weighted collection in the particle filter state: +# Gen also provides a method for sampling a collection of unweighted traces +# from the current weighted collection in the particle filter state: # # - `Gen.sample_unweighted_traces` function particle_filter(num_particles::Int, zs::Vector{Float64}, num_samples::Int) - + # construct initial observations init_obs = Gen.choicemap((:z0, zs[1])) state = Gen.initialize_particle_filter(model, (0,), init_obs, num_particles) - + # steps for t=1:length(zs)-1 Gen.maybe_resample!(state, ess_threshold=num_particles/2) obs = Gen.choicemap(((:z, t), zs[t+1])) Gen.particle_filter_step!(state, (t,), (UnknownChange(),), obs) end - + # return a sample of unweighted traces from the weighted collection return Gen.sample_unweighted_traces(state, num_samples) end; -# The initial state is obtained by providing the following to `initialize_particle_filter`: +# The initial state is obtained by providing the following to +# `initialize_particle_filter`: # # - The generative function for the generative model (`model`) # # - The initial arguments to the generative function. # -# - The initial observations, expressed as a map from choice address to values (`init_obs`). +# - The initial observations, expressed as a map from choice address to values +# (`init_obs`). # # - The number of particles. # -# At each step, we resample from the collection of traces (`maybe_resample!`) and then we introduce one additional bearing measurement by calling `particle_filter_step!` on the state. 
We pass the following arguments to `particle_filter_step!`: +# At each step, we resample from the collection of traces (`maybe_resample!`) +# and then we introduce one additional bearing measurement by calling +# `particle_filter_step!` on the state. We pass the following arguments to +# `particle_filter_step!`: # # - The state (it will be mutated) # -# - The new arguments to the generative function for this step. In our case, this is the number of measurements beyond the first measurement. +# - The new arguments to the generative function for this step. In our case, +# this is the number of measurements beyond the first measurement. # -# - The [argdiff](https://probcomp.github.io/Gen/dev/ref/gfi/#Argdiffs-1) value, which provides detailed information about the change to the arguments between the previous step and this step. We will revisit this value later. For now, we indicat ethat we do not know how the `T::Int` argument will change with each step. +# - The [argdiff](https://probcomp.github.io/Gen/dev/ref/gfi/#Argdiffs-1) +# value, which provides detailed information about the change to the +# arguments between the previous step and this step. We will revisit this +# value later. For now, we indicate that we do not know how the `T::Int` +# argument will change with each step. # -# - The new observations associated with the new step. In our case, this just contains the latest measurement. +# - The new observations associated with the new step. In our case, this just +# contains the latest measurement. # -# We run this particle filter with 5000 particles, and return a sample of 100 particles. This will take 30-60 seconds. We will see one way of speeding up the particle filter in a later section. +# We run this particle filter with 5000 particles, and return a sample of 100 +# particles. This will take 30-60 seconds. We will see one way of speeding up +# the particle filter in a later section. 
@time pf_traces = particle_filter(5000, zs, 200); -# To render these traces, we first define a function that overlays many renderings: +# To render these traces, we first define a function that overlays many +# renderings: function overlay(renderer, traces; same_data=true, args...) renderer(traces[1], show_data=true, args...) for i=2:length(traces) renderer(traces[i], show_data=!same_data, args...) end + xlabel("X") + ylabel("Y") + title("Observed bearings (red) and\npositions of individual traces (one color per trace)") end; # We then render the traces from the particle filter: -overlay(render, pf_traces, same_data=true) +overlay(render, pf_traces, same_data=true); -# Notice that as during the period of denser bearing measurements, the trajectories tend to turn so that the heading is more parallel to the bearing vector. An alternative explanation is that the point maintained a constant heading, but just slowed down significantly. It is interesting to see that the inferences favor the "turning explanation" over the "slowing down explanation". +# We see a broad posterior. Many paths (i.e. x- and y-positions) explain the observed +# bearings. +# +# +# Notice that as during the period of denser bearing measurements, the +# trajectories tend to turn so that the heading is more parallel to the bearing +# vector. An alternative explanation is that the point maintained a constant +# heading, but just slowed down significantly. It is interesting to see that +# the inferences favor the "turning explanation" over the "slowing down +# explanation". # ---- # # ### Exercise # Run the particle filter with fewer particles and visualize the results. -# ### Solution -# -# ------ # ## 3. Adding rejuvenation moves -# It is sometimes useful to add MCMC moves to particles in a particle filter between steps. These MCMC moves are often called "rejuvenation moves" [4]. Each rejuvenation moves targets the *current posterior distribution* at the given step. 
For example, when applying the rejuvenation move after incorporating 3 observations, our rejuvenation moves have as their stationary distribution the conditional distribution on the latent variables, given the first three observations. -# -# Next, we write a version of the particle filter that applies two random walk Metropolis-Hastings rejuvenation move to each particle. +# It is sometimes useful to add MCMC moves to particles in a particle filter +# between steps. These MCMC moves are often called "rejuvenation moves" [4]. +# Each rejuvenation moves targets the *current posterior distribution* at the +# given step. For example, when applying the rejuvenation move after +# incorporating 3 observations, our rejuvenation moves have as their stationary +# distribution the conditional distribution on the latent variables, given the +# first three observations. +# +# Next, we write two new versions of the particle filter, each of which uses +# Metropolis-Hastings rejuvenation moves to each particle. The first version +# uses so-called "resimulation MH," meaning that the proposal distribution for +# MH is equal to the prior of the generative model. The proposed next state +# under this rejuvenation move is independent of the current state. By +# contrast, the second version we write will use Gaussian drift proposals, and +# therefore we refer to it as "random walk MH." + +# First, the resimulation MH rejuvenation move: + +function particle_filter_rejuv_resim(num_particles::Int, zs::Vector{Float64}, num_samples::Int) + init_obs = Gen.choicemap((:z0, zs[1])) + state = Gen.initialize_particle_filter(model, (0,), init_obs, num_particles) + for t=1:length(zs)-1 -# The cell below defines a Metropolis-Hastings perturbation move that perturbs the velocity vectors for a block of time steps between `a` and `b` inclusive. 
+ # apply a rejuvenation move to each particle + for i=1:num_particles + # < Replace this default MH move with your perturbation move > + speed = select(:x0, :y0, :vx0, :vy0) + state.traces[i], _ = mh(state.traces[i], speed) + end + + Gen.maybe_resample!(state, ess_threshold=num_particles/2) + obs = Gen.choicemap(((:z, t), zs[t+1])) + Gen.particle_filter_step!(state, (t,), (UnknownChange(),), obs) + end + + # return a sample of unweighted traces from the weighted collection + return Gen.sample_unweighted_traces(state, num_samples) +end; + + +# Next, we define a Metropolis-Hastings perturbation move that perturbs the +# velocity vectors for a block of time steps between `a` and `b` inclusive. # + @gen function perturbation_proposal(prev_trace, a::Int, b::Int) @@ -252,53 +360,79 @@ function perturbation_move(trace, a::Int, b::Int) end; # - -# We add this into our particle filtering inference program below. We apply the rejuvenation move to adjust the velocities for the previous 5 time steps. +# We add this into our particle filtering inference program below. We apply the +# rejuvenation move to adjust the velocities for the previous 5 time steps. 
-function particle_filter_rejuv(num_particles::Int, zs::Vector{Float64}, num_samples::Int) - init_obs = Gen.choicemap((:z0, zs[1])) +function particle_filter_rejuv_perturb(num_particles::Int, zs::Vector{Float64}, num_samples::Int) + init_obs = Gen.choicemap((:z0, zs[1])) state = Gen.initialize_particle_filter(model, (0,), init_obs, num_particles) for t=1:length(zs)-1 - + # apply a rejuvenation move to each particle for i=1:num_particles state.traces[i], _ = perturbation_move(state.traces[i], max(1, t-5), t-1) end - + Gen.maybe_resample!(state, ess_threshold=num_particles/2) obs = Gen.choicemap(((:z, t), zs[t+1])) Gen.particle_filter_step!(state, (t,), (UnknownChange(),), obs) end - + # return a sample of unweighted traces from the weighted collection return Gen.sample_unweighted_traces(state, num_samples) end; -# We run the particle filter with rejuvenation below. This will take a minute or two. We will see one way of speeding up the particle filter in a later section. +# We run the particle filter with rejuvenation below. This will take a minute +# or two. We will see one way of speeding up the particle filter in a later +# section. -@time pf_rejuv_traces = particle_filter_rejuv(5000, zs, 200); +@time pf_rejuv_traces_resim = particle_filter_rejuv_resim(5000, zs, 200); + +@time pf_rejuv_traces_perturb = particle_filter_rejuv_perturb(5000, zs, 200); # We render the traces: -overlay(render, pf_rejuv_traces, same_data=true) +overlay(render, pf_rejuv_traces_resim, same_data=true) +title("Rejuvenation with resimulation MH on the starting points") + +overlay(render, pf_rejuv_traces_perturb, same_data=true) +title("Rejuvenation with perturbation proposal") # ## 4. 
Using the unfold combinator to improve performance # -# For the particle filtering algorithms above, within an update step it is only necessary to revisit the most recent state (or the most recent 5 states if the rejuvenation moves are used) because the initial states are never updated, and the contribution of these states to the weight computation cancel. -# -# However, each update step of the particle filter inference programs above scales *linearly* in the size of the trace because it visits every state when computing the weight update. This is because the built-in modeling DSL by default always performs an end-to-end execution of the generative function body whenever performing a trace update. This allows the built-in modeling DSL to be very flexible and to have a simple implementation, at the cost of performance. There are several ways of improving performance after one has a prototype written in the built-in modeling DSL. One of these is [Generative Function Combinators](https://probcomp.github.io/Gen/dev/ref/combinators/), which make the flow of information through the generative process more explicit to Gen, and enable asymptotically more efficient inference programs. -# -# To exploit the opportunity for incremental computation, and improve the scaling behavior of our particle filter inference programs, we will write a new model that replaces the following Julia `for` loop in our model, using a generative function combinator. +# For the particle filtering algorithms above, within an update step it is only +# necessary to revisit the most recent state (or the most recent 5 states if +# the rejuvenation moves are used) because the initial states are never +# updated, and the contribution of these states to the weight computation +# cancel. +# +# However, each update step of the particle filter inference programs above +# scales *linearly* in the size of the trace because it visits every state when +# computing the weight update. 
This is because the built-in modeling DSL by +# default always performs an end-to-end execution of the generative function +# body whenever performing a trace update. This allows the built-in modeling +# DSL to be very flexible and to have a simple implementation, at the cost of +# performance. There are several ways of improving performance after one has a +# prototype written in the built-in modeling DSL. One of these is [Generative +# Function Combinators](https://probcomp.github.io/Gen/dev/ref/combinators/), +# which make the flow of information through the generative process more +# explicit to Gen, and enable asymptotically more efficient inference programs. +# +# To exploit the opportunity for incremental computation, and improve the +# scaling behavior of our particle filter inference programs, we will write a +# new model that replaces the following Julia `for` loop in our model, using a +# generative function combinator. # # ```julia # # generate successive states and measurements # for t=1:T -# +# # # update the state of the point # vx = @trace(normal(vx, sqrt(velocity_var)), (:vx, t)) # vy = @trace(normal(vy, sqrt(velocity_var)), (:vy, t)) # x += vx # y += vy -# +# # # bearing measurement # @trace(normal(bearing(x, y), measurement_noise), (:z, t)) # @@ -308,9 +442,19 @@ overlay(render, pf_rejuv_traces, same_data=true) # end # ``` # -# This `for` loop has a very specific pattern of information flow---there is a sequence of states (represented by `x, y, vx, and vy), and each state is generated from the previous state. This is exactly the pattern that the [Unfold](https://probcomp.github.io/Gen/dev/ref/combinators/#Unfold-combinator-1) generative function combinator is designed to handle. +# This `for` loop has a very specific pattern of information flow---there is a +# sequence of states (represented by `x, y, vx, and vy), and each state is +# generated from the previous state. 
This is exactly the pattern that the +# [Unfold](https://probcomp.github.io/Gen/dev/ref/combinators/#Unfold-combinator-1) +# generative function combinator is designed to handle. # -# Below, we re-express the Julia for loop over the state sequence using the Unfold combinator. Specifically, we define a generative function (kernel) that takes the prevous state as its second argument, and returns the new state. The Unfold combinator takes the kernel and returns a new generative function (chain) that applies kernel repeatedly. Read the Unfold combinator documentation for details on the behavior of the resulting generative function (chain). +# Below, we re-express the Julia for loop over the state sequence using the +# Unfold combinator. Specifically, we define a generative function (kernel) +# that takes the previous state as its second argument, and returns the new +# state. The Unfold combinator takes the kernel and returns a new generative +# function (chain) that applies kernel repeatedly. Read the Unfold combinator +# documentation for details on the behavior of the resulting generative +# function (chain). 
# + struct State @@ -336,16 +480,18 @@ chain = Gen.Unfold(kernel) Gen.load_generated_functions() # - -# We can understand the behavior of `chain` by getting a trace of it and printing the random choices: +# We can understand the behavior of `chain` by getting a trace of it and +# printing the random choices: trace = Gen.simulate(chain, (4, State(0., 0., 0., 0.), 0.01, 0.01)) println(Gen.get_choices(trace)) -# We now write a new version of the generative model that invokes `chain` instead of using the Julia `for` loop: +# We now write a new version of the generative model that invokes `chain` +# instead of using the Julia `for` loop: # + @gen (static) function unfold_model(T::Int) - + # parameters measurement_noise = 0.005 velocity_var = 1e-6 @@ -361,10 +507,10 @@ println(Gen.get_choices(trace)) # record initial state init_state = State(x, y, vx, vy) - + # run `chain` function under address namespace `:chain`, producing a vector of states states = @trace(chain(T, init_state, velocity_var, measurement_noise), :chain) - + result = (init_state, states) return result end; @@ -378,15 +524,15 @@ Gen.load_generated_functions() println(Gen.get_choices(trace)) function unfold_particle_filter(num_particles::Int, zs::Vector{Float64}, num_samples::Int) - init_obs = Gen.choicemap((:z0, zs[1])) - state = Gen.initialize_particle_filter(unfold_model, (0,), init_obs, num_particles) + init_obs = Gen.choicemap((:z0, zs[1])) + state = Gen.initialize_particle_filter(unfold_model, (0,), init_obs, num_particles) for t=1:length(zs)-1 maybe_resample!(state, ess_threshold=num_particles/2) obs = Gen.choicemap((:chain => t => :z, zs[t+1])) Gen.particle_filter_step!(state, (t,), (UnknownChange(),), obs) end - + # return a sample of traces from the weighted collection: return Gen.sample_unweighted_traces(state, num_samples) end; @@ -422,10 +568,14 @@ end; overlay(unfold_render, unfold_pf_traces, same_data=true) -# We now empirically investigate the scaling behavior of (1) the inference program 
that uses the Julia `for` loop, and (2) the equivalent inference program that uses Unfold. We will use a fake long vector of z data, and we will investigate how the running time depends on the number of observations. +# We now empirically investigate the scaling behavior of (1) the inference +# program that uses the Julia `for` loop, and (2) the equivalent inference +# program that uses Unfold. We will use a synthetic long vector of z data, and +# we will investigate how the running time depends on the number of +# observations. # + -fake_zs = rand(1000); +synthetic_zs = rand(1000); function timing_experiment(num_observations_list::Vector{Int}, num_particles::Int, num_samples::Int) times = Vector{Float64}() @@ -433,13 +583,13 @@ function timing_experiment(num_observations_list::Vector{Int}, num_particles::In for num_observations in num_observations_list println("evaluating inference programs for num_observations: $num_observations") tstart = time_ns() - traces = particle_filter(num_particles, fake_zs[1:num_observations], num_samples) + traces = particle_filter(num_particles, synthetic_zs[1:num_observations], num_samples) push!(times, (time_ns() - tstart) / 1e9) - + tstart = time_ns() - traces = unfold_particle_filter(num_particles, fake_zs[1:num_observations], num_samples) + traces = unfold_particle_filter(num_particles, synthetic_zs[1:num_observations], num_samples) push!(times_unfold, (time_ns() - tstart) / 1e9) - + end (times, times_unfold) end; @@ -448,7 +598,9 @@ num_observations_list = [1, 3, 10, 30, 50, 100, 150, 200, 500] (times, times_unfold) = timing_experiment(num_observations_list, 100, 20); # - -# Notice that the running time of the inference program without unfold appears to be quadratic in the number of observations, whereas the inference program that uses unfold appears to scale linearly: +# Notice that the running time of the inference program without unfold appears +# to be quadratic in the number of observations, whereas the inference program +# 
that uses unfold appears to scale linearly: plot(num_observations_list, times, color="blue") plot(num_observations_list, times_unfold, color="red")