diff --git a/code/kl_divergences.py b/code/kl_divergences.py index c59b295..f79c9fc 100644 --- a/code/kl_divergences.py +++ b/code/kl_divergences.py @@ -10,4 +10,4 @@ def diagonal_gaussian_kl(mean, std): :return: The KL divergence. """ var = std ** 2 - return 0.5 * (mx.sym.sum(1 + mx.sym.log(var) - mean ** 2 - var)) + return -0.5 * (mx.sym.sum(1 + mx.sym.log(var) - mean ** 2 - var, axis=1)) diff --git a/code/run_vae.py b/code/run_vae.py index 6c0cd2f..5666681 100644 --- a/code/run_vae.py +++ b/code/run_vae.py @@ -12,7 +12,7 @@ DEFAULT_LEARNING_RATE = 0.0003 data_names = ['train', 'valid', 'test'] -train_set = ['train', 'valid'] +train_set = ['train'] test_set = ['test'] data_dir = join(os.curdir, "binary_mnist") @@ -86,8 +86,6 @@ def train_model(generator_layers: List[int], mnist = load_data(train=True, logger=logger) train_iter = mx.io.NDArrayIter(data=mnist['train'], data_name="data", label=mnist["train"], label_name="label", batch_size=batch_size, shuffle=True) - val_iter = mx.io.NDArrayIter(data=mnist['valid'], data_name="data", label=mnist['valid'], label_name="label", - batch_size=batch_size) vae = construct_vae(latent_type="gaussian", likelihood="bernoulliProd", generator_layer_sizes=generator_layers, infer_layer_size=inference_layers, latent_variable_size=latent_size, @@ -95,16 +93,85 @@ def train_model(generator_layers: List[int], module = mx.module.Module(vae.train(mx.sym.Variable("data"), mx.sym.Variable('label')), data_names=[train_iter.provide_data[0][0]], - label_names=["label"], context=ctx, + label_names=[train_iter.provide_label[0][0]], context=ctx, logger=logger) logger.info("Starting to train") + + # module.bind(data_shapes=train_iter.provide_data, label_shapes=train_iter.provide_label, + # for_training=True, force_rebind=True) + # # if monitor is not None: + # # self.install_monitor(monitor) + # module.init_params() + # module.init_optimizer(optimizer=optimiser, optimizer_params={"learning_rate" : learning_rate}) + # + # # if not isinstance(eval_metric, metric.EvalMetric): + # # eval_metric = metric.create(eval_metric) + # + # ################################################################################ + # # training loop + # ################################################################################ + # import time + # for epoch in range(0, epochs): + # tic = time.time() + # #eval_metric.reset() + # nbatch = 0 + # data_iter = iter(train_iter) + # end_of_batch = False + # next_data_batch = next(data_iter) + # while not end_of_batch: + # data_batch = next_data_batch + # # if monitor is not None: + # # monitor.tic() + # module.forward_backward(data_batch) + # module.update() + # try: + # # pre fetch next batch + # next_data_batch = next(data_iter) + # module.prepare(next_data_batch) + # except StopIteration: + # end_of_batch = True + # + # # self.update_metric(eval_metric, data_batch.label) + # + # # if monitor is not None: + # # monitor.toc_print() + # + # # if batch_end_callback is not None: + # # batch_end_params = BatchEndParam(epoch=epoch, nbatch=nbatch, + # # eval_metric=eval_metric, + # # locals=locals()) + # # for callback in _as_list(batch_end_callback): + # # callback(batch_end_params) + # nbatch += 1 + # print(nbatch) + # + # # one epoch of training is finished + # # for name, val in eval_metric.get_name_value(): + # # module.logger.info('Epoch[%d] Train-%s=%f', epoch, name, val) + # toc = time.time() + # module.logger.info('Epoch[%d] Time cost=%.3f', epoch, (toc - tic)) + # + # # sync aux params across devices + # arg_params, aux_params = 
module.get_params() + # module.set_params(arg_params, aux_params) + # + # # if epoch_end_callback is not None: + # # for callback in _as_list(epoch_end_callback): + # # callback(epoch, self.symbol, arg_params, aux_params) + # + # # end of 1 epoch, reset the data-iter for another epoch + # train_iter.reset() + + # print(module.get_outputs()) + # module.fit(train_data=train_iter, optimizer=optimiser, force_init=True, force_rebind=True, num_epoch=epochs, optimizer_params={'learning_rate': learning_rate}, # validation_metric=mx.metric.Perplexity(None), # eval_data=val_iter, - batch_end_callback=mx.callback.Speedometer(frequent=1, batch_size=batch_size), - epoch_end_callback=mx.callback.do_checkpoint('vae')) + batch_end_callback=mx.callback.Speedometer(frequent=20, batch_size=batch_size), + epoch_end_callback=mx.callback.do_checkpoint('vae'), + eval_metric="Loss") def load_model(model_file: str): diff --git a/code/vae.py b/code/vae.py index 61c18ce..5d6b056 100644 --- a/code/vae.py +++ b/code/vae.py @@ -57,12 +57,12 @@ def _preactivation(self, latent_state: mx.sym.Symbol) -> mx.sym.Symbol: :return: The pre-activation before output activation """ prev_out = None - for i, hidden in enumerate(self.layer_sizes): - fc_i = mx.sym.FullyConnected(data=latent_state, num_hidden=hidden, name="gen_fc_{}".format(i)) + for i, size in enumerate(self.layer_sizes): + fc_i = mx.sym.FullyConnected(data=latent_state, num_hidden=size, name="gen_fc_{}".format(i)) prev_out = mx.sym.Activation(data=fc_i, act_type=self.act_type, name="gen_act_{}".format(i)) # The output layer that gives pre_activations for multiple Bernoulli softmax between 0 and 1 - fc_out = mx.sym.FullyConnected(data=prev_out, num_hidden=2 * self.data_dims, name="gen_fc_out") + fc_out = mx.sym.FullyConnected(data=prev_out, num_hidden=self.data_dims, name="gen_fc_out") return fc_out @@ -73,11 +73,9 @@ def generate_sample(self, latent_state: mx.sym.Symbol) -> mx.sym.Symbol: :param latent_state: The input latent state. :return: A vector of Bernoulli draws. 
""" - act = mx.sym.Activation(data=self._generate(latent_state=latent_state), act_type=self.output_act, + act = mx.sym.Activation(data=self._preactivation(latent_state=latent_state), act_type=self.output_act, name="gen_act_out") - act = mx.ndarray(mx.sym.split(data=act, num_outputs=self.data_dims)) - out = mx.sym.maximum(data=act, axis=0) - + out = act > 0.5 return out def train(self, latent_state=mx.sym.Symbol, label=mx.sym.Symbol) -> mx.sym.Symbol: @@ -88,9 +86,9 @@ def train(self, latent_state=mx.sym.Symbol, label=mx.sym.Symbol) -> mx.sym.Symbo :param label: A binary vector (same as input for inference module) :return: The loss symbol used for training """ - output = self._preactivation(latent_state=latent_state) - output = mx.sym.reshape(data=output, shape=(-1, 2, self.data_dims)) - return mx.sym.SoftmaxOutput(data=output, label=label, multi_output=True) + output = mx.sym.Activation(data=self._preactivation(latent_state=latent_state), act_type=self.output_act, + name="output_act") + return mx.sym.sum(label * mx.sym.log(output) + (1-label) * mx.sym.log(1-output), axis=[1]) class InferenceNetwork(ABC): @@ -235,8 +233,9 @@ def train(self, data: mx.sym.Symbol, label: mx.sym.Symbol) -> mx.sym.Symbol: """ mean, std = self.inference_net.inference(data=data) latent_state = self.inference_net.sample_latent_state(mean, std) - kl_loss = mx.sym.MakeLoss(self.kl_divergence(mean, std)) - return self.generator.train(latent_state=latent_state, label=label) + kl_term = self.kl_divergence(mean, std) + log_likelihood = self.generator.train(latent_state=latent_state, label=label) + return mx.sym.MakeLoss(kl_term - log_likelihood, name="Neg_Elbo") def generate_reconstructions(self, data: mx.sym.Symbol, n: int) -> mx.sym.Symbol: """ diff --git a/code/vae_notebook.ipynb b/code/vae_notebook.ipynb index cfe67fb..ef188f6 100644 --- a/code/vae_notebook.ipynb +++ b/code/vae_notebook.ipynb @@ -23,7 +23,7 @@ "source": [ "# The Framework\n", "\n", - "For the purpose of this tutorial we are going to use [mxnet](https://mxnet.incubator.apache.org) which is a scalable deep learning library that has interfaces for several languages, including python. We are going to import and abbreviate it as \"mx\". We will use mxnet to define computation graph. This is done using the [symbol library](https://mxnet.incubator.apache.org/api/python/symbol.html). When building the VAE, all the methods that you use should be prefixed with `mx.sym`." + "For the purpose of this tutorial we are going to use [mxnet](https://mxnet.incubator.apache.org) which is a scalable deep learning library that has interfaces for several languages, including python. We are going to import and abbreviate it as \"mx\". We will use mxnet to define a computation graph. This is done using the [symbol library](https://mxnet.incubator.apache.org/api/python/symbol.html). When building the VAE, all the methods that you use should be prefixed with `mx.sym`." ] }, { @@ -37,14 +37,15 @@ "import os, logging, sys\n", "from os.path import join, exists\n", "from abc import ABC\n", - "from typing import List, Tuple, Callable" + "from typing import List, Tuple, Callable\n", + "from numpy import genfromtxt" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Next, we specify a couple of variables that will help us to load the data." + "Next, we specify a couple of constants that will help us to load the data." 
] }, { @@ -55,11 +56,15 @@ "source": [ "DEFAULT_LEARNING_RATE = 0.0003\n", "\n", - "data_names = ['train', 'valid', 'test']\n", - "train_set = ['train', 'valid']\n", - "test_set = ['test']\n", + "TRAIN_SET = 'train'\n", + "VALID_SET = 'valid'\n", + "TEST_SET = 'test'\n", + "data_names = [TRAIN_SET, VALID_SET, TEST_SET]\n", + "test_set = [TEST_SET]\n", "data_dir = join(os.curdir, \"binary_mnist\")\n", - "data_names = ['train', 'valid', 'test']" + "\n", + "# change this to mx.gpu(0) if you want to run your code on gpu\n", + "ctx = mx.cpu()" ] }, { @@ -75,7 +80,7 @@ "metadata": {}, "outputs": [], "source": [ - "logging.basicConfig(level=logging.DEBUG, format=\"%(asctime)s [%(levelname)s]: %(message)s\")" + "logging.basicConfig(level=logging.DEBUG, format=\"%(asctime)s [%(levelname)s]: %(message)s\", datefmt=\"%H:%M:%S\")" ] }, { @@ -121,7 +126,7 @@ "source": [ "# Diagonal Gaussian VAE\n", "\n", - "The most basic VAE model is one where we assume that the latent variable is multiviariate Gaussian. We fix the prior to be standard normal. During inference, we use a multivariate Gaussian variational distribution with diagonal covariance matrix. This means that we are only modelling variance but not covariance (in fact, a k-dimensional Guassian with diagonal covariance has the same density as a product of k independent univariate Gaussians). Geometrically, this variational distribution can only account for spherical but not for eliptical densities. It is thus rather limited in its modelling capabilities. Still, because it uses a neural network under the hood, it is very expressive. \n", + "The most basic VAE model is one where we assume that the latent variable is multiviariate Gaussian. We fix the prior to be standard normal. During inference, we use a multivariate Gaussian variational distribution with diagonal covariance matrix. This means that we are only modelling variance but not covariance (in fact, a k-dimensional Guassian with diagonal covariance has the same density as a product of k independent univariate Gaussians). Geometrically, this variational distribution can only account for spherical but not for elliptical densities. It is thus rather limited in its modelling capabilities. Still, because it uses a neural network under the hood, it is very expressive. \n", "\n", "In this tutorial, we will model the mist binarised digit data set. Each image is encoded as a 784-dimensional vector. We will model each of these vectors as a product of 784 Bernoullis (of course, there are better models but we want to keep it simple). Our likelihood is thus a product of independent Bernoullis. The resulting model is formally specified as \n", "\n", @@ -140,7 +145,7 @@ "\n", "We will spread our implementation across 3 classes. This design choice is motivated by the desire to make our models as modular as possible. This will later allow us to mix and match different likelihoods and variational distributions.\n", "\n", - "* **Generator**: This class defines our likelihood. Given a latent value, it will can produce a data sample our assign a density to an existing data point.\n", + "* **Generator**: This class defines our likelihood. Given a latent value, it produces a data sample our assign a density to an existing data point.\n", "* **InferenceNetwork**: This neural network computes the parameters of the variational approximation from a data point.\n", "* **VAE**: This is the variational autoencoder. It combines a Generator and an InferenceNetwork and trains them jointly. 
Once trained, it can generate random data points or try to reproduce data presented to it.\n", "\n", @@ -257,8 +262,7 @@ "metadata": {}, "source": [ "## Exercise 1\n", - "Let us start by implementing the generator. This is pretty much a standard neural network. The main point of this exercise is to get you comfortable with mxnet. Complete all the TODOs below. Before starting, check the activation\n", - "functions available in mxnet [here](https://mxnet.incubator.apache.org/api/python/symbol.html#mxnet.symbol.Activation)." + "Let us start by implementing the generator. This is pretty much a standard neural network. The main point of this exercise is to get you comfortable with mxnet. Complete all the TODOs below. Before starting, check mxnet's [fully connected layer](https://mxnet.incubator.apache.org/api/python/symbol/symbol.html#mxnet.symbol.FullyConnected) and [activation functions](https://mxnet.incubator.apache.org/api/python/symbol.html#mxnet.symbol.Activation)." ] }, { @@ -288,8 +292,8 @@ " :param latent_state: The input latent state\n", " :return: The pre-activation before output activation\n", " \"\"\"\n", - " prev_out = None\n", - " for i, hidden in enumerate(self.layer_sizes):\n", + " prev_out = latent_state\n", + " for i, size in enumerate(self.layer_sizes):\n", " # TODO for each layer size in layer_sizes, introduce a layer that performs an affine transformation \n", " # (implemented by mx.sym.FullyConnected) followed by a non-linearity that is specified by self.act_type.\n", " # Call the resulting symbol act_i and uncomment the line below\n", @@ -303,7 +307,7 @@ " # distribution as a 2-dim probability vector. (At this point, the vector does not contain\n", " # probabilities yet, but the corresponding log-odds.)\n", " # Uncomment the lines below whe you have implemented the layers from layer_sizes\n", - " #fc_out = mx.sym.FullyConnected(data=prev_out, num_hidden=2 * self.data_dims, name=\"gen_fc_out\")\n", + " #fc_out = mx.sym.FullyConnected(data=prev_out, num_hidden=self.data_dims, name=\"gen_fc_out\")\n", " #\n", " #return fc_out\n", " \n", @@ -318,11 +322,9 @@ " :param latent_state: The input latent state.\n", " :return: A vector of Bernoulli draws.\n", " \"\"\"\n", - " act = mx.sym.Activation(data=self._generate(latent_state=latent_state), act_type=self.output_act,\n", + " act = mx.sym.Activation(data=self._preactivation(latent_state=latent_state), act_type=self.output_act,\n", " name=\"gen_act_out\")\n", - " act = mx.ndarray(mx.sym.split(data=act, num_outputs=self.data_dims))\n", - " out = mx.sym.maximum(data=act, axis=0)\n", - "\n", + " out = act > 0.5\n", " return out\n", "\n", " def train(self, latent_state=mx.sym.Symbol, label=mx.sym.Symbol) -> mx.sym.Symbol:\n", @@ -333,12 +335,9 @@ " :param label: A binary vector (same as input for inference module)\n", " :return: The loss symbol used for training\n", " \"\"\"\n", - " output = self._preactivation(latent_state=latent_state)\n", - " output = mx.sym.reshape(data=output, shape=(-1, 2, self.data_dims))\n", - " # We use a multi-ouput softmax. 
This is computes gradients for 784 independent softmaxes.\n", - " # Since a softmax of a 2-dim vector is a logistic function, we get the likelihood (and gradients)\n", - " # for 784 independent Bernoulli distributions as desired.\n", - " return mx.sym.SoftmaxOutput(data=output, label=label, multi_output=True)" + " output = mx.sym.Activation(data=self._preactivation(latent_state=latent_state), act_type=self.output_act,\n", + " name=\"output_act\")\n", + " return mx.sym.sum(label * mx.sym.log(output) + (1-label) * mx.sym.log(1-output), axis=[1])" ] }, { @@ -350,10 +349,10 @@ "We now move on to the inference network. Recall that this network will return the parameters of a diagonal Gaussian. Thus, we need to return to vectors of the same size: a mean and a standard deviation vector. (Formally, the parameters of the Gaussian are the variances. However, from the derivation of the Gaussian reparametrisation we know that we\n", "need the standard deviations to generate a Gaussian random variable $z$ as transformation of a standard Gaussian variable $\\epsilon$.)\n", "\n", - "**Hint:** In this exercise you will need to draw a random Gaussian sample (see [here](https://mxnet.incubator.apache.org/api/python/symbol.html#mxnet.symbol.random_normal)). The operator requires are\n", + "**Hint:** In this exercise you will need to draw a random Gaussian sample (see [here](https://mxnet.incubator.apache.org/api/python/symbol.html#mxnet.symbol.random_normal)). The operator requires a\n", "shape whose first entry is the batch size. The batch size is not known to you during implementation, however.\n", "You can leave it underspecified by choosing $0$ as a value. When you combine the sampling operator with another\n", - "operator immediately, mxnet will infer the correct the batch size for you." + "operator mxnet will infer the correct the batch size for you." ] }, { @@ -421,7 +420,7 @@ "source": [ "# Exercise 3.a\n", "\n", - "Finally, we will put it all together and build our VAE. Recall that the objective for the inference net contains a KL term. You will need to implement that KL-term. Once it is implemented, we can take advantage of autograd to get its gradients." + "Finally, we will put it all together and build our VAE. Recall that the objective for the inference net contains a KL term. You will need to implement that KL-term. Once it is implemented, we can take advantage of autograd to get its gradients. Use the [log](https://mxnet.incubator.apache.org/api/python/symbol/symbol.html#mxnet.symbol.log) and [sum](https://mxnet.incubator.apache.org/api/python/symbol/symbol.html#mxnet.symbol.sum) symbols here." ] }, { @@ -441,7 +440,9 @@ "source": [ "# Exercise 3.b\n", "\n", - "The only thing that is left to do is to implement VAE training. This is where you will have to define an additional loss with respect to the KL divergence for the inference net. In mxnet losses are defined using the [MakeLoss](https://mxnet.incubator.apache.org/api/python/symbol.html#mxnet.symbol.MakeLoss) symbol." + "The only thing that is left to do is to implement VAE training. To train our VAE we will maximise the ELBO:\n", + "$$ \\mathbb{E}\\left[ \\log p(x|z) \\right] - \\text{KL}(q(z)||p(z)) \\ . $$\n", + "Recall that we assume that $ p(z) = \\mathcal{N}(z;0,I) $. Mxnet's optimisers minimise losses instead of maximising objectives. We thus turn the ELBO into a loss by taking its negative. Losses in mxnet are defined using the [MakeLoss](https://mxnet.incubator.apache.org/api/python/symbol.html#mxnet.symbol.MakeLoss) symbol." 
] }, { @@ -506,9 +507,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Running the VAE\n", + "# Constructing a VAE\n", "\n", - "We have now fully specified a VAE. To get an impression of what its computation graph looks like, we can use mxnet's built-in visualisation. " + "We have now all the code for VAEs in place. Below, we have defined a factory method that makes it easier for you to play with different architectures." ] }, { @@ -553,6 +554,15 @@ " raise Exception(\"{} is an invalid latent variable type.\".format(latent_type))\n" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Exercise 4\n", + "\n", + "Your turn! Construct your own VAE below." + ] + }, { "cell_type": "code", "execution_count": null, @@ -560,10 +570,17 @@ "outputs": [], "source": [ "vae = construct_vae(latent_type=\"gaussian\", likelihood=\"bernoulliProd\", generator_layer_sizes=[200,500],\n", - " infer_layer_sizes=[500,200], latent_variable_size=100, data_dims=784, generator_act_type=\"tanh\",\n", + " infer_layer_sizes=[500,200], latent_variable_size=200, data_dims=784, generator_act_type=\"tanh\",\n", " infer_act_type=\"tanh\")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To check what your VAE looks like we will visualise it. The variables are \"data\" and \"label\" are unbound variables in the computation graph. We will shortly use them to supply data to the model." + ] + }, { "cell_type": "code", "execution_count": null, @@ -572,7 +589,91 @@ "source": [ "data = mx.sym.Variable(\"data\")\n", "label = mx.sym.Variable(\"label\")\n", - "mx.viz.plot_network(vae.train(data, label), title=\"bla.jpg\", save_format='jpg')" + "mx.viz.plot_network(vae.train(data, label))\n", + "# Comment the line above and uncomment the one below if you want to save the VAE picture on disk\n", + "#mx.viz.plot_network(vae.train(data, label), title=\"my_vae\", save_format=\"pdf\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Training a VAE\n", + "\n", + "We are now set to train the VAE. In order to do so we have to define a data iterator and a module in mxnet. The data iterator takes care of batching the data while the module executes the computation graph and updates the model parameters. For the purpose of training the model, we only need the training data which we load from disk." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mnist = {}\n", + "file_name = join(data_dir, \"binary_mnist.{}\".format(TRAIN_SET))\n", + "logging.info(\"Reading {} into memory\".format(file_name))\n", + "mnist[TRAIN_SET] = mx.nd.array(genfromtxt(file_name))\n", + "logging.info(\"{} contains {} data points\".format(file_name, mnist[TRAIN_SET].shape[0]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Using that data we define the training set iterator. At this point we also need to choose a batch_size that will then be used in training. Notice that we randomise the order in which the data points are presented in the training set. It is important that `data_name` and `label_name` match the names of the unbound variables in our model. Mxnet will use this information to pass the correct data points to the variables. Finally, notice that the data and labels are identical. This is because the VAE tries to reconstruct its input data and thus labels and data are the same." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "batch_size = 200\n", + "train_iter = mx.io.NDArrayIter(data=mnist[TRAIN_SET], data_name=\"data\", label=mnist[TRAIN_SET], label_name=\"label\",\n", + " batch_size=batch_size, shuffle=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we can define a module that will do the training for us." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vae_module = mx.module.Module(vae.train(data=mx.sym.Variable(\"data\"), label=mx.sym.Variable(\"label\")),\n", + " data_names=[train_iter.provide_data[0][0]],\n", + " label_names=[train_iter.provide_label[0][0]], context=ctx, logger=logging)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Training the vae (or any other network defined in mxnet) is most easily done using the fit method. Here we choose to train our model for 20 epochs. Our optimiser will be adam. Training will take some time (2-5 minutes depending on your machine). We are keeping track of the loss (the negative ELBO) to see how the model develops. Notice that this loss\n", + "is not the actual negative ELBO but a _doubly stochastic approximation_ to it (see [here](https://projecteuclid.org/download/pdf_1/euclid.aoms/1177729586) and [here](http://proceedings.mlr.press/v32/titsias14.pdf)). The two sources of stochasticity are the mini-batches (the ELBO is defined with respect to the entire data set) and the reparametrisation trick (we approximate the integral over $ z $ through sampling). Both sources of stochasticity leave the approximation unbiased, however. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "epochs = 20\n", + "optimiser = \"adam\"\n", + "\n", + "vae_module.fit(train_data=train_iter, optimizer=optimiser, force_init=True, force_rebind=True, num_epoch=epochs,\n", + " optimizer_params={'learning_rate': DEFAULT_LEARNING_RATE},\n", + " batch_end_callback=mx.callback.Speedometer(frequent=20, batch_size=batch_size),\n", + " epoch_end_callback=mx.callback.do_checkpoint('vae'),\n", + " eval_metric=\"Loss\")" ] }, { diff --git a/modules/M6_DeepGenerativeModels/M6_DeepGenerativeModels.pdf b/modules/M6_DeepGenerativeModels/M6_DeepGenerativeModels.pdf index b3a51da..310c88d 100644 Binary files a/modules/M6_DeepGenerativeModels/M6_DeepGenerativeModels.pdf and b/modules/M6_DeepGenerativeModels/M6_DeepGenerativeModels.pdf differ diff --git a/modules/M6_DeepGenerativeModels/M6_DeepGenerativeModels.tex b/modules/M6_DeepGenerativeModels/M6_DeepGenerativeModels.tex index 60b582e..7e42abf 100644 --- a/modules/M6_DeepGenerativeModels/M6_DeepGenerativeModels.tex +++ b/modules/M6_DeepGenerativeModels/M6_DeepGenerativeModels.tex @@ -518,8 +518,8 @@ \section{This is how we do: Variational Autoencoders} \pause Analytical computation of $ -\KL{q(z|x,\lambda)}{p(z)} $: \begin{equation*} --\frac{1}{2}\sum_{i=1}^{N}\left(1 + \loga{\sigma^{2}_{i}} - -\mu_i - \sigma^{2}_{i} \right) +\frac{1}{2}\sum_{i=1}^{N}\left(1 + \loga{\sigma^{2}_{i}} - +\mu^{2}_{i} - \sigma^{2}_{i} \right) \end{equation*} \end{frame} diff --git a/notes/.DS_Store b/notes/.DS_Store new file mode 100644 index 0000000..025cd46 Binary files /dev/null and b/notes/.DS_Store differ diff --git a/notes/ExpFamEntropyKL/ExpFamEntropyKL.pdf b/notes/ExpFamEntropyKL/ExpFamEntropyKL.pdf new file mode 100644 index 0000000..ea3f285 Binary files 
/dev/null and b/notes/ExpFamEntropyKL/ExpFamEntropyKL.pdf differ diff --git a/notes/ExpFamEntropyKL/ExpFamEntropyKL.tex b/notes/ExpFamEntropyKL/ExpFamEntropyKL.tex new file mode 100644 index 0000000..91f8151 --- /dev/null +++ b/notes/ExpFamEntropyKL/ExpFamEntropyKL.tex @@ -0,0 +1,121 @@ +\documentclass[a4paper, 11pt]{article} + +\usepackage{amsmath, amssymb, hyperref, ../../vimacros} +\hypersetup{breaklinks=true, colorlinks=true, linkcolor=blue, urlcolor=blue, citecolor=blue} + +\author{Philip Schulz and Wilker Aziz} +\title{Exponential Family Entropy and KL divergence} +\date{last modified: \today} + +\begin{document} + +\maketitle + +\begin{abstract} +This note shows how to compute the entropy and KL divergence (and by extension many other information-theoretic quantities) of +exponential family distributions. We use the univariate Gaussian distribution as a running example and encourage the reader to +apply the learned techniques to other distributions in exercises. We expect the reader to be familiar with exponential families +and some of their properties (such as moment computation). +\end{abstract} + +\section{Exponential Family Form} + +Most exponential family distributions are standardly written in what is called their canonical form. +In order to apply results proved for +exponential families to them, it is often convenient to rewrite them in their natural parametrisation. The general way of doing +this is to apply the identity function $ \exp(\log(\cdot)) $ to them and reorder terms. Before we do so for the Gaussian, let us +introduce the notation we will use for exponential families throughout. +\begin{equation} \label{eq:expFam} +p(x|\overbrace{\theta}^{\substack{\text{canonical} \\ \text{parameters}}}) = \overbrace{h(x)}^{\substack{\text{base} \\ \text{measure}}} \exp\left( \underbrace{\eta(\theta)^{\top}}_{\substack{\text{natural} \\ \text{parameters}}} \times \overbrace{t(x)}^{\substack{\text{sufficient} \\ \text{statistics}}} -\underbrace{A(\eta(\theta))}_{\text{log-normalizer}} \right) +\end{equation} +By default we assume that all terms above are vector-valued. Notice that the canonical parameters may be a function of any conditioning +context and thus Equation~\eqref{eq:expFam} covers marginal as well as conditional distributions. + +\paragraph{Gaussian Distribution} We first write the univariate Gaussian distribution +with mean $ \mu $ and variance $ \sigma^{2} $ in its +canonical form. In that case we have $ \theta^{\top} = \begin{bmatrix} \mu & \sigma^{2} \end{bmatrix} $. +\begin{equation}\label{eq:canonicalGaussian} +p(x|\mu, \sigma^{2}) = \frac{1}{\sqrt{2\pi}\sigma} \exp \left(-\frac{1}{2}\left(\frac{x - \mu}{\sigma}\right)^{2}\right) +\end{equation} + +Now we apply the identity function to transform this density into its exponential family form. 
+\begin{subequations} +\begin{align} +p(x|\mu, \sigma^{2}) &= \exp \left( \log \left( \frac{1}{\sqrt{2\pi}\sigma} \exp \left(-\frac{1}{2}\left(\frac{x - \mu}{\sigma}\right)^{2}\right) \right) \right) \\ +&= \exp \left( \log \left( \frac{1}{\sqrt{2\pi}} \right) - \log \sigma -\frac{1}{2}\frac{(x - \mu)^{2}}{\sigma^{2}}\right) \\ +&= \frac{1}{\sqrt{2\pi}}\exp \left( - \log \sigma -\frac{x^{2} - 2x\mu + \mu^{2}}{2\sigma^{2}}\right) \\ +&= \frac{1}{\sqrt{2\pi}} \exp \left(\frac{2x\mu -x^{2}}{2\sigma^{2}} - \log\sigma - \frac{\mu^{2}}{2\sigma^{2}}\right) \\ +&= \frac{1}{\sqrt{2\pi}} \exp \left( +\begin{bmatrix} \frac{\mu}{\sigma^{2}} & -\frac{1}{2\sigma^{2}} \end{bmatrix} \times +\begin{bmatrix} x \\ x^{2} \end{bmatrix} +- \log\sigma - \frac{\mu^{2}}{2\sigma^{2}}\right) +\end{align} +\end{subequations} +We recognise this as the exponential family form of the Gaussian with +\begin{equation} +\begin{aligned} +h(x) = \frac{1}{\sqrt{2\pi}} && t(x) &= \begin{bmatrix} x \\ x^{2} \end{bmatrix} \\ +A(\eta(\theta)) = \log \sigma + \frac{\mu^{2}}{2\sigma^{2}} && \eta(\theta) &= \begin{bmatrix} \frac{\mu}{\sigma^{2}} \\ -\frac{1}{2\sigma^{2}} \end{bmatrix} \ . +\end{aligned} +\end{equation} + +\paragraph{Exercise} Derive the exponential family form of the multivariate Gaussian, exponential and binomial distributions. + +\section{Entropy of Exponential Families} + +First, recall the definition of entropy: +\begin{equation} +\Ent{X} = -\E{\log p(X|\theta)} +\end{equation} +A convenient property of exponential families is that the only terms depending on $ X $ are the base measure and the sufficient +statistics. This means we can split up the expectation term. +\begin{subequations} +\begin{align} +\Ent{X} &= -\E{\log(h(X)) + \eta(\theta)^{\top} t(X) - A(\eta(\theta))} \\ +&=- \E{\log(h(X))} - \eta(\theta)^{\top} \E{t(X)} + A(\eta(\theta)) \label{eq:expFamEntropy} +\end{align} +\end{subequations} +In order to compute the expected sufficient statistics we exploit another convenient property +of exponential families. +\begin{equation} +\E{t(X)} = \frac{d}{d\eta(\theta)}A(\eta(\theta)) +\end{equation} + +\paragraph{Gaussian Entropy} We now compute the entropy of the Gaussian using the above properties. First, we compute the vector +of partial derivatives to get the expected sufficient statistics. The derivatives are taken with respect to the natural parameters, so when differentiating with respect to $ -\frac{1}{2\sigma^{2}} $ we hold the other natural parameter $ \frac{\mu}{\sigma^{2}} $ fixed and rewrite $ \frac{\mu^{2}}{2\sigma^{2}} = \frac{1}{2}\left(\frac{\mu}{\sigma^{2}}\right)^{2}\left(\sigma^{-2}\right)^{-1} $. +\begin{equation} +\frac{\partial}{\partial \frac{\mu}{\sigma^{2}}} \left\{ \log \sigma + \frac{\mu^{2}}{2\sigma^{2}} \right\} = +\sigma^{2} \frac{\partial}{\partial \mu} \left\{ \frac{\mu^{2}}{2\sigma^{2}} \right\} = \mu +\end{equation} +\begin{subequations} +\begin{align} +\frac{\partial}{\partial \left(-\frac{1}{2}\sigma^{-2}\right)} \left\{ \log \sigma + \frac{\mu^{2}}{2\sigma^{2}} \right\} &= +-2\frac{\partial}{\partial \sigma^{-2}} \left\{ -\frac{1}{2} \log \left(\sigma^{-2} \right) + \frac{1}{2}\left(\frac{\mu}{\sigma^{2}}\right)^{2} \left(\sigma^{-2}\right)^{-1} \right\} \\ +&= -2 \left( -\frac{\sigma^{2}}{2} - \frac{1}{2}\left(\frac{\mu}{\sigma^{2}}\right)^{2} \sigma^{4} \right) = \sigma^{2} + \mu^{2} +\end{align} +\end{subequations} + +In the case of the Gaussian we can easily verify that these derivatives are correct by computing the expected sufficient statistics +directly. +\begin{align} +\E{X} = \mu && \E{X^{2}} = \sigma^{2} + \mu^{2} +\end{align} +For the second term we have used the equality $ var(X) = \E{X^{2}} - \E{X}^{2} $.
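+ +As an additional sanity check, we can express the log-normaliser directly as a function of the natural parameters $ \eta_{1} = \frac{\mu}{\sigma^{2}} $ and $ \eta_{2} = -\frac{1}{2\sigma^{2}} $. Using $ \sigma^{2} = -\frac{1}{2\eta_{2}} $ and $ \mu = -\frac{\eta_{1}}{2\eta_{2}} $ we obtain +\begin{equation} +A(\eta) = -\frac{1}{2}\loga{-2\eta_{2}} - \frac{\eta_{1}^{2}}{4\eta_{2}} +\end{equation} +and its partial derivatives recover exactly the expected sufficient statistics: +\begin{align} +\frac{\partial A(\eta)}{\partial \eta_{1}} = -\frac{\eta_{1}}{2\eta_{2}} = \mu && \frac{\partial A(\eta)}{\partial \eta_{2}} = -\frac{1}{2\eta_{2}} + \frac{\eta_{1}^{2}}{4\eta_{2}^{2}} = \sigma^{2} + \mu^{2} \ . +\end{align}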
+ +Plugging the expected sufficient statistics into the general entropy term (Equation~\eqref{eq:expFamEntropy}), we get +\begin{subequations} +\begin{align} +\Ent{X} &=- \loga{\frac{1}{\sqrt{2\pi}}} - \begin{bmatrix} \frac{\mu}{\sigma^{2}} & -\frac{1}{2\sigma^{2}} \end{bmatrix} \times +\begin{bmatrix} \mu \\ \sigma^{2} + \mu^{2} \end{bmatrix} + \log \sigma + \frac{\mu^{2}}{2\sigma^{2}} \\ +&= -\loga{\frac{1}{\sqrt{2\pi}}} - \frac{\mu^{2}}{\sigma^{2}} + \frac{1}{2} + \frac{\mu^{2}}{2\sigma^{2}} + \log \sigma + \frac{\mu^{2}}{2\sigma^{2}} \\ +&= -\loga{\frac{1}{\sqrt{2\pi}}} - \frac{\mu^{2}}{\sigma^{2}} + \frac{2\mu^{2}}{2\sigma^{2}} + \frac{1}{2} + \log \sigma \\ +&= -\loga{\frac{1}{\sqrt{2\pi}}} + \frac{1}{2} + \log \sigma \\ +&= \frac{1}{2} \log(2\pi) + \frac{1}{2} + \frac{1}{2} \loga{\sigma^{2}} \\ +&= \frac{1}{2} \left(\loga{2\pi\sigma^{2}} + 1\right) \ . +\end{align} +\end{subequations} +As we would expect, the Gaussian entropy only depends on the variance but not on the mean. + +\paragraph{Exercise} Follow the steps above to compute the entropy of the multivariate Gaussian, exponential and binomial distributions. + +\end{document} \ No newline at end of file
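As a quick numerical check on the sign and axis fix in diagonal_gaussian_kl (and on the matching correction to the KL formula on the M6 slides), the closed-form KL to a standard normal can be compared against a Monte Carlo estimate. The sketch below is not part of the patch: it uses plain numpy instead of mxnet symbols so it runs standalone, and the helper names are illustrative only.

import numpy as np

def diagonal_gaussian_kl(mean, std):
    """Closed-form KL(N(mean, diag(std^2)) || N(0, I)), summed over the last axis.
    Mirrors the patched formula: -0.5 * sum(1 + log(var) - mean^2 - var, axis=1)."""
    var = std ** 2
    return -0.5 * np.sum(1.0 + np.log(var) - mean ** 2 - var, axis=1)

def monte_carlo_kl(mean, std, n_samples=200000, seed=0):
    """Estimate the same KL by sampling z ~ q and averaging log q(z) - log p(z)."""
    rng = np.random.default_rng(seed)
    eps = rng.standard_normal((n_samples,) + mean.shape)
    z = mean + std * eps  # reparametrised samples from q
    log_q = -0.5 * (np.log(2 * np.pi * std ** 2) + ((z - mean) / std) ** 2)
    log_p = -0.5 * (np.log(2 * np.pi) + z ** 2)
    return (log_q - log_p).sum(axis=-1).mean(axis=0)

mean = np.array([[0.5, -1.0, 0.0]])
std = np.array([[1.5, 0.3, 1.0]])
print("closed form:", diagonal_gaussian_kl(mean, std))  # roughly [1.59]
print("monte carlo:", monte_carlo_kl(mean, std))        # should agree to about two decimals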