diff --git a/es/separable_natural_es.py b/es/separable_natural_es.py
index a1bd3de..c31071b 100644
--- a/es/separable_natural_es.py
+++ b/es/separable_natural_es.py
@@ -1,13 +1,17 @@
 import multiprocessing as mp
 import numpy as np
+import torch
+
 from . import lib
 
+
 def optimize(func, mu, sigma,
              learning_rate_mu=None, learning_rate_sigma=None,
              population_size=None, max_iter=2000,
              fitness_shaping=True, mirrored_sampling=True, record_history=False,
              rng=None,
-             parallel_threads=None):
+             parallel_threads=None,
+             optimizer=torch.optim.SGD):
     """
     Evolution strategies using the natural gradient of multinormal search distributions in natural coordinates.
     Does not consider covariances between parameters.
@@ -37,12 +41,21 @@ def optimize(func, mu, sigma,
     history_pop = []
     history_fitness = []
 
+    # convert mu to torch Variable and construct optimizer; force
+    # torch to use double representation
+    mu_torch = torch.autograd.Variable(torch.DoubleTensor(mu.copy()), requires_grad=True)
+    optimizer_mu = optimizer([mu_torch], lr=learning_rate_mu)
+
     while True:
-        s = rng.normal(0, 1, size=(population_size, *np.shape(mu)))
-        z = mu + sigma * s
+
+        # use numpy representation for generating individuals
+        mu_numpy = mu_torch.detach().numpy()
+
+        s = rng.normal(0, 1, size=(population_size, *np.shape(mu_numpy)))
+        z = mu_numpy + sigma * s
 
         if mirrored_sampling:
-            z = np.vstack([z, mu - sigma * s])
+            z = np.vstack([z, mu_numpy - sigma * s])
             s = np.vstack([s, -s])
 
         if parallel_threads is None:
@@ -65,26 +78,31 @@ def optimize(func, mu, sigma,
         else:
             utility = fitness
 
-        # update parameter of search distribution via natural gradient descent in natural coordinates
-        mu += learning_rate_mu * sigma * np.dot(utility, s)
-        sigma *= np.exp(learning_rate_sigma / 2. * np.dot(utility, s ** 2 - 1))
-
         if record_history:
-            history_mu.append(mu.copy())
+            history_mu.append(mu_numpy.copy())
             history_sigma.append(sigma.copy())
             history_pop.append(z.copy())
             history_fitness.append(fitness.copy())
 
-        generation += 1
-
         # exit if max iterations reached
         if generation > max_iter or np.all(sigma < 1e-10):
             break
 
-    return {'mu': mu,
+        # update parameters of search distribution via natural
+        # gradient descent in natural coordinates
+
+        # set gradient and use optimizer to update mu
+        mu_torch.grad = torch.autograd.Variable(torch.DoubleTensor(-sigma * np.dot(utility, s)))
+        optimizer_mu.step()
+
+        # manually update sigma
+        sigma *= np.exp(learning_rate_sigma / 2. * np.dot(utility, s ** 2 - 1))
+
+        generation += 1
+
+    return {'mu': mu_numpy,
             'sigma': sigma,
             'history_mu': history_mu,
             'history_sigma': history_sigma,
             'history_fitness': history_fitness,
             'history_pop': history_pop}
-
\ No newline at end of file
diff --git a/test/test_snes.py b/test/test_snes.py
index c37de58..d08680f 100644
--- a/test/test_snes.py
+++ b/test/test_snes.py
@@ -1,6 +1,7 @@
 import matplotlib.pyplot as plt
 import numpy as np
 import sys
+import torch
 
 sys.path.append('../')
 
@@ -25,7 +26,8 @@ def test_quadratic_1d():
     def f(x):
         return functions.f_1d(x, x0)
 
-    res = snes.optimize(f, np.array([mu]), np.array([sigma]), max_iter=MAX_ITER)
+    res = snes.optimize(f, np.array([mu]), np.array([sigma]),
+                        max_iter=MAX_ITER, rng=SEED)
 
     assert(abs(res['mu'] - x0) < TOLERANCE_1D), SEED
 
@@ -43,7 +45,8 @@ def test_quadratic_2d():
     def f(x):
         return functions.f_2d(x, x0, y0)
 
-    res = snes.optimize(f, np.array([mu_x, mu_y]), np.array([sigma_x, sigma_y]), max_iter=MAX_ITER)
+    res = snes.optimize(f, np.array([mu_x, mu_y]),
+                        np.array([sigma_x, sigma_y]), max_iter=MAX_ITER, rng=SEED)
 
     assert(abs(res['mu'][0] - x0) < TOLERANCE_2D), SEED
     assert(abs(res['mu'][1] - y0) < TOLERANCE_2D), SEED
@@ -62,7 +65,9 @@ def test_quadratic_2d_non_isotropic():
     def f(x):
         return functions.f_2d_nonisotropic(x, x0, y0)
 
-    res = snes.optimize(f, np.array([mu_x, mu_y]), np.array([sigma_x, sigma_y]), max_iter=MAX_ITER, record_history=True)
+    res = snes.optimize(f, np.array([mu_x, mu_y]),
+                        np.array([sigma_x, sigma_y]), max_iter=MAX_ITER,
+                        record_history=True, rng=SEED)
 
     assert(abs(res['mu'][0] - x0) < TOLERANCE_2D), SEED
     assert(abs(res['mu'][1] - y0) < TOLERANCE_2D), SEED
@@ -91,7 +96,69 @@ def f(x):
 
     res = snes.optimize(f, np.array([mu_x, mu_y]), np.array([sigma_x, sigma_y]),
                         # learning_rate_mu=0.1, learning_rate_sigma=0.00025,
-                        max_iter=MAX_ITER_ROSENBROCK)
+                        max_iter=MAX_ITER_ROSENBROCK,
+                        rng=SEED)
 
     assert(abs(res['mu'][0] - theo_min[0]) < TOLERANCE_ROSENBROCK), SEED
     assert(abs(res['mu'][1] - theo_min[1]) < TOLERANCE_ROSENBROCK), SEED
+
+
+def test_ann():
+    np.random.seed(SEED)
+    torch.manual_seed(SEED)
+
+    criterion = torch.nn.MSELoss()
+    trials = 2
+    inner_iterations = 50
+    in_features = 2
+    hidden_units = 4
+    out_features = 1
+
+    class ANN(torch.nn.Module):
+
+        def __init__(self):
+            super().__init__()
+
+            self.fc1 = torch.nn.Linear(in_features, hidden_units)
+            self.fc2 = torch.nn.Linear(hidden_units, out_features)
+
+        def forward(self, x):
+            h = torch.nn.functional.tanh(self.fc1(x))
+            return self.fc2(h)
+
+        def set_parameters(self, z):
+            offset = 0
+            for m in self.children():
+
+                weight_size = m.in_features * m.out_features
+                m.weight.data = torch.Tensor(z[offset:offset + weight_size].reshape(m.out_features, m.in_features))
+                offset += weight_size
+
+                bias_size = m.out_features
+                m.bias.data = torch.Tensor(z[offset:offset + bias_size])
+                offset += bias_size
+
+    for _ in range(trials):
+
+        model_target = ANN()
+        model = ANN()
+
+        def f(z):
+
+            model.set_parameters(z)
+
+            loss = 0
+            for x in 2 * torch.rand(inner_iterations, in_features) - 1:
+                target = torch.autograd.Variable(model_target(x), requires_grad=False)
+                loss += criterion(model(x), target)
+
+            return -loss / inner_iterations
+
+        param_count = in_features * hidden_units + hidden_units + hidden_units * out_features + out_features
+        mu = np.random.randn(param_count)
+        sigma = np.ones(param_count)
+
+        res = snes.optimize(f, mu, sigma, record_history=True,
+                            max_iter=100, rng=SEED, optimizer=torch.optim.SGD)
+
+        assert(abs(np.mean(res['history_fitness'][-1])) < 0.1)
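
For reference, a minimal usage sketch of the new optimizer argument, not part of the patch above. It assumes the module is importable as es.separable_natural_es (the package layout shown in the diff), that rng accepts an integer seed as in the tests, and it uses torch.optim.Adam purely to illustrate passing a different torch optimizer class for the mu update; the toy objective below is illustrative only.

import numpy as np
import torch

from es import separable_natural_es as snes


# toy objective to maximize; its optimum is at the origin
def f(x):
    return -np.sum(x ** 2)


# mu is updated through the supplied torch optimizer class, while sigma
# is still updated by the manual natural-gradient rule inside optimize()
res = snes.optimize(f, np.array([1.0, -1.0]), np.array([1.0, 1.0]),
                    max_iter=500, rng=42, optimizer=torch.optim.Adam)
print(res['mu'])  # expected to move toward [0, 0]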