diff --git a/.clang-format b/.clang-format
index 1c96eda..04ec738 100644
--- a/.clang-format
+++ b/.clang-format
@@ -1,6 +1,8 @@
-Language:        Cpp
+Language:      Cpp
 BasedOnStyle:  Google
 AllowShortFunctionsOnASingleLine: None
 AllowShortIfStatementsOnASingleLine: false
 AllowShortLoopsOnASingleLine: false
-PenaltyBreakComment: 100
+BinPackArguments:  false
+BinPackParameters: false
+ReflowComments:    false
diff --git a/.clang-tidy b/.clang-tidy
new file mode 100644
index 0000000..ea4b8ea
--- /dev/null
+++ b/.clang-tidy
@@ -0,0 +1,8 @@
+---
+Checks: 'clang-diagnostic-*,clang-analyzer-*,bugprone-*,cert-*,-cert-msc*,performance-*,modernize-*,-modernize-use-trailing-return-type,google-*,-modernize-avoid-c-arrays,cppcoreguidelines-*,-cppcoreguidelines-avoid-magic-numbers,-cppcoreguidelines-pro-*,-cppcoreguidelines-init-*,-cppcoreguidelines-owning-memory,-cppcoreguidelines-avoid-c-arrays,readability-*,-readability-magic-numbers,-readability-function-cognitive-complexity,-readability-function-size,-readability-identifier-naming'
+WarningsAsErrors: ''
+HeaderFilterRegex: '.*'
+AnalyzeTemporaryDtors: false
+FormatStyle:     file
+...
+
diff --git a/setup.py b/setup.py
index 709dbb0..2b24167 100644
--- a/setup.py
+++ b/setup.py
@@ -40,6 +40,7 @@
                                       'tsne/bh_sne_src/tsne.cpp', 'tsne/bh_sne.pyx'],
                              include_dirs=[
                                  numpy.get_include(), 'tsne/bh_sne_src/'],
+                             define_macros=[('BUILDING_TSNE_DLL', None)],
                              extra_compile_args=extra_compile_args +
                              ['-ffast-math', '-O3', '-std=c++14'],
                              extra_link_args=['-Wl,-framework',
@@ -63,6 +64,7 @@
                                       'tsne/bh_sne_src/tsne.cpp', 'tsne/bh_sne.pyx'],
                              include_dirs=[
                                  numpy.get_include(), 'tsne/bh_sne_src/'],
+                             define_macros=[('BUILDING_TSNE_DLL', None)],
                              extra_compile_args=opt_flags +
                              ['-Wall', '-fPIC', '-std=c++14', '-w'],
                              extra_link_args=ldflags,
diff --git a/tsne/bh_sne_src/Makefile b/tsne/bh_sne_src/Makefile
index 95534dc..82658e4 100644
--- a/tsne/bh_sne_src/Makefile
+++ b/tsne/bh_sne_src/Makefile
@@ -3,7 +3,7 @@
 #CFLAGS = -march=haswell -ffast-math -O3
 
 CXX?=g++
-CFLAGS?=-ffast-math -O3 -Wall -std=c++14
+CFLAGS?=-ffast-math -O3 -Wall -std=c++17
 
 all: bh_tsne
 
diff --git a/tsne/bh_sne_src/main.cpp b/tsne/bh_sne_src/main.cpp
index 6fe7ec5..539b65f 100644
--- a/tsne/bh_sne_src/main.cpp
+++ b/tsne/bh_sne_src/main.cpp
@@ -37,15 +37,25 @@
 
 #include "tsne.h"
 
-using namespace std;
+using std::fclose;
+using std::feof;
+using std::fprintf;
+using std::fread;
+using std::vector;
 
 // Function that loads data from a t-SNE file
-bool load_data(const char* dat_file, vector<double>* data, int* n, int* d,
-               int* no_dims, double* theta, double* perplexity, int* rand_seed,
+bool load_data(const char* dat_file,
+               vector<double>* data,
+               int* n,
+               int* d,
+               int* no_dims,
+               double* theta,
+               double* perplexity,
+               int* rand_seed,
                int* max_iter) {
   // Open file, read first 2 integers, allocate memory, and read the data
-  FILE* h;
-  if ((h = fopen(dat_file, "r+b")) == nullptr) {
+  auto* h = fopen(dat_file, "r+b");
+  if (h == nullptr) {
     fprintf(stderr, "Error: could not open data file.\n");
     return false;
   }
@@ -57,16 +67,20 @@ bool load_data(const char* dat_file, vector<double>* data, int* n, int* d,
   fread(max_iter, sizeof(int), 1, h);       // maximum number of iterations
   data->resize(*d * *n);
   fread(data->data(), sizeof(double), *n * *d, h);  // the data
-  if (!feof(h))
+  if (feof(h) == 0) {
     fread(rand_seed, sizeof(int), 1, h);  // random seed
+  }
   fclose(h);
   fprintf(stderr, "Read the %i x %i data matrix successfully!\n", *n, *d);
   return true;
 }
 
 // Function that saves map to a t-SNE file
-void save_data(const char* res_file, const vector<double>& data,
-               const vector<int>& landmarks, const vector<double>& costs, int n,
+void save_data(const char* res_file,
+               const vector<double>& data,
+               const vector<int>& landmarks,
+               const vector<double>& costs,
+               int n,
                int d) {
   // Open file, write first 2 integers and then the data
   FILE* h;
@@ -116,25 +130,48 @@ int main(int argc, char* argv[]) {
   const char* res_file_c = res_file.c_str();
 
   // Define some variables
-  int origN, N, D, no_dims, max_iter;
-  double perplexity, theta;
+  int origN;
+  int N;
+  int D;
+  int no_dims;
+  int max_iter;
+  double perplexity;
+  double theta;
   vector<double> data;
   int rand_seed = -1;
 
   // Read the parameters and the dataset
-  if (load_data(dat_file_c, &data, &origN, &D, &no_dims, &theta, &perplexity,
-                &rand_seed, &max_iter)) {
+  if (load_data(dat_file_c,
+                &data,
+                &origN,
+                &D,
+                &no_dims,
+                &theta,
+                &perplexity,
+                &rand_seed,
+                &max_iter)) {
     // Make dummy landmarks
     N = origN;
     vector<int> landmarks(N);
-    for (int n = 0; n < N; n++)
+    for (int n = 0; n < N; n++) {
       landmarks[n] = n;
+    }
 
     // Now fire up the SNE implementation
     vector<double> Y(N * no_dims);
     vector<double> costs(N);
-    run(data.data(), N, D, Y.data(), no_dims, perplexity, theta, rand_seed,
-        false, nullptr, false, max_iter);
+    run(data.data(),
+        N,
+        D,
+        Y.data(),
+        no_dims,
+        perplexity,
+        theta,
+        rand_seed,
+        false,
+        nullptr,
+        false,
+        max_iter);
 
     // Save the results
     save_data(res_file_c, Y, landmarks, costs, N, no_dims);
diff --git a/tsne/bh_sne_src/sptree.cpp b/tsne/bh_sne_src/sptree.cpp
index dbad881..a94d359 100644
--- a/tsne/bh_sne_src/sptree.cpp
+++ b/tsne/bh_sne_src/sptree.cpp
@@ -31,15 +31,17 @@
  *
  */
 
+#include "sptree.h"
+
 #include <array>
 #include <cassert>
 #include <cfloat>
 #include <cmath>
 #include <cstdio>
 
-#include "sptree.h"
-
+#ifdef __GNUC__
 #pragma GCC visibility push(hidden)
+#endif
 
 using std::array;
 using std::make_unique;
@@ -47,10 +49,12 @@ using std::vector;
 
 template <int NDims>
 Cell<NDims>::Cell(const double* inp_corner, const double* inp_width) {
-  for (int d = 0; d < NDims; d++)
+  for (int d = 0; d < NDims; d++) {
     setCorner(d, inp_corner[d]);
-  for (int d = 0; d < NDims; d++)
+  }
+  for (int d = 0; d < NDims; d++) {
     setWidth(d, inp_width[d]);
+  }
 }
 
 template <int NDims>
@@ -78,10 +82,12 @@ template <int NDims>
 bool Cell<NDims>::containsPoint(const double point[]) const {
   assert(point);
   for (int d = 0; d < NDims; d++) {
-    if (corner[d] - width[d] > point[d])
+    if (corner[d] - width[d] > point[d]) {
       return false;
-    if (corner[d] + width[d] < point[d])
+    }
+    if (corner[d] + width[d] < point[d]) {
       return false;
+    }
   }
   return true;
 }
@@ -101,21 +107,25 @@ SPTree<NDims>::SPTree(const double* inp_data, unsigned int N) : data(inp_data) {
   for (unsigned int n = 0; n < N; n++) {
     for (unsigned int d = 0; d < NDims; d++) {
       mean_Y[d] += inp_data[n * NDims + d];
-      if (inp_data[nD + d] < min_Y[d])
+      if (inp_data[nD + d] < min_Y[d]) {
         min_Y[d] = inp_data[nD + d];
-      if (inp_data[nD + d] > max_Y[d])
+      }
+      if (inp_data[nD + d] > max_Y[d]) {
         max_Y[d] = inp_data[nD + d];
+      }
     }
     nD += NDims;
   }
 
-  for (int d = 0; d < NDims; d++)
-    mean_Y[d] /= (double)N;
+  for (int d = 0; d < NDims; d++) {
+    mean_Y[d] /= static_cast<double>(N);
+  }
 
   // Construct SPTree
   array<double, NDims> width;
-  for (int d = 0; d < NDims; d++)
+  for (int d = 0; d < NDims; d++) {
     width[d] = fmax(max_Y[d] - mean_Y[d], mean_Y[d] - min_Y[d]) + 1e-5;
+  }
   node.init(mean_Y.data(), width.data());
   fill(N);
 }
@@ -126,13 +136,16 @@ void SPTree<NDims>::Node::init(const double* inp_corner,
                                const double* inp_width) {
   cum_size = 0;
 
-  for (unsigned int d = 0; d < NDims; d++)
+  for (unsigned int d = 0; d < NDims; d++) {
     boundary.setCorner(d, inp_corner[d]);
-  for (unsigned int d = 0; d < NDims; d++)
+  }
+  for (unsigned int d = 0; d < NDims; d++) {
     boundary.setWidth(d, inp_width[d]);
+  }
 
-  for (unsigned int d = 0; d < NDims; d++)
+  for (unsigned int d = 0; d < NDims; d++) {
     center_of_mass[d] = .0;
+  }
 }
 
 // Insert a point into the SPTree
@@ -141,13 +154,15 @@ bool SPTree<NDims>::Node::insert(const double* data, unsigned int new_index) {
   assert(data);
   // Ignore objects which do not belong in this quad tree
   const double* point = data + new_index * NDims;
-  if (!boundary.containsPoint(point))
+  if (!boundary.containsPoint(point)) {
     return false;
+  }
 
   // Online update of cumulative size and center-of-mass
   ++cum_size;
-  double mult1 = (double)(cum_size - 1) / (double)cum_size;
-  double mult2 = 1.0 / (double)cum_size;
+  double mult1 =
+      static_cast<double>(cum_size - 1) / static_cast<double>(cum_size);
+  double mult2 = 1.0 / static_cast<double>(cum_size);
 
   for (unsigned int d = 0; d < NDims; d++) {
     center_of_mass[d] = center_of_mass[d] * mult1 + mult2 * point[d];
@@ -168,18 +183,22 @@ bool SPTree<NDims>::Node::insert(const double* data, unsigned int new_index) {
         break;
       }
     }
-    if (duplicate)
+    if (duplicate) {
       return true;
+    }
   }
 
   // Otherwise, we need to subdivide the current cell
-  if (!children)
+  if (!children) {
     subdivide(data);
+  }
 
   // Find out where the point can be inserted
-  for (auto& child : *children)
-    if (child.insert(data, new_index))
+  for (auto& child : *children) {
+    if (child.insert(data, new_index)) {
       return true;
+    }
+  }
   // Otherwise, the point cannot be inserted (this should never happen)
   return false;
 }
@@ -199,10 +218,11 @@ void SPTree<NDims>::Node::subdivide(const double* data) {
   for (unsigned int i = 0; i < no_children; i++) {
     unsigned int div = 1;
     for (unsigned int d = 0; d < NDims; d++) {
-      if ((i / div) % 2 == 1)
+      if ((i / div) % 2 == 1) {
         new_corner[d] = boundary.getCorner(d) - .5 * boundary.getWidth(d);
-      else
+      } else {
         new_corner[d] = boundary.getCorner(d) + .5 * boundary.getWidth(d);
+      }
       div *= 2;
     }
     (*children)[i].init(new_corner, new_width);
@@ -220,8 +240,9 @@ void SPTree<NDims>::Node::subdivide(const double* data) {
 // Build SPTree on dataset
 template <int NDims>
 void SPTree<NDims>::fill(unsigned int N) {
-  for (unsigned int i = 0; i < N; i++)
+  for (unsigned int i = 0; i < N; i++) {
     node.insert(data, i);
+  }
 }
 
 // Checks whether the specified tree is correct
@@ -234,8 +255,9 @@ template <int NDims>
 bool SPTree<NDims>::Node::isCorrect(const double* data) const {
   if (!children && cum_size > 0) {
     const double* point = data + index * NDims;
-    if (!boundary.containsPoint(point))
+    if (!boundary.containsPoint(point)) {
       return false;
+    }
   }
   if (children) {
     for (const auto& child : *children) {
@@ -264,26 +286,33 @@ unsigned int SPTree<NDims>::Node::getAllIndices(unsigned int* indices,
 
   // Gather indices in children
   if (children) {
-    for (auto& child : *children)
+    for (auto& child : *children) {
       loc = child.getAllIndices(indices, loc);
+    }
   }
   return loc;
 }
 
 // Compute non-edge forces using Barnes-Hut algorithm
 template <int NDims>
-void SPTree<NDims>::computeNonEdgeForces(unsigned int point_index, double theta,
-                                         double neg_f[], double* sum_Q) const {
+void SPTree<NDims>::computeNonEdgeForces(unsigned int point_index,
+                                         double theta,
+                                         double neg_f[],
+                                         double* sum_Q) const {
   node.computeNonEdgeForces(data, point_index, theta, neg_f, sum_Q);
 }
 
 template <int NDims>
 [[gnu::hot]] void SPTree<NDims>::Node::computeNonEdgeForces(
-    const double* data, unsigned int point_index, double theta, double neg_f[],
+    const double* data,
+    unsigned int point_index,
+    double theta,
+    double neg_f[],
     double* sum_Q) const {
   // Make sure that we spend no time on empty nodes or self-interactions
-  if (cum_size == 0 || (!children && cum_size >= 1 && index == point_index))
+  if (cum_size == 0 || (!children && cum_size >= 1 && index == point_index)) {
     return;
+  }
 
   // Compute distance between point and center-of-mass
   double sqdist = .0;
@@ -307,12 +336,14 @@ template <int NDims>
     double mult = cum_size * sqdist;
     *sum_Q += mult;
     mult *= sqdist;
-    for (unsigned int d = 0; d < NDims; d++)
+    for (unsigned int d = 0; d < NDims; d++) {
       neg_f[d] += mult * buff[d];
+    }
   } else {
     // Recursively apply Barnes-Hut to children
-    for (const auto& child : *children)
+    for (const auto& child : *children) {
       child.computeNonEdgeForces(data, point_index, theta, neg_f, sum_Q);
+    }
   }
 }
 
@@ -351,8 +382,9 @@ vector<double> SPTree<NDims>::Node::computeEdgeForces(const double* data,
       sqdist = val_P[i] / sqdist;
 
       // Sum positive force
-      for (unsigned int d = 0; d < NDims; d++)
+      for (unsigned int d = 0; d < NDims; d++) {
         pos_f[ind1 + d] += sqdist * buff[d];
+      }
     }
     ind1 += NDims;
   }
@@ -376,18 +408,21 @@ void SPTree<NDims>::Node::print(const double* data) const {
     fprintf(stderr, "Leaf node; data = [");
     if (!children && cum_size > 0) {
       const double* point = data + index * NDims;
-      for (int d = 0; d < NDims; d++)
+      for (int d = 0; d < NDims; d++) {
         fprintf(stderr, "%f, ", point[d]);
+      }
       fprintf(stderr, " (index = %d)", index);
     }
     fprintf(stderr, "]\n");
   } else {
     fprintf(stderr, "Intersection node with center-of-mass = [");
-    for (int d = 0; d < NDims; d++)
+    for (int d = 0; d < NDims; d++) {
       fprintf(stderr, "%f, ", center_of_mass[d]);
+    }
     fprintf(stderr, "]; children are:\n");
-    for (const auto& child : *children)
+    for (const auto& child : *children) {
       child.print(data);
+    }
   }
 }
 
@@ -395,4 +430,6 @@ void SPTree<NDims>::Node::print(const double* data) const {
 template class SPTree<2>;
 template class SPTree<3>;
 
+#ifdef __GNUC__
 #pragma GCC visibility pop
+#endif
diff --git a/tsne/bh_sne_src/sptree.h b/tsne/bh_sne_src/sptree.h
index e399b60..0382aaa 100644
--- a/tsne/bh_sne_src/sptree.h
+++ b/tsne/bh_sne_src/sptree.h
@@ -38,7 +38,9 @@
 #include <memory>
 #include <vector>
 
+#ifdef __GNUC__
 #pragma GCC visibility push(hidden)
+#endif
 
 template <int NDims = 2>
 class alignas(16) Cell {
@@ -48,10 +50,14 @@ class alignas(16) Cell {
  public:
   Cell() = default;
   Cell(const double* inp_corner, const double* inp_width);
+  Cell(const Cell&) = delete;
+  Cell(Cell&&) noexcept = default;
+  Cell& operator=(const Cell&) = delete;
+  Cell& operator=(Cell&&) noexcept = default;
   ~Cell() = default;
 
-  double getCorner(unsigned int d) const;
-  double getWidth(unsigned int d) const;
+  [[nodiscard]] double getCorner(unsigned int d) const;
+  [[nodiscard]] double getWidth(unsigned int d) const;
   void setCorner(unsigned int d, double val);
   void setWidth(unsigned int d, double val);
   bool containsPoint(const double point[]) const;
@@ -69,13 +75,16 @@ class SPTree {
     bool insert(const double* data, unsigned int new_index);
     void subdivide(const double* data);
     void print(const double* data) const;
-    void computeNonEdgeForces(const double* data, unsigned int point_index,
-                              double theta, double neg_f[],
+    void computeNonEdgeForces(const double* data,
+                              unsigned int point_index,
+                              double theta,
+                              double neg_f[],
                               double* sum_Q) const;
     std::vector<double> computeEdgeForces(const double* data,
                                           const unsigned int* row_P,
                                           const unsigned int* col_P,
-                                          const double* val_P, int N) const;
+                                          const double* val_P,
+                                          int N) const;
     unsigned int getAllIndices(unsigned int* indices, unsigned int loc) const;
     void init(const double* inp_corner, const double* inp_width);
 
@@ -100,21 +109,24 @@ class SPTree {
 
   Node node;
 
-  SPTree(const SPTree<NDims>&) = delete;
-  SPTree& operator=(const SPTree<NDims>&) = delete;
-
  public:
   SPTree() = default;
-  SPTree(SPTree<NDims>&&) = default;
+  SPTree(SPTree<NDims>&&) noexcept = default;
+  SPTree(const SPTree<NDims>&) = delete;
+  SPTree& operator=(SPTree<NDims>&&) noexcept = default;
+  SPTree& operator=(const SPTree<NDims>&) = delete;
   SPTree(const double* inp_data, unsigned int N);
   ~SPTree() = default;
-  bool isCorrect() const;
+  [[nodiscard]] bool isCorrect() const;
   void getAllIndices(unsigned int* indices) const;
-  void computeNonEdgeForces(unsigned int point_index, double theta,
-                            double neg_f[], double* sum_Q) const;
+  void computeNonEdgeForces(unsigned int point_index,
+                            double theta,
+                            double neg_f[],
+                            double* sum_Q) const;
   std::vector<double> computeEdgeForces(const unsigned int* row_P,
                                         const unsigned int* col_P,
-                                        const double* val_P, int N) const;
+                                        const double* val_P,
+                                        int N) const;
   void print() const;
 
  private:
@@ -126,5 +138,8 @@ struct SPTree<0> {
   enum { no_children = 1 };
 };
 
+#ifdef __GNUC__
 #pragma GCC visibility pop
 #endif
+
+#endif  // !defined(SPTREE_H)
diff --git a/tsne/bh_sne_src/tsne.cpp b/tsne/bh_sne_src/tsne.cpp
index da43130..875a1f2 100644
--- a/tsne/bh_sne_src/tsne.cpp
+++ b/tsne/bh_sne_src/tsne.cpp
@@ -31,6 +31,8 @@
  *
  */
 
+#include "tsne.h"
+
 #include <array>
 #include <cassert>
 #include <cfloat>
@@ -38,14 +40,16 @@
 #include <cstdio>
 #include <cstdlib>
 #include <ctime>
+#include <memory>
 #include <utility>
 #include <vector>
 
 #include "sptree.h"
-#include "tsne.h"
 #include "vptree.h"
 
+#ifdef __GNUC__
 #pragma GCC visibility push(hidden)
+#endif
 
 using std::abs;
 using std::array;
@@ -54,46 +58,132 @@ using std::fprintf;
 using std::log;
 using std::move;
 using std::sqrt;
+using std::unique_ptr;
 using std::vector;
 
 namespace {
 
+struct TSNEState {
+ public:
+  TSNEState(int N,
+            double* const Y,
+            int no_dims,
+            double theta,
+            int max_iter,
+            int stop_lying_iter,
+            int mom_switch_iter,
+            int iter,
+            float total_time,
+            float residual_time,
+            vector<double>&& P,
+            vector<unsigned int>&& row_P,
+            vector<unsigned int>&& col_P,
+            vector<double>&& val_P,
+            vector<double>&& dY,
+            vector<double>&& uY,
+            vector<double>&& gains)
+      : N{N},
+        Y{Y},
+        no_dims{no_dims},
+        theta{theta},
+        max_iter{max_iter},
+        stop_lying_iter{stop_lying_iter},
+        mom_switch_iter{mom_switch_iter},
+        iter{iter},
+        total_time{total_time},
+        residual_time{residual_time},
+        P{P},
+        row_P{row_P},
+        col_P{col_P},
+        val_P{val_P},
+        dY{dY},
+        uY{uY},
+        gains{gains} {
+  }
+
+  bool step_by(int step);
+
+ private:
+  template <int NDIMS>
+  bool step_by_impl(int step);
+
+  const int N;
+  double* const Y;
+  const int no_dims;
+  const double theta;
+  const int max_iter;
+  const int stop_lying_iter;
+  const int mom_switch_iter;
+  int iter;
+  float total_time;
+  float residual_time;
+  vector<double> P;
+  vector<unsigned int> row_P;
+  vector<unsigned int> col_P;
+  vector<double> val_P;
+  vector<double> dY;
+  vector<double> uY;
+  vector<double> gains;
+};
+
 template <int NDIMS>
-void run(double* X, int N, int D, double* Y, double perplexity, double theta,
-         int rand_seed, bool skip_random_init, double* init, bool use_init,
-         int max_iter, int stop_lying_iter, int mom_switch_iter);
+void run(double* X,
+         int N,
+         int D,
+         double* Y,
+         double perplexity,
+         double theta,
+         int rand_seed,
+         bool skip_random_init,
+         double* init,
+         bool use_init,
+         int max_iter,
+         int stop_lying_iter,
+         int mom_switch_iter);
 
 template <int NDIMS>
 void computeGradient(const vector<unsigned int>& inp_row_P,
                      const vector<unsigned int>& inp_col_P,
-                     const vector<double>& inp_val_P, double* Y, int N,
-                     vector<double>& dC, double theta);
+                     const vector<double>& inp_val_P,
+                     const double* Y,
+                     int N,
+                     vector<double>* _dC,
+                     double theta);
 template <int D>
-void computeExactGradient(const vector<double>& P, double* Y, int N,
-                          double* dC);
+void computeExactGradient(const vector<double>& P,
+                          const double* Y,
+                          int N,
+                          vector<double>* _dC);
 template <int D>
-double evaluateError(const vector<double>& P, double* Y, int N);
+double evaluateError(const vector<double>& P, const double* Y, int N);
 template <int NDIMS>
 double evaluateError(const vector<unsigned int>& row_P,
                      const vector<unsigned int>& col_P,
-                     const vector<double>& val_P, double* Y, int N,
+                     const vector<double>& val_P,
+                     const double* Y,
+                     int N,
                      double theta);
 void zeroMean(double* X, int N, int D);
 template <int D>
 void zeroMean(double* X, int N);
-void computeGaussianPerplexity(double* X, int N, int D, double* P,
-                               double perplexity);
-void computeGaussianPerplexity(double* X, int N, int D,
+void computeGaussianPerplexity(
+    const double* X, int N, int D, double* P, double perplexity);
+void computeGaussianPerplexity(const double* X,
+                               int N,
+                               int D,
                                vector<unsigned int>* _row_P,
                                vector<unsigned int>* _col_P,
-                               vector<double>* _val_P, double perplexity,
+                               vector<double>* _val_P,
+                               double perplexity,
                                int K);
-vector<double> computeSquaredEuclideanDistance(double* X, int N, int D);
+vector<double> computeSquaredEuclideanDistance(const double* X, int N, int D);
 double randn();
-void symmetrizeMatrix(vector<unsigned int>* row_P, vector<unsigned int>* col_P,
-                      vector<double>* val_P, int N);
+void symmetrizeMatrix(vector<unsigned int>* row_P,
+                      vector<unsigned int>* col_P,
+                      vector<double>* val_P,
+                      int N);
 
-static inline double sign(double x) {
+inline double sign(const double x) {
   return (x == .0 ? .0 : (x < .0 ? -1.0 : 1.0));
 }
 
@@ -101,20 +191,25 @@ static inline double sign(double x) {
 template <int NDIMS>
 void computeGradient(const vector<unsigned int>& inp_row_P,
                      const vector<unsigned int>& inp_col_P,
-                     const vector<double>& inp_val_P, double* Y, int N,
-                     vector<double>& dC, double theta) {
+                     const vector<double>& inp_val_P,
+                     const double* const Y,
+                     const int N,
+                     vector<double>* const _dC,
+                     const double theta) {
   // Construct space-partitioning tree on current map
   SPTree<NDIMS> tree(Y, N);
 
   // Compute all terms required for t-SNE gradient
   double sum_Q = .0;
   vector<double> neg_f(N * NDIMS);
-  auto pos_f = tree.computeEdgeForces(inp_row_P.data(), inp_col_P.data(),
-                                      inp_val_P.data(), N);
-  for (int n = 0; n < N; n++)
+  auto pos_f = tree.computeEdgeForces(
+      inp_row_P.data(), inp_col_P.data(), inp_val_P.data(), N);
+  for (int n = 0; n < N; n++) {
     tree.computeNonEdgeForces(n, theta, neg_f.data() + n * NDIMS, &sum_Q);
+  }
 
   // Compute final t-SNE gradient
+  vector<double>& dC = *_dC;
   for (int i = 0; i < N * NDIMS; i++) {
     dC[i] = pos_f[i] - (neg_f[i] / sum_Q);
   }
@@ -122,9 +217,12 @@ void computeGradient(const vector<unsigned int>& inp_row_P,
 
 // Compute gradient of the t-SNE cost function (exact)
 template <int D>
-void computeExactGradient(const vector<double>& P, double* Y, int N,
-                          vector<double>& dC) {
+void computeExactGradient(const vector<double>& P,
+                          double* const Y,
+                          const int N,
+                          vector<double>* const _dC) {
   // Make sure the current gradient contains zeros
+  vector<double>& dC = *_dC;
   dC.assign(N * D, 0.0);
 
   // Compute the squared Euclidean distance matrix
@@ -165,7 +263,9 @@ void computeExactGradient(const vector<double>& P, double* Y, int N,
 
 // Evaluate t-SNE cost function (exactly)
 template <int D>
-double evaluateError(const vector<double>& P, double* Y, int N) {
+double evaluateError(const vector<double>& P,
+                     const double* const Y,
+                     const int N) {
   // Compute the squared Euclidean distance matrix
   vector<double> DD = computeSquaredEuclideanDistance(Y, N, D);
   vector<double> Q(N * N);
@@ -178,13 +278,15 @@ double evaluateError(const vector<double>& P, double* Y, int N) {
       if (n != m) {
         Q[nN + m] = 1 / (1 + DD[nN + m]);
         sum_Q += Q[nN + m];
-      } else
+      } else {
         Q[nN + m] = DBL_MIN;
+      }
     }
     nN += N;
   }
-  for (int i = 0; i < N * N; i++)
+  for (int i = 0; i < N * N; i++) {
     Q[i] /= sum_Q;
+  }
 
   // Sum t-SNE error
   double C = .0;
@@ -198,32 +300,40 @@ double evaluateError(const vector<double>& P, double* Y, int N) {
 template <int NDIMS>
 double evaluateError(const vector<unsigned int>& row_P,
                      const vector<unsigned int>& col_P,
-                     const vector<double>& val_P, double* Y, int N,
-                     double theta) {
+                     const vector<double>& val_P,
+                     const double* const Y,
+                     const int N,
+                     const double theta) {
   // Get estimate of normalization term
   array<double, NDIMS> buff;
   buff.fill(0);
   double sum_Q = .0;
   {
     SPTree<NDIMS> tree(Y, N);
-    for (int n = 0; n < N; n++)
+    for (int n = 0; n < N; n++) {
       tree.computeNonEdgeForces(n, theta, buff.data(), &sum_Q);
+    }
   }
 
   // Loop over all edges to compute t-SNE error
-  int ind1, ind2;
-  double C = .0, Q;
+  int ind1;
+  int ind2;
+  double C = .0;
+  double Q;
   for (int n = 0; n < N; n++) {
     ind1 = n * NDIMS;
-    for (int i = row_P[n]; i < row_P[n + 1]; i++) {
+    for (unsigned int i = row_P[n]; i < row_P[n + 1]; i++) {
       Q = .0;
       ind2 = col_P[i] * NDIMS;
-      for (int d = 0; d < NDIMS; d++)
+      for (int d = 0; d < NDIMS; d++) {
         buff[d] = Y[ind1 + d];
-      for (int d = 0; d < NDIMS; d++)
+      }
+      for (int d = 0; d < NDIMS; d++) {
         buff[d] -= Y[ind2 + d];
-      for (int d = 0; d < NDIMS; d++)
+      }
+      for (int d = 0; d < NDIMS; d++) {
         Q += buff[d] * buff[d];
+      }
       Q = (1.0 / (1.0 + Q)) / sum_Q;
       C += val_P[i] * log((val_P[i] + FLT_MIN) / (Q + FLT_MIN));
     }
@@ -234,8 +344,11 @@ double evaluateError(const vector<unsigned int>& row_P,
 }
 
 // Compute input similarities with a fixed perplexity
-void computeGaussianPerplexity(double* X, int N, int D, double* P,
-                               double perplexity) {
+void computeGaussianPerplexity(const double* const X,
+                               const int N,
+                               const int D,
+                               double* const P,
+                               const double perplexity) {
   // Compute the squared Euclidean distance matrix
   vector<double> DD = computeSquaredEuclideanDistance(X, N, D);
 
@@ -254,17 +367,20 @@ void computeGaussianPerplexity(double* X, int N, int D, double* P,
     int iter = 0;
     while (!found && iter < 200) {
       // Compute Gaussian kernel row
-      for (int m = 0; m < N; m++)
+      for (int m = 0; m < N; m++) {
         P[nN + m] = exp(-beta * DD[nN + m]);
+      }
       P[nN + n] = DBL_MIN;
 
       // Compute entropy of current row
       sum_P = DBL_MIN;
-      for (int m = 0; m < N; m++)
+      for (int m = 0; m < N; m++) {
         sum_P += P[nN + m];
+      }
       double H = 0.0;
-      for (int m = 0; m < N; m++)
+      for (int m = 0; m < N; m++) {
         H += beta * (DD[nN + m] * P[nN + m]);
+      }
       H = (H / sum_P) + log(sum_P);
 
       // Evaluate whether the entropy is within the tolerance level
@@ -274,16 +390,18 @@ void computeGaussianPerplexity(double* X, int N, int D, double* P,
       } else {
         if (Hdiff > 0) {
           min_beta = beta;
-          if (max_beta == DBL_MAX || max_beta == -DBL_MAX)
+          if (max_beta == DBL_MAX || max_beta == -DBL_MAX) {
             beta *= 2.0;
-          else
+          } else {
             beta = (beta + max_beta) / 2.0;
+          }
         } else {
           max_beta = beta;
-          if (min_beta == -DBL_MAX || min_beta == DBL_MAX)
+          if (min_beta == -DBL_MAX || min_beta == DBL_MAX) {
             beta /= 2.0;
-          else
+          } else {
             beta = (beta + min_beta) / 2.0;
+          }
         }
       }
 
@@ -292,20 +410,25 @@ void computeGaussianPerplexity(double* X, int N, int D, double* P,
     }
 
     // Row normalize P
-    for (int m = 0; m < N; m++)
+    for (int m = 0; m < N; m++) {
       P[nN + m] /= sum_P;
+    }
     nN += N;
   }
 }
 
 // Compute input similarities with a fixed perplexity using ball trees.
-void computeGaussianPerplexity(double* X, int N, int D,
+void computeGaussianPerplexity(const double* const X,
+                               const int N,
+                               const int D,
                                vector<unsigned int>* _row_P,
                                vector<unsigned int>* _col_P,
-                               vector<double>* _val_P, double perplexity,
-                               int K) {
-  if (perplexity > K)
+                               vector<double>* _val_P,
+                               const double perplexity,
+                               const int K) {
+  if (perplexity > K) {
     fprintf(stderr, "Perplexity should be lower than K!\n");
+  }
 
   // Allocate the memory we need
   _row_P->resize(N + 1);
@@ -316,14 +439,16 @@ void computeGaussianPerplexity(double* X, int N, int D,
   vector<double>& val_P = *_val_P;
   vector<double> cur_P(N - 1);
   row_P[0] = 0;
-  for (int n = 0; n < N; n++)
-    row_P[n + 1] = row_P[n] + (unsigned int)K;
+  for (int n = 0; n < N; n++) {
+    row_P[n + 1] = row_P[n] + static_cast<unsigned int>(K);
+  }
 
   // Build ball tree on data set
-  VpTree<unsigned int, euclidean_distance> tree((euclidean_distance(D, X)));
+  VpTree<unsigned int, euclidean_distance> tree(euclidean_distance(D, X));
   vector<unsigned int> obj_X(N);
-  for (int n = 0; n < N; n++)
+  for (int n = 0; n < N; n++) {
     obj_X[n] = n;
+  }
   tree.create(obj_X);
 
   // Loop over all points to find nearest neighbors
@@ -333,8 +458,9 @@ void computeGaussianPerplexity(double* X, int N, int D,
   indices.reserve(N);
   distances.reserve(N);
   for (int n = 0; n < N; n++) {
-    if (n % 10000 == 0)
+    if (n % 10000 == 0) {
       fprintf(stderr, " - point %d of %d\n", n, N);
+    }
 
     // Find nearest neighbors
     indices.clear();
@@ -353,16 +479,19 @@ void computeGaussianPerplexity(double* X, int N, int D,
     double sum_P;
     while (!found && iter < 200) {
       // Compute Gaussian kernel row
-      for (int m = 0; m < K; m++)
+      for (int m = 0; m < K; m++) {
         cur_P[m] = exp(-beta * distances[m + 1] * distances[m + 1]);
+      }
 
       // Compute entropy of current row
       sum_P = DBL_MIN;
-      for (int m = 0; m < K; m++)
+      for (int m = 0; m < K; m++) {
         sum_P += cur_P[m];
+      }
       double H = .0;
-      for (int m = 0; m < K; m++)
+      for (int m = 0; m < K; m++) {
         H += beta * (distances[m + 1] * distances[m + 1] * cur_P[m]);
+      }
       H = (H / sum_P) + log(sum_P);
 
       // Evaluate whether the entropy is within the tolerance level
@@ -372,16 +501,18 @@ void computeGaussianPerplexity(double* X, int N, int D,
       } else {
         if (Hdiff > 0) {
           min_beta = beta;
-          if (max_beta == DBL_MAX || max_beta == -DBL_MAX)
+          if (max_beta == DBL_MAX || max_beta == -DBL_MAX) {
             beta *= 2.0;
-          else
+          } else {
             beta = (beta + max_beta) / 2.0;
+          }
         } else {
           max_beta = beta;
-          if (min_beta == -DBL_MAX || min_beta == DBL_MAX)
+          if (min_beta == -DBL_MAX || min_beta == DBL_MAX) {
             beta /= 2.0;
-          else
+          } else {
             beta = (beta + min_beta) / 2.0;
+          }
         }
       }
 
@@ -390,8 +521,9 @@ void computeGaussianPerplexity(double* X, int N, int D,
     }
 
     // Row-normalize current row of P and store in matrix
-    for (unsigned int m = 0; m < K; m++)
+    for (unsigned int m = 0; m < K; m++) {
       cur_P[m] /= sum_P;
+    }
     for (unsigned int m = 0; m < K; m++) {
       col_P[row_P[n] + m] = indices[m + 1];
       val_P[row_P[n] + m] = cur_P[m];
@@ -401,8 +533,9 @@ void computeGaussianPerplexity(double* X, int N, int D,
 
 // Symmetrizes a sparse matrix
 void symmetrizeMatrix(vector<unsigned int>* _row_P,
-                      vector<unsigned int>* _col_P, vector<double>* _val_P,
-                      int N) {
+                      vector<unsigned int>* _col_P,
+                      vector<double>* _val_P,
+                      const int N) {
   // Get sparse matrix
   vector<unsigned int>& row_P = *_row_P;
   vector<unsigned int>& col_P = *_col_P;
@@ -411,24 +544,26 @@ void symmetrizeMatrix(vector<unsigned int>* _row_P,
   // Count number of elements and row counts of symmetric matrix
   vector<int> row_counts(N);
   for (int n = 0; n < N; n++) {
-    for (int i = row_P[n]; i < row_P[n + 1]; i++) {
+    for (unsigned int i = row_P[n]; i < row_P[n + 1]; i++) {
       // Check whether element (col_P[i], n) is present
       bool present = false;
-      for (int m = row_P[col_P[i]]; m < row_P[col_P[i] + 1]; m++) {
-        if (col_P[m] == n)
+      for (unsigned int m = row_P[col_P[i]]; m < row_P[col_P[i] + 1]; m++) {
+        if (col_P[m] == n) {
           present = true;
+        }
       }
-      if (present)
+      if (present) {
         row_counts[n]++;
-      else {
+      } else {
         row_counts[n]++;
         row_counts[col_P[i]]++;
       }
     }
   }
   int no_elem = 0;
-  for (int n = 0; n < N; n++)
+  for (int n = 0; n < N; n++) {
     no_elem += row_counts[n];
+  }
 
   // Allocate memory for symmetrized matrix
   vector<unsigned int> sym_row_P(N + 1);
@@ -437,8 +572,9 @@ void symmetrizeMatrix(vector<unsigned int>* _row_P,
 
   // Construct new row indices for symmetric matrix
   sym_row_P[0] = 0;
-  for (int n = 0; n < N; n++)
-    sym_row_P[n + 1] = sym_row_P[n] + (unsigned int)row_counts[n];
+  for (int n = 0; n < N; n++) {
+    sym_row_P[n + 1] = sym_row_P[n] + static_cast<unsigned int>(row_counts[n]);
+  }
 
   // Fill the result matrix
   vector<int> offset(N);
@@ -472,24 +608,28 @@ void symmetrizeMatrix(vector<unsigned int>* _row_P,
       // Update offsets
       if (!present || (present && n <= col_P[i])) {
         offset[n]++;
-        if (col_P[i] != n)
+        if (col_P[i] != n) {
           offset[col_P[i]]++;
+        }
       }
     }
   }
 
   // Divide the result by two
-  for (int i = 0; i < no_elem; i++)
+  for (int i = 0; i < no_elem; i++) {
     sym_val_P[i] /= 2.0;
+  }
 
   // Return symmetrized matrices
-  *_row_P = move(sym_row_P);
-  *_col_P = move(sym_col_P);
-  *_val_P = move(sym_val_P);
+  *_row_P = std::move(sym_row_P);
+  *_col_P = std::move(sym_col_P);
+  *_val_P = std::move(sym_val_P);
 }
 
 // Compute squared Euclidean distance matrix
-vector<double> computeSquaredEuclideanDistance(double* X, int N, int D) {
+vector<double> computeSquaredEuclideanDistance(const double* const X,
+                                               const int N,
+                                               const int D) {
   vector<double> DD(N * N);
   const double* XnD = X;
   for (int n = 0; n < N; ++n, XnD += D) {
@@ -512,7 +652,7 @@ vector<double> computeSquaredEuclideanDistance(double* X, int N, int D) {
 
 // Makes data zero-mean
 template <int D>
-void zeroMean(double* X, int N) {
+void zeroMean(double* const X, const int N) {
   // Compute data mean
   array<double, D> mean;
   mean.fill(0);
@@ -524,7 +664,7 @@ void zeroMean(double* X, int N) {
     nD += D;
   }
   for (int d = 0; d < D; d++) {
-    mean[d] /= (double)N;
+    mean[d] /= static_cast<double>(N);
   }
 
   // Subtract data mean
@@ -538,7 +678,7 @@ void zeroMean(double* X, int N) {
 }
 
 // Makes data zero-mean
-void zeroMean(double* X, int N, int D) {
+void zeroMean(double* const X, const int N, const int D) {
   // Compute data mean
   vector<double> mean(D);
   int nD = 0;
@@ -549,7 +689,7 @@ void zeroMean(double* X, int N, int D) {
     nD += D;
   }
   for (int d = 0; d < D; d++) {
-    mean[d] /= (double)N;
+    mean[d] /= static_cast<double>(N);
   }
 
   // Subtract data mean
@@ -564,28 +704,39 @@ void zeroMean(double* X, int N, int D) {
 
 // Generates a Gaussian random number
 double randn() {
-  double x, y, radius;
+  double x;
+  double y;
+  double radius;
   do {
-    x = 2 * (rand() / ((double)RAND_MAX + 1)) - 1;
-    y = 2 * (rand() / ((double)RAND_MAX + 1)) - 1;
+    x = 2 * (rand() / (static_cast<double>(RAND_MAX) + 1)) - 1;
+    y = 2 * (rand() / (static_cast<double>(RAND_MAX) + 1)) - 1;
     radius = (x * x) + (y * y);
   } while ((radius >= 1.0) || (radius == 0.0));
   radius = sqrt(-2 * log(radius) / radius);
   x *= radius;
-  y *= radius;
   return x;
 }
 
-// Perform t-SNE
+// Initialize t-SNE
 template <int NDIMS>
-void run(double* X, int N, int D, double* Y, double perplexity, double theta,
-         int rand_seed, bool skip_random_init, double* init, bool use_init,
-         int max_iter, int stop_lying_iter, int mom_switch_iter) {
+unique_ptr<TSNEState> make_tsne_impl(double* const X,
+                                     const int N,
+                                     const int D,
+                                     double* const Y,
+                                     const double perplexity,
+                                     const double theta,
+                                     const int rand_seed,
+                                     const bool skip_random_init,
+                                     const double* const init,
+                                     const bool use_init,
+                                     const int max_iter,
+                                     const int stop_lying_iter,
+                                     const int mom_switch_iter) {
   // Set random seed
-  if (skip_random_init != true) {
+  if (!skip_random_init) {
     if (rand_seed >= 0) {
       fprintf(stderr, "Using random seed: %d\n", rand_seed);
-      srand((unsigned int)rand_seed);
+      srand(static_cast<unsigned int>(rand_seed));
     } else {
       fprintf(stderr, "Using current time as random seed...\n");
       srand(time(nullptr));
@@ -597,19 +748,21 @@ void run(double* X, int N, int D, double* Y, double perplexity, double theta,
     fprintf(stderr, "Perplexity too large for the number of data points!\n");
     exit(1);
   }
-  fprintf(stderr, "Using no_dims = %d, perplexity = %f, and theta = %f\n",
-          NDIMS, perplexity, theta);
-  bool exact = (theta == .0) ? true : false;
+  fprintf(stderr,
+          "Using no_dims = %d, perplexity = %f, and theta = %f\n",
+          NDIMS,
+          perplexity,
+          theta);
+  bool exact = theta == .0;
 
   fprintf(stderr,
           "Using max_iter = %d, stop_lying_iter = %d, mom_switch_iter = %d\n",
-          max_iter, stop_lying_iter, mom_switch_iter);
+          max_iter,
+          stop_lying_iter,
+          mom_switch_iter);
 
-  // Set learning parameters
-  float total_time = .0;
-  clock_t start, end;
-  double momentum = .5, final_momentum = .8;
-  double eta = 200.0;
+  clock_t start;
+  clock_t end;
 
   // Allocate some memory
   vector<double> dY(N * NDIMS);
@@ -623,11 +776,13 @@ void run(double* X, int N, int D, double* Y, double perplexity, double theta,
   double max_X = .0;
   for (int i = 0; i < N * D; i++) {
     auto ax = abs(X[i]);
-    if (ax > max_X)
+    if (ax > max_X) {
       max_X = ax;
+    }
   }
-  for (int i = 0; i < N * D; i++)
+  for (int i = 0; i < N * D; i++) {
     X[i] /= max_X;
+  }
 
   // Compute input similarities for exact t-SNE
   vector<double> P;
@@ -653,34 +808,46 @@ void run(double* X, int N, int D, double* Y, double perplexity, double theta,
       nN += N;
     }
     double sum_P = .0;
-    for (int i = 0; i < N * N; i++)
+    for (int i = 0; i < N * N; i++) {
       sum_P += P[i];
-    for (int i = 0; i < N * N; i++)
+    }
+    for (int i = 0; i < N * N; i++) {
       P[i] /= sum_P;
+    }
   } else {
     // Compute input similarities for approximate t-SNE
 
     // Compute asymmetric pairwise input similarities
-    computeGaussianPerplexity(X, N, D, &row_P, &col_P, &val_P, perplexity,
-                              (int)(3 * perplexity));
+    computeGaussianPerplexity(X,
+                              N,
+                              D,
+                              &row_P,
+                              &col_P,
+                              &val_P,
+                              perplexity,
+                              static_cast<int>(3 * perplexity));
 
     // Symmetrize input similarities
     symmetrizeMatrix(&row_P, &col_P, &val_P, N);
     double sum_P = .0;
-    for (int i = 0; i < row_P[N]; i++)
+    for (int i = 0; i < row_P[N]; i++) {
       sum_P += val_P[i];
-    for (int i = 0; i < row_P[N]; i++)
+    }
+    for (int i = 0; i < row_P[N]; i++) {
       val_P[i] /= sum_P;
+    }
   }
   end = clock();
 
   // Lie about the P-values
   if (exact) {
-    for (int i = 0; i < N * N; i++)
+    for (int i = 0; i < N * N; i++) {
       P[i] *= 12.0;
+    }
   } else {
-    for (int i = 0; i < row_P[N]; i++)
+    for (int i = 0; i < row_P[N]; i++) {
       val_P[i] *= 12.0;
+    }
   }
 
   // Initialize solution (randomly or with given coordinates)
@@ -695,39 +862,80 @@ void run(double* X, int N, int D, double* Y, double perplexity, double theta,
   }
 
   // Perform main training loop
-  if (exact)
+  if (exact) {
     fprintf(stderr,
             "Input similarities computed in %4.2f seconds!\nLearning "
             "embedding...\n",
-            (float)(end - start) / CLOCKS_PER_SEC);
-  else
+            static_cast<float>(end - start) / CLOCKS_PER_SEC);
+  } else {
     fprintf(stderr,
             "Input similarities computed in %4.2f seconds (sparsity = "
             "%f)!\nLearning embedding...\n",
-            (float)(end - start) / CLOCKS_PER_SEC,
-            (double)row_P[N] / ((double)N * (double)N));
-  start = clock();
+            static_cast<float>(end - start) / CLOCKS_PER_SEC,
+            static_cast<double>(row_P[N]) /
+                (static_cast<double>(N) * static_cast<double>(N)));
+  }
 
-  for (int iter = 0; iter < max_iter; iter++) {
+  return std::make_unique<TSNEState>(N,
+                                     Y,
+                                     NDIMS,
+                                     theta,
+                                     max_iter,
+                                     stop_lying_iter,
+                                     mom_switch_iter,
+                                     0,
+                                     .0,
+                                     .0,
+                                     std::move(P),
+                                     std::move(row_P),
+                                     std::move(col_P),
+                                     std::move(val_P),
+                                     std::move(dY),
+                                     std::move(uY),
+                                     std::move(gains));
+}
+
+// Optimize t-SNE
+template <int NDIMS>
+bool TSNEState::step_by_impl(const int step) {
+  // Set learning parameters
+  double momentum = .5;
+  double final_momentum = .8;
+  double eta = 200.0;
+
+  // Extract state
+  bool exact = theta == .0;
+  clock_t start = clock();
+  clock_t end;
+  float elapsed;
+  int iter_until = std::min(iter + step, max_iter);
+
+  for (; iter < iter_until; iter++) {
     // Compute (approximate) gradient
-    if (exact)
-      computeExactGradient<NDIMS>(P, Y, N, dY);
-    else
-      computeGradient<NDIMS>(row_P, col_P, val_P, Y, N, dY, theta);
+    if (exact) {
+      computeExactGradient<NDIMS>(P, Y, N, &dY);
+    } else {
+      computeGradient<NDIMS>(row_P, col_P, val_P, Y, N, &dY, theta);
+    }
 
     // Update gains
-    for (int i = 0; i < N * NDIMS; i++)
+    for (int i = 0; i < N * NDIMS; i++) {
       gains[i] =
           (sign(dY[i]) != sign(uY[i])) ? (gains[i] + .2) : (gains[i] * .8);
-    for (int i = 0; i < N * NDIMS; i++)
-      if (gains[i] < .01)
+    }
+    for (int i = 0; i < N * NDIMS; i++) {
+      if (gains[i] < .01) {
         gains[i] = .01;
+      }
+    }
 
     // Perform gradient update (with momentum and gains)
-    for (int i = 0; i < N * NDIMS; i++)
+    for (int i = 0; i < N * NDIMS; i++) {
       uY[i] = momentum * uY[i] - eta * gains[i] * dY[i];
-    for (int i = 0; i < N * NDIMS; i++)
+    }
+    for (int i = 0; i < N * NDIMS; i++) {
       Y[i] = Y[i] + uY[i];
+    }
 
     // Make solution zero-mean
     zeroMean<NDIMS>(Y, N);
@@ -735,15 +943,18 @@ void run(double* X, int N, int D, double* Y, double perplexity, double theta,
     // Stop lying about the P-values after a while, and switch momentum
     if (iter == stop_lying_iter) {
       if (exact) {
-        for (int i = 0; i < N * N; i++)
+        for (int i = 0; i < N * N; i++) {
           P[i] /= 12.0;
+        }
       } else {
-        for (int i = 0; i < row_P[N]; i++)
+        for (int i = 0; i < row_P[N]; i++) {
           val_P[i] /= 12.0;
+        }
       }
     }
-    if (iter == mom_switch_iter)
+    if (iter == mom_switch_iter) {
       momentum = final_momentum;
+    }
 
     // Print out progress
     if (iter > 0 && (iter % 50 == 0 || iter == max_iter - 1)) {
@@ -755,46 +966,172 @@ void run(double* X, int N, int D, double* Y, double perplexity, double theta,
         // doing approximate computation here!
         C = evaluateError<NDIMS>(row_P, col_P, val_P, Y, N, theta);
       }
-      if (iter == 0)
+      if (iter == 0) {
         fprintf(stderr, "Iteration %d: error is %f\n", iter + 1, C);
-      else {
-        total_time += (float)(end - start) / CLOCKS_PER_SEC;
+      } else {
+        elapsed = static_cast<float>(end - start) / CLOCKS_PER_SEC;
+        total_time += elapsed;
+        residual_time += elapsed;
         fprintf(stderr,
                 "Iteration %d: error is %f (50 iterations in %4.2f seconds)\n",
-                iter, C, (float)(end - start) / CLOCKS_PER_SEC);
+                iter,
+                C,
+                residual_time);
+        residual_time = 0;
       }
       start = clock();
     }
   }
   end = clock();
-  total_time += (float)(end - start) / CLOCKS_PER_SEC;
+  elapsed = static_cast<float>(end - start) / CLOCKS_PER_SEC;
+  total_time += elapsed;
+  residual_time += elapsed;
 
-  fprintf(stderr, "Fitting performed in %4.2f seconds.\n", total_time);
+  if (iter >= max_iter) {
+    fprintf(stderr, "Fitting performed in %4.2f seconds.\n", total_time);
+    return true;
+  }
+  return false;
 }
 
-}  // namespace
-
-extern "C" {
+unique_ptr<TSNEState> make_tsne(double* const X,
+                                const int N,
+                                const int D,
+                                double* const Y,
+                                const int no_dims,
+                                const double perplexity,
+                                const double theta,
+                                const int rand_seed,
+                                const bool skip_random_init,
+                                const double* const init,
+                                const bool use_init,
+                                const int max_iter,
+                                const int stop_lying_iter,
+                                const int mom_switch_iter) {
+  switch (no_dims) {
+    case 2:
+      return make_tsne_impl<2>(X,
+                               N,
+                               D,
+                               Y,
+                               perplexity,
+                               theta,
+                               rand_seed,
+                               skip_random_init,
+                               init,
+                               use_init,
+                               max_iter,
+                               stop_lying_iter,
+                               mom_switch_iter);
+    case 3:
+      return make_tsne_impl<3>(X,
+                               N,
+                               D,
+                               Y,
+                               perplexity,
+                               theta,
+                               rand_seed,
+                               skip_random_init,
+                               init,
+                               use_init,
+                               max_iter,
+                               stop_lying_iter,
+                               mom_switch_iter);
+    default:
+      throw "unsupported dimension";
+  }
+}
 
-void DLL_PUBLIC run(double* X, int N, int D, double* Y, int no_dims,
-                    double perplexity, double theta, int rand_seed,
-                    bool skip_random_init, double* init, bool use_init,
-                    int max_iter, int stop_lying_iter, int mom_switch_iter) {
-  assert(no_dims == 2 || no_dims == 3);
+bool TSNEState::step_by(int step) {
   switch (no_dims) {
     case 2:
-      run<2>(X, N, D, Y, perplexity, theta, rand_seed, skip_random_init, init,
-             use_init, max_iter, stop_lying_iter, mom_switch_iter);
-      break;
+      return step_by_impl<2>(step);
     case 3:
-      run<3>(X, N, D, Y, perplexity, theta, rand_seed, skip_random_init, init,
-             use_init, max_iter, stop_lying_iter, mom_switch_iter);
-      break;
+      return step_by_impl<3>(step);
     default:
-      throw "invalid dimension";
+      throw "unsupported dimension";
+  }
+}
+
+}  // namespace
+
+extern "C" {
+DLL_PUBLIC struct TSNE* init_tsne(double* const X,
+                                  const int N,
+                                  const int D,
+                                  double* const Y,
+                                  const int no_dims,
+                                  const double perplexity,
+                                  const double theta,
+                                  const int rand_seed,
+                                  const bool skip_random_init,
+                                  const double* const init,
+                                  const bool use_init,
+                                  const int max_iter,
+                                  const int stop_lying_iter,
+                                  const int mom_switch_iter) {
+  assert(no_dims == 2 || no_dims == 3);
+  auto tsne = make_tsne(X,
+                        N,
+                        D,
+                        Y,
+                        no_dims,
+                        perplexity,
+                        theta,
+                        rand_seed,
+                        skip_random_init,
+                        init,
+                        use_init,
+                        max_iter,
+                        stop_lying_iter,
+                        mom_switch_iter);
+  return reinterpret_cast<TSNE*>(tsne.release());
+}
+
+DLL_PUBLIC bool step_tsne_by(struct TSNE* const tsne, const int step) {
+  return reinterpret_cast<TSNEState*>(tsne)->step_by(step);
+}
+
+DLL_PUBLIC void free_tsne(struct TSNE* const tsne) {
+  if (tsne == nullptr) {
+    return;
   }
+  delete reinterpret_cast<TSNEState*>(tsne);
+}
+
+DLL_PUBLIC void run(double* const X,
+                    const int N,
+                    const int D,
+                    double* const Y,
+                    const int no_dims,
+                    const double perplexity,
+                    const double theta,
+                    const int rand_seed,
+                    const bool skip_random_init,
+                    const double* const init,
+                    const bool use_init,
+                    const int max_iter,
+                    const int stop_lying_iter,
+                    const int mom_switch_iter) {
+  auto tsne = make_tsne(X,
+                        N,
+                        D,
+                        Y,
+                        no_dims,
+                        perplexity,
+                        theta,
+                        rand_seed,
+                        skip_random_init,
+                        init,
+                        use_init,
+                        max_iter,
+                        stop_lying_iter,
+                        mom_switch_iter);
+  tsne->step_by(max_iter);
 }
 
 }  // extern "C"
 
+#ifdef __GNUC__
 #pragma GCC visibility pop
+#endif
diff --git a/tsne/bh_sne_src/tsne.h b/tsne/bh_sne_src/tsne.h
index 6ec8e05..8ed5462 100644
--- a/tsne/bh_sne_src/tsne.h
+++ b/tsne/bh_sne_src/tsne.h
@@ -35,26 +35,64 @@
 #define TSNE_H
 
 #if defined _WIN32 || defined __CYGWIN__
+#ifdef BUILDING_TSNE_DLL
 #ifdef __GNUC__
+// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
 #define DLL_PUBLIC __attribute__((dllexport))
 #else
+// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
 #define DLL_PUBLIC __declspec(dllexport)
-#endif
+#endif  // __GNUC__
+#else
+#ifdef __GNUC__
+#define DLL_PUBLIC __attribute__ ((dllimport))
+#else
+#define DLL_PUBLIC __declspec(dllimport)
+#endif  // __GNUC__
+#endif  // BUILDING_TSNE_DLL
 #else
 #if __GNUC__ >= 4
+// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
 #define DLL_PUBLIC __attribute__((visibility("default")))
 #else
 #define DLL_PUBLIC
-#endif
-#endif
+#endif  // __GNUC__
+#endif  // defined _WIN32 || defined __CYGWIN__
 
 extern "C" {
-
-void DLL_PUBLIC run(double* X, int N, int D, double* Y, int no_dims,
-                    double perplexity, double theta, int rand_seed,
-                    bool skip_random_init, double* init, bool use_init,
-                    int max_iter = 1000, int stop_lying_iter = 250,
+// stateless t-SNE
+DLL_PUBLIC void run(double* X,
+                    int N,
+                    int D,
+                    double* Y,
+                    int no_dims,
+                    double perplexity,
+                    double theta,
+                    int rand_seed,
+                    bool skip_random_init,
+                    const double* init,
+                    bool use_init,
+                    int max_iter = 1000,
+                    int stop_lying_iter = 250,
                     int mom_switch_iter = 250);
+// stateful t-SNE
+struct TSNE;
+DLL_PUBLIC struct TSNE* init_tsne(double* X,
+                                  int N,
+                                  int D,
+                                  double* Y,
+                                  int no_dims,
+                                  double perplexity,
+                                  double theta,
+                                  int rand_seed,
+                                  bool skip_random_init,
+                                  const double* init,
+                                  bool use_init,
+                                  int max_iter = 1000,
+                                  int stop_lying_iter = 250,
+                                  int mom_switch_iter = 250);
+DLL_PUBLIC bool step_tsne_by(struct TSNE* tsne, int step);
+DLL_PUBLIC void free_tsne(struct TSNE* tsne);
 }
 
-#endif
+#endif  // !defined(TSNE_H)
diff --git a/tsne/bh_sne_src/vptree.h b/tsne/bh_sne_src/vptree.h
index c9387ce..ec50da0 100644
--- a/tsne/bh_sne_src/vptree.h
+++ b/tsne/bh_sne_src/vptree.h
@@ -45,22 +45,27 @@
 #include <queue>
 #include <vector>
 
+#ifdef __GNUC__
 #pragma GCC visibility push(hidden)
+#endif
 
 class euclidean_distance final {
-  const int _D;
-  const double* data;
+  const int D;
+  const double* const data;
 
  public:
-  explicit euclidean_distance(int D, const double* data) : _D(D), data(data) {
+  explicit euclidean_distance(int D, const double* data) : D(D), data(data) {
   }
-  euclidean_distance(euclidean_distance&&) = default;
   euclidean_distance(const euclidean_distance&) = default;
+  euclidean_distance(euclidean_distance&&) = delete;
+  euclidean_distance& operator=(const euclidean_distance&) = delete;
+  euclidean_distance& operator=(euclidean_distance&&) = delete;
+  ~euclidean_distance() = default;
   double operator()(const unsigned int t1, const unsigned int t2) const {
     double dd = .0;
-    const double* x1 = data+t1*_D;
-    const double* x2 = data+t2*_D;
-    for (int d = 0; d < _D; d++) {
+    const double* x1 = data + t1 * D;
+    const double* x2 = data + t2 * D;
+    for (int d = 0; d < D; d++) {
       double diff = (x1[d] - x2[d]);
       dd += diff * diff;
     }
@@ -74,6 +79,11 @@ class VpTree {
   // Default constructor
   explicit VpTree(Distance&& distance) : distance(distance){};
 
+  VpTree(const VpTree<T, Distance>&) = delete;
+  VpTree(VpTree<T, Distance>&&) noexcept = default;
+  VpTree& operator=(const VpTree<T, Distance>&) = delete;
+  VpTree& operator=(VpTree<T, Distance>&&) noexcept = default;
+
   // Destructor
   ~VpTree() = default;
 
@@ -84,7 +94,9 @@ class VpTree {
   }
 
   // Function that uses the tree to find the k nearest neighbors of target
-  void search(const T& target, int k, std::vector<T>* results,
+  void search(const T& target,
+              int k,
+              std::vector<T>* results,
               std::vector<double>* distances) {
     // Use a priority queue to store intermediate results on
     std::priority_queue<HeapItem> heap;
@@ -117,13 +129,16 @@ class VpTree {
   // Single node of a VP tree (has a point and radius; left children are closer
   // to point than the radius)
   struct Node {
-    int index;                    // index of point in node
-    double threshold;             // radius(?)
+    int index = 0;                // index of point in node
+    double threshold = 0;         // radius(?)
     std::unique_ptr<Node> left;   // points closer by than threshold
     std::unique_ptr<Node> right;  // points farther away than threshold
 
-    Node() : index(0), threshold(0.) {
-    }
+    Node() = default;
+    Node(Node&&) noexcept = default;
+    Node(const Node&) = delete;
+    Node& operator=(Node&&) noexcept = default;
+    Node& operator=(const Node&) = delete;
 
     ~Node() = default;
   };
@@ -153,17 +168,21 @@ class VpTree {
     if (upper - lower > 1) {  // if we did not arrive at leaf yet
 
       // Choose an arbitrary point and move it to the start
-      int i = (int)((double)rand() / RAND_MAX * (upper - lower - 1)) + lower;
+      int i = static_cast<int>(static_cast<double>(rand()) / RAND_MAX *
+                               (upper - lower - 1)) +
+              lower;
       std::swap(_items[lower], _items[i]);
 
       // Partition around the median distance
       int median = (upper + lower) / 2;
       auto& lower_item = _items[lower];
-      std::nth_element(
-          _items.begin() + lower + 1, _items.begin() + median,
-          _items.begin() + upper, [this, lower_item](auto& x, auto& y) {
-            return distance(lower_item, x) < distance(lower_item, y);
-          });
+      std::nth_element(_items.begin() + lower + 1,
+                       _items.begin() + median,
+                       _items.begin() + upper,
+                       [this, lower_item](auto& x, auto& y) {
+                         return distance(lower_item, x) <
+                                distance(lower_item, y);
+                       });
 
       // Threshold of the new node will be the distance to the median
       node->threshold = distance(lower_item, _items[median]);
@@ -179,10 +198,13 @@ class VpTree {
   }
 
   // Helper function that searches the tree
-  void search(const Node* node, const T& target, int k,
+  void search(const Node* node,
+              const T& target,
+              int k,
               std::priority_queue<HeapItem>& heap) {
-    if (node == nullptr)
+    if (node == nullptr) {
       return;  // indicates that we're done here
+    }
 
     // Compute distance between target and current node
     double dist = distance(_items[node->index], target);
@@ -236,10 +258,10 @@ class VpTree {
       }
     }
   }
-
-  VpTree(const VpTree<T, Distance>&) = delete;
-  VpTree& operator=(const VpTree<T, Distance>&) = delete;
 };
 
+#ifdef __GNUC__
 #pragma GCC visibility pop
+#endif
+
 #endif  // !defined(VPTREE_H)
diff --git a/tsne/tests/test_stability.npy b/tsne/tests/test_stability.npy
index 25cd4de..e080c6f 100644
Binary files a/tsne/tests/test_stability.npy and b/tsne/tests/test_stability.npy differ