diff --git a/milk/tests/test_kmeans.py b/milk/tests/test_kmeans.py
index 4edbc02..38e55b3 100644
--- a/milk/tests/test_kmeans.py
+++ b/milk/tests/test_kmeans.py
@@ -54,6 +54,6 @@ def test_kmeans_return_partial():
     assignments,centroids = milk.unsupervised.kmeans(features, 2, R=129)
     centroids_ = milk.unsupervised.kmeans(features, 2, R=129, return_assignments=False)
     assignments_ = milk.unsupervised.kmeans(features, 2, R=129, return_centroids=False)
-    assert np.all(centroids == centroids_)
     assert np.all(assignments == assignments_)
+    assert np.allclose(centroids, centroids_)
diff --git a/milk/unsupervised/_kmeans.cpp b/milk/unsupervised/_kmeans.cpp
index 1902159..b6939e4 100644
--- a/milk/unsupervised/_kmeans.cpp
+++ b/milk/unsupervised/_kmeans.cpp
@@ -20,8 +20,79 @@ void assert_type_contiguous(PyArrayObject* array,int type) {
 }
 
 template<typename ftype>
-int computecentroids(ftype* f, ftype* centroids, PyArrayObject* a_assignments, PyArrayObject* a_counts, const int N, const int Nf, const int k) {
+bool assignclass_euclidian(ftype* f, ftype* centroids, PyArrayObject* a_assignments, const int N, const int Nf, const int k) {
+    Py_BEGIN_ALLOW_THREADS
+    npy_int32* assignments = static_cast<npy_int32*>(PyArray_DATA(a_assignments));
+
+    #pragma omp parallel for schedule(static)
+    for (int i=0; i < N; i++) {
+        int best_cluster = -1;
+        ftype min_distance = 0.0;
+        for (int c=0; c < k; c++) {
+            ftype cur_distance = 0.0;
+            for (int j=0; j < Nf; j++) {
+                ftype term = (f[i * Nf + j] - centroids[c * Nf + j]);
+                cur_distance += term * term;
+            }
+            if (best_cluster == -1 || cur_distance < min_distance) {
+                min_distance = cur_distance;
+                best_cluster = c;
+            }
+        }
+        assignments[i] = best_cluster;
+    }
+    Py_END_ALLOW_THREADS
+    return true;
+}
+
+PyObject* py_assignclass_euclidian(PyObject* self, PyObject* args) {
+    try {
+        PyArrayObject* fmatrix;
+        PyArrayObject* centroids;
+        PyArrayObject* assignments;
+        if (!PyArg_ParseTuple(args, "OOO", &fmatrix, &centroids, &assignments)) { throw Kmeans_Exception("Wrong number of arguments for assignclass_euclidian."); }
+        if (!PyArray_Check(fmatrix) || !PyArray_ISCONTIGUOUS(fmatrix)) throw Kmeans_Exception("fmatrix not what was expected.");
+        if (!PyArray_Check(centroids) || !PyArray_ISCONTIGUOUS(centroids)) throw Kmeans_Exception("centroids not what was expected.");
+        if (!PyArray_Check(assignments) || !PyArray_ISCONTIGUOUS(assignments)) throw Kmeans_Exception("assignments not what was expected.");
+        if (PyArray_TYPE(assignments) != NPY_INT32) throw Kmeans_Exception("assignments should be int32.");
+        if (PyArray_TYPE(fmatrix) != PyArray_TYPE(centroids)) throw Kmeans_Exception("centroids and fmatrix should have same type.");
+        if (PyArray_NDIM(fmatrix) != 2) throw Kmeans_Exception("fmatrix should be two dimensional");
+        if (PyArray_NDIM(centroids) != 2) throw Kmeans_Exception("centroids should be two dimensional");
+        if (PyArray_NDIM(assignments) != 1) throw Kmeans_Exception("assignments should be one dimensional");
+
+        const int N = PyArray_DIM(fmatrix, 0);
+        const int Nf = PyArray_DIM(fmatrix, 1);
+        const int k = PyArray_DIM(centroids, 0);
+        if (PyArray_DIM(centroids, 1) != Nf) throw Kmeans_Exception("centroids has wrong number of features.");
+        if (PyArray_DIM(assignments, 0) != N) throw Kmeans_Exception("assignments has wrong size.");
+        switch (PyArray_TYPE(fmatrix)) {
+#define TRY_TYPE_ASSIGN(code, ftype) \
+    case code: \
+        if (assignclass_euclidian<ftype>( \
+            static_cast<ftype*>(PyArray_DATA(fmatrix)), \
+            static_cast<ftype*>(PyArray_DATA(centroids)), \
+            assignments, \
+            N, Nf, k)) { \
+            Py_RETURN_TRUE; \
+        } \
+        Py_RETURN_FALSE;
+
+        TRY_TYPE_ASSIGN(NPY_FLOAT, float);
+        TRY_TYPE_ASSIGN(NPY_DOUBLE, double);
+        }
+        throw Kmeans_Exception("Cannot handle this type.");
+    } catch (const Kmeans_Exception& exc) {
+        PyErr_SetString(PyExc_RuntimeError,exc.msg);
+        return 0;
+    } catch (...) {
+        PyErr_SetString(PyExc_RuntimeError,"Some sort of exception in assignclass_euclidian.");
+        return 0;
+    }
+}
+
+template<typename ftype>
+int computecentroids(ftype* f, ftype* centroids, PyArrayObject* a_assignments, PyArrayObject* a_counts, const int N, const int Nf, const int k) {
     int zero_counts = 0;
     Py_BEGIN_ALLOW_THREADS
     const npy_int32* assignments = static_cast<npy_int32*>(PyArray_DATA(a_assignments));
@@ -30,16 +101,42 @@ int computecentroids(ftype* f, ftype* centroids, PyArrayObject* a_assignments, P
     std::fill(counts, counts + k, 0);
     std::fill(centroids, centroids + k*Nf, 0.0);
 
-    for (int i = 0; i != N; i++){
-        const int c = assignments[i];
-        if (c >= k) continue; // throw Kmeans_Exception("wrong value in assignments");
-        ftype* ck = centroids + Nf*c;
-        for (int j = 0; j != Nf; ++j) {
-            *ck++ += *f++;
+    #pragma omp parallel shared(assignments, counts, centroids, f)
+    {
+        int *local_counts = (int*) malloc(k * sizeof(int));
+        ftype *local_ck = (ftype*) malloc(k * Nf * sizeof(ftype));
+        std::fill(local_counts, local_counts + k, 0);
+        std::fill(local_ck, local_ck + k*Nf, ftype());
+
+        #pragma omp for schedule(static)
+        for (int i = 0; i < N; i++){
+            const int c = assignments[i];
+            if (c >= k) continue; // throw Kmeans_Exception("wrong value in assignments");
+            ftype* lck = local_ck + Nf*c;
+            for (int j = 0; j != Nf; ++j) {
+                *lck++ += f[i*Nf + j];
+            }
+            ++local_counts[c];
         }
-        ++counts[c];
+
+        ftype* ck = centroids;
+        ftype* lck = local_ck;
+        for (int c=0; c < k; c++) {
+            #pragma omp critical(reduce)
+            {
+                for (int j = 0; j != Nf; ++j) {
+                    *ck++ += *lck++;
+                }
+                counts[c] += local_counts[c];
+            }
+        }
+
+        free(local_counts);
+        free(local_ck);
     }
-    for (int i = 0; i != k; ++i) {
+
+    #pragma omp parallel for reduction(+:zero_counts) schedule(static) shared(counts,centroids)
+    for (int i = 0; i < k; ++i) {
         ftype* ck = centroids + Nf*i;
         const ftype c = counts[i];
         if (c == 0) {
@@ -80,7 +177,7 @@ PyObject* py_computecentroids(PyObject* self, PyObject* args) {
         if (PyArray_DIM(assignments, 0) != N) throw Kmeans_Exception("assignments has wrong size.");
         if (PyArray_DIM(counts, 0) != k) throw Kmeans_Exception("counts has wrong size.");
         switch (PyArray_TYPE(fmatrix)) {
-#define TRY_TYPE(code, ftype) \
+#define TRY_TYPE_CENTROIDS(code, ftype) \
     case code: \
         if (computecentroids<ftype>( \
             static_cast<ftype*>(PyArray_DATA(fmatrix)), \
             static_cast<ftype*>(PyArray_DATA(centroids)), \
             assignments, \
@@ -92,8 +189,8 @@ PyObject* py_computecentroids(PyObject* self, PyObject* args) {
         } \
         Py_RETURN_FALSE;
 
-        TRY_TYPE(NPY_FLOAT, float);
-        TRY_TYPE(NPY_DOUBLE, double);
+        TRY_TYPE_CENTROIDS(NPY_FLOAT, float);
+        TRY_TYPE_CENTROIDS(NPY_DOUBLE, double);
         }
         throw Kmeans_Exception("Cannot handle this type.");
     } catch (const Kmeans_Exception& exc) {
@@ -107,6 +204,7 @@ PyObject* py_computecentroids(PyObject* self, PyObject* args) {
 
 PyMethodDef methods[] = {
   {"computecentroids", py_computecentroids, METH_VARARGS , "Do NOT call directly.\n" },
+  {"assignclass_euclidian", py_assignclass_euclidian, METH_VARARGS , "Do NOT call directly.\n" },
   {NULL, NULL,0,NULL},
 };
diff --git a/milk/unsupervised/kmeans.py b/milk/unsupervised/kmeans.py
index d8bf270..a1324a9 100644
--- a/milk/unsupervised/kmeans.py
+++ b/milk/unsupervised/kmeans.py
@@ -219,13 +219,13 @@ def kmeans(fmatrix, k, distance='euclidean', max_iter=1000, R=None, return_assig
         fmatrix = zscore(fmatrix)
         distance = 'euclidean'
     if distance == 'euclidean':
-        def distfunction(fmatrix, cs, dists):
-            dists = _dot3(fmatrix, (-2)*cs.T, dists)
+        def distfunction(fmatrix, cs, _):
+            dists = _dot3(fmatrix, (-2)*cs.T, None)
             dists += np.array([np.dot(c,c) for c in cs])
             # For a distance, we'd need to add the fmatrix**2 components, but
             # it doesn't matter because we are going to perform an argmin() on
             # the result.
-            return dists
+            return dists.argmin(1)
     elif distance == 'mahalanobis':
         icov = kwargs.get('icov', None)
         if icov is None:
@@ -234,13 +234,18 @@ def distfunction(fmatrix, cs, dists):
             covmat = np.cov(fmatrix.T)
             icov = linalg.inv(covmat)
         def distfunction(fmatrix, cs, _):
-            return np.array([_mahalanobis2(fmatrix, c, icov) for c in cs]).T
+            dists = np.array([_mahalanobis2(fmatrix, c, icov) for c in cs]).T
+            return dists.argmin(1)
     else:
         raise ValueError('milk.unsupervised.kmeans: `distance` argument unknown (%s)' % distance)
     if k < 2:
         raise ValueError('milk.unsupervised.kmeans `k` should be >= 2.')
     if fmatrix.dtype in (np.float32, np.float64) and fmatrix.flags['C_CONTIGUOUS']:
         computecentroids = _kmeans.computecentroids
+        if distance == 'euclidean':
+            def distfunction(fmatrix, centroids, assignments):
+                _kmeans.assignclass_euclidian(fmatrix, centroids, assignments)
+                return assignments
     else:
         computecentroids = _pycomputecentroids
@@ -256,19 +261,16 @@ def distfunction(fmatrix, cs, _):
     prev = np.zeros(len(fmatrix), np.int32)
     counts = np.empty(k, np.int32)
-    dists = None
+    assignments = np.empty((fmatrix.shape[0],), dtype=np.int32)
     for i in xrange(max_iter):
-        dists = distfunction(fmatrix, centroids, dists)
-        assignments = dists.argmin(1)
+        assignments = distfunction(fmatrix, centroids, assignments)
         if np.all(assignments == prev):
             break
-        if computecentroids(fmatrix, centroids, assignments.astype(np.int32), counts):
+        if computecentroids(fmatrix, centroids, assignments.astype(np.int32, copy=False), counts):
            (empty,) = np.where(counts == 0)
            centroids = np.delete(centroids, empty, axis=0)
            k = len(centroids)
            counts = np.empty(k, np.int32)
-            # This will cause new matrices to be allocated in the next iteration
-            dists = None
         prev[:] = assignments
     if return_centroids and return_assignments:
         return assignments, centroids
diff --git a/setup.py b/setup.py
index 911eeac..7e0b1d2 100644
--- a/setup.py
+++ b/setup.py
@@ -57,7 +57,8 @@
     'milk.supervised._lasso' : ['milk/supervised/_lasso.cpp'],
 }
 
-compiler_args = ['-std=c++0x']
+compiler_args = ['-std=c++0x', '-fopenmp']
+link_args = ['-lgomp']
 if platform.system() == 'Darwin':
     compiler_args.append('-stdlib=libc++')
@@ -67,6 +68,7 @@
             undef_macros=undef_macros,
             define_macros=define_macros,
             extra_compile_args=compiler_args,
+            extra_link_args=link_args,
             ) for key,sources in _extensions.items()
 ]
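
Usage sketch (not part of the patch): the new OpenMP-backed assignment step is only taken for C-contiguous float32/float64 feature matrices with distance='euclidean', per the kmeans.py change above. The feature array below is made-up illustration data; the call pattern mirrors test_kmeans_return_partial().

import numpy as np
import milk.unsupervised

# Made-up data: 100 samples, 5 features. np.random.rand returns a
# C-contiguous float64 array, so the compiled _kmeans helpers
# (computecentroids / assignclass_euclidian) are exercised.
features = np.ascontiguousarray(np.random.rand(100, 5))

# Same R seed gives reproducible results across the three calls.
assignments, centroids = milk.unsupervised.kmeans(features, 2, R=129)
centroids_ = milk.unsupervised.kmeans(features, 2, R=129, return_assignments=False)
assignments_ = milk.unsupervised.kmeans(features, 2, R=129, return_centroids=False)

assert np.all(assignments == assignments_)
assert np.allclose(centroids, centroids_)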