From e20a94ddab478c8698a5fb534f2949eb80d22446 Mon Sep 17 00:00:00 2001
From: ARF <none@none>
Date: Fri, 6 Mar 2015 11:34:55 +0100
Subject: [PATCH 1/2] performance improvements to _factorize_str_helper

Speedup ca. x1.5 cf. master branch on my machine with compressed bcolz
Approximate contributions:
 - 1/3: direct indexing into arrays using typed memoryviews
 - 1/3: substitution of reverse dict with std::vector objects
 - 1/3: use of nested `with nogil`, `with gil` construct
---
 bquery/ctable_ext.pyx | 107 +++++++++++++++++++++++++++++-------------
 bquery/khash.pxd      |   4 +-
 setup.py              |   3 +-
 3 files changed, 78 insertions(+), 36 deletions(-)
diff --git a/bquery/ctable_ext.pyx b/bquery/ctable_ext.pyx
index 1e4becb..daf2a35 100644
--- a/bquery/ctable_ext.pyx
+++ b/bquery/ctable_ext.pyx
@@ -1,8 +1,9 @@
 import numpy as np
 import cython
-from numpy cimport ndarray, dtype, npy_intp, npy_int32, npy_uint64, npy_int64, npy_float64
-from libc.stdlib cimport malloc
-from libc.string cimport strcpy
+from numpy cimport ndarray, dtype, npy_intp, npy_int32, npy_uint64, npy_int64, npy_float64, uint64_t
+from libc.stdlib cimport malloc, calloc, free
+from libc.string cimport strcpy, memcpy
+from libcpp.vector cimport vector
 from khash cimport *
 from bcolz.carray_ext cimport carray, chunk
 
@@ -31,12 +32,13 @@ DEF _SORTED_COUNT_DISTINCT = 4
 @cython.boundscheck(False)
 cdef void _factorize_str_helper(Py_ssize_t iter_range,
                        Py_ssize_t allocation_size,
-                       ndarray in_buffer,
-                       ndarray[npy_uint64] out_buffer,
+                       char[:, :] in_buffer,
+                       uint64_t[:] out_buffer,
                        kh_str_t *table,
                        Py_ssize_t * count,
-                       dict reverse,
-                       ):
+                       vector[Py_ssize_t] & reverse_keys,
+                       vector[char *] & reverse_values,
+                       ) nogil:
     cdef:
         Py_ssize_t i, idx
         int ret
@@ -45,10 +47,16 @@ cdef void _factorize_str_helper(Py_ssize_t iter_range,
         khiter_t k
 
     ret = 0
+    # allocate enough memory to hold the string element, add one for the
+    # null byte that marks the end of the string.
+    # TODO: understand why zero-filling is necessary. Without zero-filling
+    # the buffer, duplicate keys occur in the reverse dict
+    element = <char *>calloc(allocation_size, sizeof(char))
 
     for i in range(iter_range):
-        # TODO: Consider indexing directly into the array for efficiency
-        element = in_buffer[i]
+        # strings are stored without null termination in ndarrays: need a 
+        # buffer to append null termination to use usual string algorithms
+        memcpy(element, &(in_buffer[i, 0]), in_buffer.shape[1])
         k = kh_get_str(table, element)
         if k != table.n_buckets:
             idx = table.vals[k]
@@ -60,24 +68,33 @@ cdef void _factorize_str_helper(Py_ssize_t iter_range,
             strcpy(insert, element)
             k = kh_put_str(table, insert, &ret)
             table.vals[k] = idx = count[0]
-            reverse[count[0]] = element
+            reverse_keys.push_back(count[0])
+            reverse_values.push_back(insert)
             count[0] += 1
         out_buffer[i] = idx
 
+    free(element)
+
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def factorize_str(carray carray_, carray labels=None):
     cdef:
         chunk chunk_
-        Py_ssize_t n, i, count, chunklen, leftover_elements
+        Py_ssize_t n, i, count, chunklen, leftover_elements, allocation_size, \
+                   nchunks
+        vector[Py_ssize_t] reverse_keys
+        vector[char *] reverse_values
         dict reverse
         ndarray in_buffer
+        char[:, :] in_buffer_view
         ndarray[npy_uint64] out_buffer
+        uint64_t[:] out_buffer_view
         kh_str_t *table
 
     count = 0
     ret = 0
     reverse = {}
+    nchunks = carray_.nchunks
 
     n = len(carray_)
     chunklen = carray_.chunklen
@@ -85,40 +102,64 @@ def factorize_str(carray carray_, carray labels=None):
         labels = carray([], dtype='int64', expectedlen=n)
     # in-buffer isn't typed, because cython doesn't support string arrays (?)
     out_buffer = np.empty(chunklen, dtype='uint64')
+    # initialise cython typed memoryview to allow indexing directly into array
+    out_buffer_view = out_buffer
     in_buffer = np.empty(chunklen, dtype=carray_.dtype)
     table = kh_init_str()
 
-    for i in range(carray_.nchunks):
-        chunk_ = carray_.chunks[i]
-        # decompress into in_buffer
-        chunk_._getitem(0, chunklen, in_buffer.data)
-        _factorize_str_helper(chunklen,
-                        carray_.dtype.itemsize + 1,
-                        in_buffer,
-                        out_buffer,
-                        table,
-                        &count,
-                        reverse,
-                        )
-        # compress out_buffer into labels
-        labels.append(out_buffer.astype(np.int64))
+    # the uint8 view is a workaround to allow the definition of a reshaped 
+    # cython memoryview on string elements contained in a ndarray. 
+    # This allows convenient indexing in the form var[element no, string pos]
+    in_buffer_view = in_buffer.view('uint8') \
+                              .reshape(chunklen, carray_.dtype.itemsize)
+    allocation_size = in_buffer_view.shape[1] + 1
+    with nogil:
+        for i in xrange(nchunks):
+            with gil:
+                chunk_ = carray_.chunks[i]
+                # decompress into in_buffer
+                # note: _getitem releases gil during blosc decompression
+                chunk_._getitem(0, chunklen, in_buffer.data)
+            _factorize_str_helper(chunklen,
+                            allocation_size,
+                            in_buffer_view,
+                            out_buffer_view,
+                            table,
+                            &count,
+                            reverse_keys,
+                            reverse_values,
+                            )
+            with gil:
+                # compress out_buffer into labels
+                labels.append(out_buffer.astype(np.int64))
 
     leftover_elements = cython.cdiv(carray_.leftover, carray_.atomsize)
+    in_buffer_view = carray_.leftover_array \
+                            .view('uint8') \
+                            .reshape(chunklen, carray_.dtype.itemsize)
     if leftover_elements > 0:
-        _factorize_str_helper(leftover_elements,
-                          carray_.dtype.itemsize + 1,
-                          carray_.leftover_array,
-                          out_buffer,
-                          table,
-                          &count,
-                          reverse,
-                          )
+        with nogil:
+            _factorize_str_helper(leftover_elements,
+                                allocation_size,
+                                in_buffer_view,
+                                out_buffer_view,
+                                table,
+                                &count,
+                                reverse_keys,
+                                reverse_values,
+                                )
 
     # compress out_buffer into labels
     labels.append(out_buffer[:leftover_elements].astype(np.int64))
 
     kh_destroy_str(table)
 
+    # construct python dict from vectors
+    reverse = dict(zip(reverse_keys, reverse_values))
+    # free the memory allocated for the strings in the reverse_values list
+    for i in range(reverse_values.size()):
+        free(reverse_values[i])
+
     return labels, reverse
 
 @cython.wraparound(False)
diff --git a/bquery/khash.pxd b/bquery/khash.pxd
index a8fd51a..5e855c7 100644
--- a/bquery/khash.pxd
+++ b/bquery/khash.pxd
@@ -48,9 +48,9 @@ cdef extern from "khash_python.h":
     inline kh_str_t* kh_init_str()
     inline void kh_destroy_str(kh_str_t*)
     inline void kh_clear_str(kh_str_t*)
-    inline khint_t kh_get_str(kh_str_t*, kh_cstr_t)
+    inline khint_t kh_get_str(kh_str_t*, kh_cstr_t) nogil
     inline void kh_resize_str(kh_str_t*, khint_t)
-    inline khint_t kh_put_str(kh_str_t*, kh_cstr_t, int*)
+    inline khint_t kh_put_str(kh_str_t*, kh_cstr_t, int*) nogil
     inline void kh_del_str(kh_str_t*, khint_t)
 
     bint kh_exist_str(kh_str_t*, khiter_t)
diff --git a/setup.py b/setup.py
index d311871..79915e9 100644
--- a/setup.py
+++ b/setup.py
@@ -137,7 +137,8 @@ def check_import(pkgname, pkgver):
                     library_dirs=lib_dirs,
                     libraries=libs,
                     extra_link_args=LFLAGS,
-                    extra_compile_args=CFLAGS),
+                    extra_compile_args=CFLAGS,
+                    language='c++'),
       ],
       packages=['bquery', 'bquery.tests'],
 )

From 49cbc1fdb39f27d311da0874d03a4fbb973b4832 Mon Sep 17 00:00:00 2001
From: ARF <none@none>
Date: Fri, 6 Mar 2015 17:17:35 +0100
Subject: [PATCH 2/2] removed reverse keys

---
 bquery/ctable_ext.pyx | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/bquery/ctable_ext.pyx b/bquery/ctable_ext.pyx
index daf2a35..e03b142 100644
--- a/bquery/ctable_ext.pyx
+++ b/bquery/ctable_ext.pyx
@@ -36,7 +36,6 @@ cdef void _factorize_str_helper(Py_ssize_t iter_range,
                        uint64_t[:] out_buffer,
                        kh_str_t *table,
                        Py_ssize_t * count,
-                       vector[Py_ssize_t] & reverse_keys,
                        vector[char *] & reverse_values,
                        ) nogil:
     cdef:
@@ -68,7 +67,6 @@ cdef void _factorize_str_helper(Py_ssize_t iter_range,
             strcpy(insert, element)
             k = kh_put_str(table, insert, &ret)
             table.vals[k] = idx = count[0]
-            reverse_keys.push_back(count[0])
             reverse_values.push_back(insert)
             count[0] += 1
         out_buffer[i] = idx
@@ -82,7 +80,6 @@ def factorize_str(carray carray_, carray labels=None):
         chunk chunk_
         Py_ssize_t n, i, count, chunklen, leftover_elements, allocation_size, \
                    nchunks
-        vector[Py_ssize_t] reverse_keys
         vector[char *] reverse_values
         dict reverse
         ndarray in_buffer
@@ -126,7 +123,6 @@ def factorize_str(carray carray_, carray labels=None):
                             out_buffer_view,
                             table,
                             &count,
-                            reverse_keys,
                             reverse_values,
                             )
             with gil:
@@ -145,7 +141,6 @@ def factorize_str(carray carray_, carray labels=None):
                                 out_buffer_view,
                                 table,
                                 &count,
-                                reverse_keys,
                                 reverse_values,
                                 )
 
@@ -154,10 +149,10 @@ def factorize_str(carray carray_, carray labels=None):
 
     kh_destroy_str(table)
 
-    # construct python dict from vectors
-    reverse = dict(zip(reverse_keys, reverse_values))
+    # construct python dict from vectors and
     # free the memory allocated for the strings in the reverse_values list
     for i in range(reverse_values.size()):
+        reverse[i] = reverse_values[i]
         free(reverse_values[i])
 
     return labels, reverse