diff --git a/bquery/ctable_ext.pyx b/bquery/ctable_ext.pyx
index 1e4becb..e03b142 100644
--- a/bquery/ctable_ext.pyx
+++ b/bquery/ctable_ext.pyx
@@ -1,8 +1,9 @@
 import numpy as np
 import cython
-from numpy cimport ndarray, dtype, npy_intp, npy_int32, npy_uint64, npy_int64, npy_float64
-from libc.stdlib cimport malloc
-from libc.string cimport strcpy
+from numpy cimport ndarray, dtype, npy_intp, npy_int32, npy_uint64, npy_int64, npy_float64, uint64_t
+from libc.stdlib cimport malloc, calloc, free
+from libc.string cimport strcpy, memcpy
+from libcpp.vector cimport vector
 from khash cimport *
 from bcolz.carray_ext cimport carray, chunk
 
@@ -31,12 +32,12 @@ DEF _SORTED_COUNT_DISTINCT = 4
 @cython.boundscheck(False)
 cdef void _factorize_str_helper(Py_ssize_t iter_range,
                        Py_ssize_t allocation_size,
-                       ndarray in_buffer,
-                       ndarray[npy_uint64] out_buffer,
+                       char[:, :] in_buffer,
+                       uint64_t[:] out_buffer,
                        kh_str_t *table,
                        Py_ssize_t * count,
-                       dict reverse,
-                       ):
+                       vector[char *] & reverse_values,
+                       ) nogil:
     cdef:
         Py_ssize_t i, idx
         int ret
@@ -45,10 +46,16 @@ cdef void _factorize_str_helper(Py_ssize_t iter_range,
         khiter_t k
 
     ret = 0
+    # allocate enough memory to hold the string element plus one extra byte
+    # for the null terminator. The buffer must be zero-filled (calloc): memcpy
+    # below only writes in_buffer.shape[1] bytes, so the final byte has to be
+    # 0 already; without it, duplicate keys occur in the reverse dict
+    element = <char *>calloc(allocation_size, sizeof(char))
 
     for i in range(iter_range):
-        # TODO: Consider indexing directly into the array for efficiency
-        element = in_buffer[i]
+        # strings are stored without null termination in ndarrays: copy each
+        # one into the zeroed buffer so C string functions find a terminator
+        memcpy(element, &(in_buffer[i, 0]), in_buffer.shape[1])
         k = kh_get_str(table, element)
         if k != table.n_buckets:
             idx = table.vals[k]
@@ -60,24 +67,31 @@ cdef void _factorize_str_helper(Py_ssize_t iter_range,
             strcpy(insert, element)
             k = kh_put_str(table, insert, &ret)
             table.vals[k] = idx = count[0]
-            reverse[count[0]] = element
+            reverse_values.push_back(insert)
             count[0] += 1
         out_buffer[i] = idx
 
+    free(element)
+
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def factorize_str(carray carray_, carray labels=None):
     cdef:
         chunk chunk_
-        Py_ssize_t n, i, count, chunklen, leftover_elements
+        Py_ssize_t n, i, count, chunklen, leftover_elements, allocation_size, \
+                   nchunks
+        vector[char *] reverse_values
         dict reverse
         ndarray in_buffer
+        char[:, :] in_buffer_view
         ndarray[npy_uint64] out_buffer
+        uint64_t[:] out_buffer_view
         kh_str_t *table
 
     count = 0
     ret = 0
     reverse = {}
+    nchunks = carray_.nchunks
 
     n = len(carray_)
     chunklen = carray_.chunklen
@@ -85,40 +99,62 @@ def factorize_str(carray carray_, carray labels=None):
         labels = carray([], dtype='int64', expectedlen=n)
     # in-buffer isn't typed, because cython doesn't support string arrays (?)
     out_buffer = np.empty(chunklen, dtype='uint64')
+    # initialise cython typed memoryview to allow indexing directly into array
+    out_buffer_view = out_buffer
     in_buffer = np.empty(chunklen, dtype=carray_.dtype)
     table = kh_init_str()
 
-    for i in range(carray_.nchunks):
-        chunk_ = carray_.chunks[i]
-        # decompress into in_buffer
-        chunk_._getitem(0, chunklen, in_buffer.data)
-        _factorize_str_helper(chunklen,
-                        carray_.dtype.itemsize + 1,
-                        in_buffer,
-                        out_buffer,
-                        table,
-                        &count,
-                        reverse,
-                        )
-        # compress out_buffer into labels
-        labels.append(out_buffer.astype(np.int64))
+    # the uint8 view is a workaround to allow the definition of a reshaped
+    # cython memoryview on string elements contained in an ndarray.
+    # This allows convenient indexing in the form var[element no, string pos]
+    in_buffer_view = in_buffer.view('uint8') \
+                              .reshape(chunklen, carray_.dtype.itemsize)
+    allocation_size = in_buffer_view.shape[1] + 1
+    with nogil:
+        for i in xrange(nchunks):
+            with gil:
+                chunk_ = carray_.chunks[i]
+                # decompress into in_buffer
+                # note: _getitem releases gil during blosc decompression
+                chunk_._getitem(0, chunklen, in_buffer.data)
+            _factorize_str_helper(chunklen,
+                            allocation_size,
+                            in_buffer_view,
+                            out_buffer_view,
+                            table,
+                            &count,
+                            reverse_values,
+                            )
+            with gil:
+                # compress out_buffer into labels
+                labels.append(out_buffer.astype(np.int64))
 
     leftover_elements = cython.cdiv(carray_.leftover, carray_.atomsize)
+    in_buffer_view = carray_.leftover_array \
+                            .view('uint8') \
+                            .reshape(chunklen, carray_.dtype.itemsize)
     if leftover_elements > 0:
-        _factorize_str_helper(leftover_elements,
-                          carray_.dtype.itemsize + 1,
-                          carray_.leftover_array,
-                          out_buffer,
-                          table,
-                          &count,
-                          reverse,
-                          )
+        with nogil:
+            _factorize_str_helper(leftover_elements,
+                                allocation_size,
+                                in_buffer_view,
+                                out_buffer_view,
+                                table,
+                                &count,
+                                reverse_values,
+                                )
 
     # compress out_buffer into labels
     labels.append(out_buffer[:leftover_elements].astype(np.int64))
 
     kh_destroy_str(table)
 
+    # construct python dict from the reverse_values vector and
+    # free the memory allocated for the strings held in that vector
+    for i in range(reverse_values.size()):
+        reverse[i] = reverse_values[i]
+        free(reverse_values[i])
+
     return labels, reverse
 
 @cython.wraparound(False)
diff --git a/bquery/khash.pxd b/bquery/khash.pxd
index a8fd51a..5e855c7 100644
--- a/bquery/khash.pxd
+++ b/bquery/khash.pxd
@@ -48,9 +48,9 @@ cdef extern from "khash_python.h":
     inline kh_str_t* kh_init_str()
     inline void kh_destroy_str(kh_str_t*)
     inline void kh_clear_str(kh_str_t*)
-    inline khint_t kh_get_str(kh_str_t*, kh_cstr_t)
+    inline khint_t kh_get_str(kh_str_t*, kh_cstr_t) nogil
     inline void kh_resize_str(kh_str_t*, khint_t)
-    inline khint_t kh_put_str(kh_str_t*, kh_cstr_t, int*)
+    inline khint_t kh_put_str(kh_str_t*, kh_cstr_t, int*) nogil
     inline void kh_del_str(kh_str_t*, khint_t)
 
     bint kh_exist_str(kh_str_t*, khiter_t)
diff --git a/setup.py b/setup.py
index d311871..79915e9 100644
--- a/setup.py
+++ b/setup.py
@@ -137,7 +137,8 @@ def check_import(pkgname, pkgver):
                     library_dirs=lib_dirs,
                     libraries=libs,
                     extra_link_args=LFLAGS,
-                    extra_compile_args=CFLAGS),
+                    extra_compile_args=CFLAGS,
+                    language='c++'),
       ],
       packages=['bquery', 'bquery.tests'],
 )