From e20a94ddab478c8698a5fb534f2949eb80d22446 Mon Sep 17 00:00:00 2001 From: ARF Date: Fri, 6 Mar 2015 11:34:55 +0100 Subject: [PATCH 1/2] performance improvements to _factorize_str_helper Speedup ca. x1.5 cf. master branch on my machine with compressed bcolz Approximate contributions: - 1/3: direct indexing into arrays using typed memoryviews - 1/3: substitution of reverse dict with std::vector objects - 1/3: use of nested `with nogil`, `with gil` construct --- bquery/ctable_ext.pyx | 107 +++++++++++++++++++++++++++++------------- bquery/khash.pxd | 4 +- setup.py | 3 +- 3 files changed, 78 insertions(+), 36 deletions(-) diff --git a/bquery/ctable_ext.pyx b/bquery/ctable_ext.pyx index 1e4becb..daf2a35 100644 --- a/bquery/ctable_ext.pyx +++ b/bquery/ctable_ext.pyx @@ -1,8 +1,9 @@ import numpy as np import cython -from numpy cimport ndarray, dtype, npy_intp, npy_int32, npy_uint64, npy_int64, npy_float64 -from libc.stdlib cimport malloc -from libc.string cimport strcpy +from numpy cimport ndarray, dtype, npy_intp, npy_int32, npy_uint64, npy_int64, npy_float64, uint64_t +from libc.stdlib cimport malloc, calloc, free +from libc.string cimport strcpy, memcpy +from libcpp.vector cimport vector from khash cimport * from bcolz.carray_ext cimport carray, chunk @@ -31,12 +32,13 @@ DEF _SORTED_COUNT_DISTINCT = 4 @cython.boundscheck(False) cdef void _factorize_str_helper(Py_ssize_t iter_range, Py_ssize_t allocation_size, - ndarray in_buffer, - ndarray[npy_uint64] out_buffer, + char[:, :] in_buffer, + uint64_t[:] out_buffer, kh_str_t *table, Py_ssize_t * count, - dict reverse, - ): + vector[Py_ssize_t] & reverse_keys, + vector[char *] & reverse_values, + ) nogil: cdef: Py_ssize_t i, idx int ret @@ -45,10 +47,16 @@ cdef void _factorize_str_helper(Py_ssize_t iter_range, khiter_t k ret = 0 + # allocate enough memory to hold the string element, add one for the + # null byte that marks the end of the string. + # TODO: understand why zero-filling is necessary. Without zero-filling + # the buffer, duplicate keys occur in the reverse dict + element = calloc(allocation_size, sizeof(char)) for i in range(iter_range): - # TODO: Consider indexing directly into the array for efficiency - element = in_buffer[i] + # strings are stored without null termination in ndarrays: need a + # buffer to append null termination to use usual string algorithms + memcpy(element, &(in_buffer[i, 0]), in_buffer.shape[1]) k = kh_get_str(table, element) if k != table.n_buckets: idx = table.vals[k] @@ -60,24 +68,33 @@ cdef void _factorize_str_helper(Py_ssize_t iter_range, strcpy(insert, element) k = kh_put_str(table, insert, &ret) table.vals[k] = idx = count[0] - reverse[count[0]] = element + reverse_keys.push_back(count[0]) + reverse_values.push_back(insert) count[0] += 1 out_buffer[i] = idx + free(element) + @cython.wraparound(False) @cython.boundscheck(False) def factorize_str(carray carray_, carray labels=None): cdef: chunk chunk_ - Py_ssize_t n, i, count, chunklen, leftover_elements + Py_ssize_t n, i, count, chunklen, leftover_elements, allocation_size, \ + nchunks + vector[Py_ssize_t] reverse_keys + vector[char *] reverse_values dict reverse ndarray in_buffer + char[:, :] in_buffer_view ndarray[npy_uint64] out_buffer + uint64_t[:] out_buffer_view kh_str_t *table count = 0 ret = 0 reverse = {} + nchunks = carray_.nchunks n = len(carray_) chunklen = carray_.chunklen @@ -85,40 +102,64 @@ def factorize_str(carray carray_, carray labels=None): labels = carray([], dtype='int64', expectedlen=n) # in-buffer isn't typed, because cython doesn't support string arrays (?) out_buffer = np.empty(chunklen, dtype='uint64') + # initialise cython typed memoryview to allow indexing directly into array + out_buffer_view = out_buffer in_buffer = np.empty(chunklen, dtype=carray_.dtype) table = kh_init_str() - for i in range(carray_.nchunks): - chunk_ = carray_.chunks[i] - # decompress into in_buffer - chunk_._getitem(0, chunklen, in_buffer.data) - _factorize_str_helper(chunklen, - carray_.dtype.itemsize + 1, - in_buffer, - out_buffer, - table, - &count, - reverse, - ) - # compress out_buffer into labels - labels.append(out_buffer.astype(np.int64)) + # the uint8 view is a workaround to allow the definition of a reshaped + # cython memoryview on string elements contained in a ndarray. + # This allows convenient indexing in the form var[element no, string pos] + in_buffer_view = in_buffer.view('uint8') \ + .reshape(chunklen, carray_.dtype.itemsize) + allocation_size = in_buffer_view.shape[1] + 1 + with nogil: + for i in xrange(nchunks): + with gil: + chunk_ = carray_.chunks[i] + # decompress into in_buffer + # note: _getitem releases gil during blosc decompression + chunk_._getitem(0, chunklen, in_buffer.data) + _factorize_str_helper(chunklen, + allocation_size, + in_buffer_view, + out_buffer_view, + table, + &count, + reverse_keys, + reverse_values, + ) + with gil: + # compress out_buffer into labels + labels.append(out_buffer.astype(np.int64)) leftover_elements = cython.cdiv(carray_.leftover, carray_.atomsize) + in_buffer_view = carray_.leftover_array \ + .view('uint8') \ + .reshape(chunklen, carray_.dtype.itemsize) if leftover_elements > 0: - _factorize_str_helper(leftover_elements, - carray_.dtype.itemsize + 1, - carray_.leftover_array, - out_buffer, - table, - &count, - reverse, - ) + with nogil: + _factorize_str_helper(leftover_elements, + allocation_size, + in_buffer_view, + out_buffer_view, + table, + &count, + reverse_keys, + reverse_values, + ) # compress out_buffer into labels labels.append(out_buffer[:leftover_elements].astype(np.int64)) kh_destroy_str(table) + # construct python dict from vectors + reverse = dict(zip(reverse_keys, reverse_values)) + # free the memory allocated for the strings in the reverse_values list + for i in range(reverse_values.size()): + free(reverse_values[i]) + return labels, reverse @cython.wraparound(False) diff --git a/bquery/khash.pxd b/bquery/khash.pxd index a8fd51a..5e855c7 100644 --- a/bquery/khash.pxd +++ b/bquery/khash.pxd @@ -48,9 +48,9 @@ cdef extern from "khash_python.h": inline kh_str_t* kh_init_str() inline void kh_destroy_str(kh_str_t*) inline void kh_clear_str(kh_str_t*) - inline khint_t kh_get_str(kh_str_t*, kh_cstr_t) + inline khint_t kh_get_str(kh_str_t*, kh_cstr_t) nogil inline void kh_resize_str(kh_str_t*, khint_t) - inline khint_t kh_put_str(kh_str_t*, kh_cstr_t, int*) + inline khint_t kh_put_str(kh_str_t*, kh_cstr_t, int*) nogil inline void kh_del_str(kh_str_t*, khint_t) bint kh_exist_str(kh_str_t*, khiter_t) diff --git a/setup.py b/setup.py index d311871..79915e9 100644 --- a/setup.py +++ b/setup.py @@ -137,7 +137,8 @@ def check_import(pkgname, pkgver): library_dirs=lib_dirs, libraries=libs, extra_link_args=LFLAGS, - extra_compile_args=CFLAGS), + extra_compile_args=CFLAGS, + language='c++'), ], packages=['bquery', 'bquery.tests'], ) From 49cbc1fdb39f27d311da0874d03a4fbb973b4832 Mon Sep 17 00:00:00 2001 From: ARF Date: Fri, 6 Mar 2015 17:17:35 +0100 Subject: [PATCH 2/2] removed reverse keys --- bquery/ctable_ext.pyx | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/bquery/ctable_ext.pyx b/bquery/ctable_ext.pyx index daf2a35..e03b142 100644 --- a/bquery/ctable_ext.pyx +++ b/bquery/ctable_ext.pyx @@ -36,7 +36,6 @@ cdef void _factorize_str_helper(Py_ssize_t iter_range, uint64_t[:] out_buffer, kh_str_t *table, Py_ssize_t * count, - vector[Py_ssize_t] & reverse_keys, vector[char *] & reverse_values, ) nogil: cdef: @@ -68,7 +67,6 @@ cdef void _factorize_str_helper(Py_ssize_t iter_range, strcpy(insert, element) k = kh_put_str(table, insert, &ret) table.vals[k] = idx = count[0] - reverse_keys.push_back(count[0]) reverse_values.push_back(insert) count[0] += 1 out_buffer[i] = idx @@ -82,7 +80,6 @@ def factorize_str(carray carray_, carray labels=None): chunk chunk_ Py_ssize_t n, i, count, chunklen, leftover_elements, allocation_size, \ nchunks - vector[Py_ssize_t] reverse_keys vector[char *] reverse_values dict reverse ndarray in_buffer @@ -126,7 +123,6 @@ def factorize_str(carray carray_, carray labels=None): out_buffer_view, table, &count, - reverse_keys, reverse_values, ) with gil: @@ -145,7 +141,6 @@ def factorize_str(carray carray_, carray labels=None): out_buffer_view, table, &count, - reverse_keys, reverse_values, ) @@ -154,10 +149,10 @@ def factorize_str(carray carray_, carray labels=None): kh_destroy_str(table) - # construct python dict from vectors - reverse = dict(zip(reverse_keys, reverse_values)) + # construct python dict from vectors and # free the memory allocated for the strings in the reverse_values list for i in range(reverse_values.size()): + reverse[i] = reverse_values[i] free(reverse_values[i]) return labels, reverse