From b16a083ad513a203a53b90784c8378f0ca2e2e56 Mon Sep 17 00:00:00 2001
From: Guilherme Leobas <guilhermeleobas@gmail.com>
Date: Sat, 30 Jul 2022 02:17:35 -0300
Subject: [PATCH 1/3] Numba target extension

---
 rbc/externals/__init__.py              |  14 +-
 rbc/heavydb/__init__.py                |   1 +
 rbc/heavydb/heavydb_compiler.py        | 352 +++++++++++++++++++++++++
 rbc/heavydb/mathimpl.py                |  70 ++++-
 rbc/heavydb/remoteheavydb.py           |   4 +-
 rbc/irtools.py                         | 319 ++++++++--------------
 rbc/remotejit.py                       |   1 +
 rbc/tests/heavydb/test_column_basic.py |  22 ++
 rbc/tests/heavydb/test_heavydb.py      |   8 +-
 rbc/tests/heavydb/test_math.py         |   2 +-
 rbc/tests/test_externals_libdevice.py  |   9 +-
 utils/client_ssh_tunnel.conf           |   2 +-
 12 files changed, 572 insertions(+), 232 deletions(-)
 create mode 100644 rbc/heavydb/heavydb_compiler.py

diff --git a/rbc/externals/__init__.py b/rbc/externals/__init__.py
index 13a8edad..98f126b8 100644
--- a/rbc/externals/__init__.py
+++ b/rbc/externals/__init__.py
@@ -4,11 +4,15 @@
 
 
 def gen_codegen(fn_name):
-    def codegen(context, builder, sig, args):
-        # Need to retrieve the function name again
-        fndesc = funcdesc.ExternalFunctionDescriptor(fn_name, sig.return_type, sig.args)
-        func = context.declare_external_function(builder.module, fndesc)
-        return builder.call(func, args)
+    if fn_name.startswith('llvm.'):
+        def codegen(context, builder, sig, args):
+            func = builder.module.declare_intrinsic(fn_name, [a.type for a in args])
+            return builder.call(func, args)
+    else:
+        def codegen(context, builder, sig, args):
+            fndesc = funcdesc.ExternalFunctionDescriptor(fn_name, sig.return_type, sig.args)
+            func = context.declare_external_function(builder.module, fndesc)
+            return builder.call(func, args)
 
     return codegen
 
diff --git a/rbc/heavydb/__init__.py b/rbc/heavydb/__init__.py
index 2ebbba4f..ec66d4ca 100644
--- a/rbc/heavydb/__init__.py
+++ b/rbc/heavydb/__init__.py
@@ -13,6 +13,7 @@
 from .day_time_interval import *  # noqa: F401, F403
 from .year_month_time_interval import *  # noqa: F401, F403
 from .remoteheavydb import *  # noqa: F401, F403
+from .heavydb_compiler import *  # noqa: F401, F403
 
 from . import mathimpl as math  # noqa: F401
 from . import npyimpl as np  # noqa: F401
diff --git a/rbc/heavydb/heavydb_compiler.py b/rbc/heavydb/heavydb_compiler.py
new file mode 100644
index 00000000..3611d3ef
--- /dev/null
+++ b/rbc/heavydb/heavydb_compiler.py
@@ -0,0 +1,352 @@
+from contextlib import contextmanager
+import llvmlite.binding as llvm
+from rbc.targetinfo import TargetInfo
+from numba.np import ufunc_db
+from numba import _dynfunc
+from numba.core import (
+    codegen, compiler_lock, typing,
+    base, cpu, utils, descriptors,
+    dispatcher, callconv, imputils,
+    options,)
+from numba.core.target_extension import (
+    Generic,
+    target_registry,
+    dispatcher_registry,
+)
+
+
+class HeavyDB_CPU(Generic):
+    """Mark the target as HeavyDB CPU
+    """
+
+
+class HeavyDB_GPU(Generic):
+    """Mark the target as HeavyDB GPU
+    """
+
+
+target_registry['heavydb_cpu'] = HeavyDB_CPU
+target_registry['heavydb_gpu'] = HeavyDB_GPU
+
+heavydb_cpu_registry = imputils.Registry(name='heavydb_cpu_registry')
+heavydb_gpu_registry = imputils.Registry(name='heavydb_gpu_registry')
+
+
+class _NestedContext(object):
+    _typing_context = None
+    _target_context = None
+
+    @contextmanager
+    def nested(self, typing_context, target_context):
+        old_nested = self._typing_context, self._target_context
+        try:
+            self._typing_context = typing_context
+            self._target_context = target_context
+            yield
+        finally:
+            self._typing_context, self._target_context = old_nested
+
+
+_options_mixin = options.include_default_options(
+    "no_rewrites",
+    "no_cpython_wrapper",
+    "no_cfunc_wrapper",
+    "fastmath",
+    "inline",
+    "boundscheck",
+    "nopython",
+    # Add "target_backend" as an accepted option for the CPU in @jit(...)
+    "target_backend",
+)
+
+
+class HeavyDBTargetOptions(_options_mixin, options.TargetOptions):
+    def finalize(self, flags, options):
+        flags.enable_pyobject = False
+        flags.enable_looplift = False
+        flags.nrt = False
+        flags.debuginfo = False
+        flags.boundscheck = False
+        flags.enable_pyobject_looplift = False
+        flags.no_rewrites = True
+        flags.auto_parallel = cpu.ParallelOptions(False)
+        flags.inherit_if_not_set("fastmath")
+        flags.inherit_if_not_set("error_model", default="python")
+        # Add "target_backend" as an option that inherits from the caller
+        flags.inherit_if_not_set("target_backend")
+
+
+class HeavyDBTarget(descriptors.TargetDescriptor):
+    options = HeavyDBTargetOptions
+    _nested = _NestedContext()
+
+    @utils.cached_property
+    def _toplevel_target_context(self):
+        # Lazily-initialized top-level target context, for all threads
+        return JITRemoteTargetContext(self.typing_context, self._target_name)
+
+    @utils.cached_property
+    def _toplevel_typing_context(self):
+        # Lazily-initialized top-level typing context, for all threads
+        return JITRemoteTypingContext()
+
+    @property
+    def target_context(self):
+        """
+        The target context for CPU/GPU targets.
+        """
+        nested = self._nested._target_context
+        if nested is not None:
+            return nested
+        else:
+            return self._toplevel_target_context
+
+    @property
+    def typing_context(self):
+        """
+        The typing context for CPU/GPU targets.
+        """
+        nested = self._nested._typing_context
+        if nested is not None:
+            return nested
+        else:
+            return self._toplevel_typing_context
+
+    def nested_context(self, typing_context, target_context):
+        """
+        A context manager temporarily replacing the contexts with the
+        given ones, for the current thread of execution.
+        """
+        return self._nested.nested(typing_context, target_context)
+
+
+# Create a target instance
+heavydb_cpu_target = HeavyDBTarget("heavydb_cpu")
+heavydb_gpu_target = HeavyDBTarget("heavydb_gpu")
+
+
+# Declare a dispatcher for the CPU/GPU targets
+class HeavyDBCPUDispatcher(dispatcher.Dispatcher):
+    targetdescr = heavydb_cpu_target
+
+
+class HeavyDBGPUDispatcher(dispatcher.Dispatcher):
+    targetdescr = heavydb_gpu_target
+
+
+# Register a dispatcher for the target, a lot of the code uses this
+# internally to work out what to do RE compilation
+dispatcher_registry[target_registry["heavydb_cpu"]] = HeavyDBCPUDispatcher
+dispatcher_registry[target_registry["heavydb_gpu"]] = HeavyDBGPUDispatcher
+
+
+class JITRemoteCodeLibrary(codegen.JITCodeLibrary):
+    """JITRemoteCodeLibrary was introduced to prevent numba from calling functions
+    that check if the module is final. See xnd-project/rbc issue #87.
+    """
+
+    def get_pointer_to_function(self, name):
+        """We can return any random number here! This is just to prevent numba from
+        trying to check if the symbol given by "name" is defined in the module.
+        In cases where RBC is calling an external function (e.g. allocate_varlen_buffer)
+        the symbol will not be defined in the module, resulting in an error.
+        """
+        return 0
+
+    def _finalize_specific(self):
+        """Same as codegen.JITCodeLibrary._finalize_specific but without
+        calling _ensure_finalize at the end
+        """
+        self._codegen._scan_and_fix_unresolved_refs(self._final_module)
+
+
+class JITRemoteCodegen(codegen.JITCPUCodegen):
+    _library_class = JITRemoteCodeLibrary
+
+    def _get_host_cpu_name(self):
+        target_info = TargetInfo()
+        return target_info.device_name
+
+    def _get_host_cpu_features(self):
+        target_info = TargetInfo()
+        features = target_info.device_features
+        server_llvm_version = target_info.llvm_version
+        if server_llvm_version is None or target_info.is_gpu:
+            return ''
+        client_llvm_version = llvm.llvm_version_info
+
+        # See https://github.com/xnd-project/rbc/issues/45
+        remove_features = {
+            (11, 8): ['tsxldtrk', 'amx-tile', 'amx-bf16', 'serialize', 'amx-int8',
+                      'avx512vp2intersect', 'tsxldtrk', 'amx-tile', 'amx-bf16',
+                      'serialize', 'amx-int8', 'avx512vp2intersect', 'tsxldtrk',
+                      'amx-tile', 'amx-bf16', 'serialize', 'amx-int8',
+                      'avx512vp2intersect', 'cx8', 'enqcmd', 'avx512bf16'],
+            (11, 10): ['tsxldtrk', 'amx-tile', 'amx-bf16', 'serialize', 'amx-int8'],
+            (9, 8): ['cx8', 'enqcmd', 'avx512bf16'],
+        }.get((server_llvm_version[0], client_llvm_version[0]), [])
+        for f in remove_features:
+            features = features.replace('+' + f, '').replace('-' + f, '')
+        return features
+
+    def _customize_tm_options(self, options):
+        super()._customize_tm_options(options)
+        # fix reloc_model as the base method sets it using local target
+        target_info = TargetInfo()
+        if target_info.arch.startswith('x86'):
+            reloc_model = 'static'
+        else:
+            reloc_model = 'default'
+        options['reloc'] = reloc_model
+
+    def set_env(self, env_name, env):
+        return None
+
+
+class JITRemoteTypingContext(typing.Context):
+    """JITRemote Typing Context
+    """
+
+    def load_additional_registries(self):
+        from . import mathimpl
+        self.install_registry(mathimpl.registry)
+        return super().load_additional_registries()
+
+
+class JITRemoteTargetContext(base.BaseContext):
+    # Whether dynamic globals (CPU runtime addresses) are allowed
+    allow_dynamic_globals = True
+
+    def __init__(self, typing_context, target):
+        if target not in ('heavydb_cpu', 'heavydb_gpu'):
+            raise ValueError(f'Target "{target}" not supported')
+        super().__init__(typing_context, target)
+
+    @compiler_lock.global_compiler_lock
+    def init(self):
+        target_info = TargetInfo()
+        self.address_size = target_info.bits
+        self.is32bit = (self.address_size == 32)
+        self._internal_codegen = JITRemoteCodegen("numba.exec")
+        self._target_data = llvm.create_target_data(target_info.datalayout)
+
+    def refresh(self):
+        if self.target_name == 'heavydb_cpu':
+            registry = heavydb_cpu_registry
+        else:
+            registry = heavydb_gpu_registry
+
+        try:
+            loader = self._registries[registry]
+        except KeyError:
+            loader = imputils.RegistryLoader(registry)
+            self._registries[registry] = loader
+
+        self.install_registry(registry)
+        # Also refresh typing context, since @overload declarations can
+        # affect it.
+        self.typing_context.refresh()
+        super().refresh()
+
+    def load_additional_registries(self):
+        # Add implementations that work via import
+        from numba.cpython import (builtins, charseq, enumimpl, hashing, heapq,  # noqa: F401
+                                   iterators, listobj, numbers, rangeobj,
+                                   setobj, slicing, tupleobj, unicode,)
+
+        self.install_registry(imputils.builtin_registry)
+
+        # uncomment as needed!
+        # from numba.core import optional
+        from numba.np import linalg, polynomial, arraymath, arrayobj  # noqa: F401
+        # from numba.typed import typeddict, dictimpl
+        # from numba.typed import typedlist, listobject
+        # from numba.experimental import jitclass, function_type
+        # from numba.np import npdatetime
+
+        # Add target specific implementations
+        from numba.np import npyimpl
+        from numba.cpython import mathimpl
+        # from numba.cpython import cmathimpl, mathimpl, printimpl, randomimpl
+        # from numba.misc import cffiimpl
+        # from numba.experimental.jitclass.base import ClassBuilder as \
+        #     jitclassimpl
+        # self.install_registry(cmathimpl.registry)
+        # self.install_registry(cffiimpl.registry)
+        self.install_registry(mathimpl.registry)
+        self.install_registry(npyimpl.registry)
+        # self.install_registry(printimpl.registry)
+        # self.install_registry(randomimpl.registry)
+        # self.install_registry(jitclassimpl.class_impl_registry)
+
+    def codegen(self):
+        return self._internal_codegen
+
+    @utils.cached_property
+    def call_conv(self):
+        return callconv.CPUCallConv(self)
+
+    @property
+    def target_data(self):
+        return self._target_data
+
+    def create_cpython_wrapper(self,
+                               library,
+                               fndesc,
+                               env,
+                               call_helper,
+                               release_gil=False):
+        # There's no cpython wrapper on HeavyDB
+        pass
+
+    def create_cfunc_wrapper(self,
+                             library,
+                             fndesc,
+                             env,
+                             call_helper,
+                             release_gil=False):
+        # There's no cfunc wrapper on HeavyDB
+        pass
+
+    def get_executable(self, library, fndesc, env):
+        """
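+        Build the Python-level callable for a compiled function.
+
+        Returns
+        -------
+        cfunc
+            callable created by ``_dynfunc.make_function`` from ``fnptr``,
+            the address that ``library.get_pointer_to_function`` returns
+            for the cpython wrapper name, and from ``env``, the
+            execution environment (from _dynfunc) passed in by the
+            caller. ``library`` is kept alive together with the callable.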
+        """
+        # although we don't use this function, it seems to be required
+        # by some parts of codegen in Numba.
+
+        # Code generation
+        fnptr = library.get_pointer_to_function(
+            fndesc.llvm_cpython_wrapper_name
+        )
+
+        # Note: we avoid reusing the original docstring to avoid encoding
+        # issues on Python 2, see issue #1908
+        doc = "compiled wrapper for %r" % (fndesc.qualname,)
+        cfunc = _dynfunc.make_function(
+            fndesc.lookup_module(),
+            fndesc.qualname.split(".")[-1],
+            doc,
+            fnptr,
+            env,
+            # objects to keepalive with the function
+            (library,),
+        )
+        library.codegen.set_env(self.get_env_name(fndesc), env)
+        return cfunc
+
+    def post_lowering(self, mod, library):
+        pass
+
+    # Overrides
+    def get_ufunc_info(self, ufunc_key):
+        return ufunc_db.get_ufunc_info(ufunc_key)
diff --git a/rbc/heavydb/mathimpl.py b/rbc/heavydb/mathimpl.py
index b508e8b5..3155179f 100644
--- a/rbc/heavydb/mathimpl.py
+++ b/rbc/heavydb/mathimpl.py
@@ -1,9 +1,17 @@
 import math
-from rbc.externals import gen_codegen, dispatch_codegen
-from numba.core.typing.templates import infer_global
-from numba.core.imputils import lower_builtin
-from numba.core.typing.templates import ConcreteTemplate, signature
+from rbc.externals import gen_codegen
+from numba.core.typing.templates import ConcreteTemplate, signature, Registry
 from numba.types import float32, float64, int32, int64, uint64, intp
+from numba.core.intrinsics import INTR_TO_CMATH
+from .heavydb_compiler import heavydb_cpu_registry, heavydb_gpu_registry
+
+
+lower_cpu = heavydb_cpu_registry.lower
+lower_gpu = heavydb_gpu_registry.lower
+
+
+registry = Registry()
+infer_global = registry.register_global
 
 
 # Adding missing cases in Numba
@@ -75,22 +83,31 @@ class Math_converter(ConcreteTemplate):
 binarys = []
 binarys += [("copysign", "copysignf", math.copysign)]
 binarys += [("atan2", "atan2f", math.atan2)]
-binarys += [("pow", "powf", math.pow)]
 binarys += [("fmod", "fmodf", math.fmod)]
 binarys += [("hypot", "hypotf", math.hypot)]
 binarys += [("remainder", "remainderf", math.remainder)]
 
 
 def impl_unary(fname, key, typ):
-    cpu = gen_codegen(fname)
+    if fname in INTR_TO_CMATH.values():
+        # use llvm intrinsics when possible
+        cpu = gen_codegen(f'llvm.{fname}')
+    else:
+        cpu = gen_codegen(fname)
     gpu = gen_codegen(f"__nv_{fname}")
-    lower_builtin(key, typ)(dispatch_codegen(cpu, gpu))
+    lower_cpu(key, typ)(cpu)
+    lower_gpu(key, typ)(gpu)
 
 
 def impl_binary(fname, key, typ):
-    cpu = gen_codegen(fname)
+    if fname in INTR_TO_CMATH.values():
+        # use llvm intrinsics when possible
+        cpu = gen_codegen(f'llvm.{fname}')
+    else:
+        cpu = gen_codegen(fname)
     gpu = gen_codegen(f"__nv_{fname}")
-    lower_builtin(key, typ, typ)(dispatch_codegen(cpu, gpu))
+    lower_cpu(key, typ, typ)(cpu)
+    lower_gpu(key, typ, typ)(gpu)
 
 
 for fname64, fname32, key in unarys:
@@ -105,17 +122,42 @@ def impl_binary(fname, key, typ):
 
 # manual mapping
 def impl_ldexp():
+    # cpu
     ldexp_cpu = gen_codegen('ldexp')
-    ldexp_gpu = gen_codegen('__nv_ldexp')
-
     ldexpf_cpu = gen_codegen('ldexpf')
-    ldexpf_gpu = gen_codegen('__nv_ldexpf')
+    lower_cpu(math.ldexp, float64, int32)(ldexp_cpu)
+    lower_cpu(math.ldexp, float32, int32)(ldexpf_cpu)
 
-    lower_builtin(math.ldexp, float64, int32)(dispatch_codegen(ldexp_cpu, ldexp_gpu))
-    lower_builtin(math.ldexp, float32, int32)(dispatch_codegen(ldexpf_cpu, ldexpf_gpu))
+    # gpu
+    ldexp_gpu = gen_codegen('__nv_ldexp')
+    ldexpf_gpu = gen_codegen('__nv_ldexpf')
+    lower_gpu(math.ldexp, float64, int32)(ldexp_gpu)
+    lower_gpu(math.ldexp, float32, int32)(ldexpf_gpu)
+
+
+def impl_pow():
+    # cpu
+    pow_cpu = gen_codegen('pow')
+    powf_cpu = gen_codegen('powf')
+    lower_cpu(math.pow, float64, float64)(pow_cpu)
+    lower_cpu(math.pow, float32, float32)(powf_cpu)
+    lower_cpu(math.pow, float64, int32)(pow_cpu)
+    lower_cpu(math.pow, float32, int32)(powf_cpu)
+
+    # gpu
+    pow_gpu = gen_codegen('__nv_pow')
+    powf_gpu = gen_codegen('__nv_powf')
+    powi_gpu = gen_codegen('__nv_powi')
+    powif_gpu = gen_codegen('__nv_powif')
+    lower_gpu(math.pow, float64, float64)(pow_gpu)
+    lower_gpu(math.pow, float32, float32)(powf_gpu)
+    lower_gpu(math.pow, float64, int32)(powi_gpu)
+    lower_gpu(math.pow, float32, int32)(powif_gpu)
 
 
 impl_ldexp()
+impl_pow()
+
 
 # CPU only:
 # math.gcd
diff --git a/rbc/heavydb/remoteheavydb.py b/rbc/heavydb/remoteheavydb.py
index 30dfa705..36c31cd5 100644
--- a/rbc/heavydb/remoteheavydb.py
+++ b/rbc/heavydb/remoteheavydb.py
@@ -261,7 +261,7 @@ def is_sizer(t):
 
 
 def get_sizer_enum(t):
-    """Return sizer enum value as defined by the omniscidb server.
+    """Return sizer enum value as defined by the HeavyDB server.
     """
     sizer = t.annotation()['sizer']
     sizer = output_buffer_sizer_map.get(sizer or None, sizer)
@@ -1530,5 +1530,5 @@ def remote_call(self, func, ftype: typesystem.Type, arguments: tuple, hold=False
 
 
 class RemoteOmnisci(RemoteHeavyDB):
-    """Omnisci - the previous brand of HeavyAI
+    """Omnisci - the previous brand name of HeavyAI
     """
diff --git a/rbc/irtools.py b/rbc/irtools.py
index 371843e6..90f20152 100644
--- a/rbc/irtools.py
+++ b/rbc/irtools.py
@@ -3,7 +3,6 @@
 
 import re
 import warnings
-from contextlib import contextmanager
 from collections import defaultdict
 from llvmlite import ir
 import llvmlite.binding as llvm
@@ -11,12 +10,33 @@
 from .errors import UnsupportedError
 from . import libfuncs
 from rbc.externals import stdio
-from numba.core import codegen, cpu, compiler_lock, \
+from numba.core import cpu, \
     registry, typing, compiler, sigutils, cgutils, \
-    extending, imputils
+    extending, target_extension, retarget, dispatcher
+from numba import njit
 from numba.core import errors as nb_errors
 
 
+class Retarget(retarget.BasicRetarget):
+
+    def __init__(self, target_name):
+        self.target_name = target_name
+        super().__init__()
+
+    @property
+    def output_target(self):
+        return self.target_name
+
+    def compile_retarget(self, cpu_disp):
+        kernel = njit(_target=self.target_name)(cpu_disp.py_func)
+        return kernel
+
+
+def switch_target(target_name):
+    tc = dispatcher.TargetConfigurationStack
+    return tc.switch_target(Retarget(target_name))
+
+
 int32_t = ir.IntType(32)
 int1_t = ir.IntType(1)
 
@@ -64,132 +84,6 @@ def get_called_functions(library, funcname=None):
 
 # ---------------------------------------------------------------------------
 
-class JITRemoteCodeLibrary(codegen.JITCodeLibrary):
-    """JITRemoteCodeLibrary was introduce to prevent numba from calling functions
-    that checks if the module is final. See xnd-project/rbc issue #87.
-    """
-
-    def get_pointer_to_function(self, name):
-        """We can return any random number here! This is just to prevent numba from
-        trying to check if the symbol given by "name" is defined in the module.
-        In cases were RBC is calling an external function (i.e. allocate_varlen_buffer)
-        the symbol will not be defined in the module, resulting in an error.
-        """
-        return 0
-
-    def _finalize_specific(self):
-        """Same as codegen.JITCodeLibrary._finalize_specific but without
-        calling _ensure_finalize at the end
-        """
-        self._codegen._scan_and_fix_unresolved_refs(self._final_module)
-
-
-class JITRemoteCodegen(codegen.JITCPUCodegen):
-    _library_class = JITRemoteCodeLibrary
-
-    def _get_host_cpu_name(self):
-        target_info = TargetInfo()
-        return target_info.device_name
-
-    def _get_host_cpu_features(self):
-        target_info = TargetInfo()
-        features = target_info.device_features
-        server_llvm_version = target_info.llvm_version
-        if server_llvm_version is None or target_info.is_gpu:
-            return ''
-        client_llvm_version = llvm.llvm_version_info
-
-        # See https://github.com/xnd-project/rbc/issues/45
-        remove_features = {
-            (12, 12): [], (11, 11): [], (10, 10): [], (9, 9): [], (8, 8): [],
-            (11, 8): ['tsxldtrk', 'amx-tile', 'amx-bf16', 'serialize', 'amx-int8',
-                      'avx512vp2intersect', 'tsxldtrk', 'amx-tile', 'amx-bf16',
-                      'serialize', 'amx-int8', 'avx512vp2intersect', 'tsxldtrk',
-                      'amx-tile', 'amx-bf16', 'serialize', 'amx-int8',
-                      'avx512vp2intersect', 'cx8', 'enqcmd', 'avx512bf16'],
-            (11, 10): ['tsxldtrk', 'amx-tile', 'amx-bf16', 'serialize', 'amx-int8'],
-            (9, 11): ['sse2', 'cx16', 'sahf', 'tbm', 'avx512ifma', 'sha',
-                      'gfni', 'fma4', 'vpclmulqdq', 'prfchw', 'bmi2', 'cldemote',
-                      'fsgsbase', 'ptwrite', 'xsavec', 'popcnt', 'mpx',
-                      'avx512bitalg', 'movdiri', 'xsaves', 'avx512er',
-                      'avx512vnni', 'avx512vpopcntdq', 'pconfig', 'clwb',
-                      'avx512f', 'clzero', 'pku', 'mmx', 'lwp', 'rdpid', 'xop',
-                      'rdseed', 'waitpkg', 'movdir64b', 'sse4a', 'avx512bw',
-                      'clflushopt', 'xsave', 'avx512vbmi2', '64bit', 'avx512vl',
-                      'invpcid', 'avx512cd', 'avx', 'vaes', 'cx8', 'fma', 'rtm',
-                      'bmi', 'enqcmd', 'rdrnd', 'mwaitx', 'sse4.1', 'sse4.2', 'avx2',
-                      'fxsr', 'wbnoinvd', 'sse', 'lzcnt', 'pclmul', 'prefetchwt1',
-                      'f16c', 'ssse3', 'sgx', 'shstk', 'cmov', 'avx512vbmi',
-                      'avx512bf16', 'movbe', 'xsaveopt', 'avx512dq', 'adx',
-                      'avx512pf', 'sse3'],
-            (9, 8): ['cx8', 'enqcmd', 'avx512bf16'],
-        }.get((server_llvm_version[0], client_llvm_version[0]), None)
-        if remove_features is None:
-            warnings.warn(
-                f'{type(self).__name__}._get_host_cpu_features: `remove_features` dictionary'
-                ' requires an update: detected different LLVM versions in server '
-                f'{server_llvm_version} and client {client_llvm_version}.'
-                f' CPU features: {features}.')
-        else:
-            features += ','
-            for f in remove_features:
-                features = features.replace('+' + f + ',', '').replace('-' + f + ',', '')
-            features.rstrip(',')
-        return features
-
-    def _customize_tm_options(self, options):
-        super()._customize_tm_options(options)
-        # fix reloc_model as the base method sets it using local target
-        target_info = TargetInfo()
-        if target_info.arch.startswith('x86'):
-            reloc_model = 'static'
-        else:
-            reloc_model = 'default'
-        options['reloc'] = reloc_model
-
-    def set_env(self, env_name, env):
-        return None
-
-
-class JITRemoteTypingContext(typing.Context):
-    def load_additional_registries(self):
-        self.install_registry(typing.templates.builtin_registry)
-        super().load_additional_registries()
-
-
-class JITRemoteTargetContext(cpu.CPUContext):
-
-    @compiler_lock.global_compiler_lock
-    def init(self):
-        target_info = TargetInfo()
-        self.address_size = target_info.bits
-        self.is32bit = (self.address_size == 32)
-        self._internal_codegen = JITRemoteCodegen("numba.exec")
-
-    def load_additional_registries(self):
-        self.install_registry(imputils.builtin_registry)
-        super().load_additional_registries()
-
-    def get_executable(self, library, fndesc, env):
-        return None
-
-    def post_lowering(self, mod, library):
-        pass
-
-
-# ---------------------------------------------------------------------------
-# Code generation methods
-
-
-@contextmanager
-def replace_numba_internals_hack():
-    # Hackish solution to prevent numba from calling _ensure_finalize. See issue #87
-    _internal_codegen_bkp = registry.cpu_target.target_context._internal_codegen
-    registry.cpu_target.target_context._internal_codegen = JITRemoteCodegen("numba.exec")
-    yield
-    registry.cpu_target.target_context._internal_codegen = _internal_codegen_bkp
-
-
 def make_wrapper(fname, atypes, rtype, cres, target: TargetInfo, verbose=False):
     """Make wrapper function to numba compile result.
 
@@ -268,7 +162,7 @@ def make_wrapper(fname, atypes, rtype, cres, target: TargetInfo, verbose=False):
 
 
 def compile_instance(func, sig,
-                     target: TargetInfo,
+                     target_info: TargetInfo,
                      typing_context,
                      target_context,
                      pipeline_class,
@@ -309,7 +203,7 @@ def compile_instance(func, sig,
     result = get_called_functions(cres.library, cres.fndesc.llvm_func_name)
 
     for f in result['declarations']:
-        if target.supports(f):
+        if target_info.supports(f):
             continue
         warnings.warn(f'Skipping {fname} that uses undefined function `{f}`')
         return
@@ -317,18 +211,18 @@ def compile_instance(func, sig,
     nvvmlib = libfuncs.Library.get('nvvm')
     llvmlib = libfuncs.Library.get('llvm')
     for f in result['intrinsics']:
-        if target.is_gpu:
+        if target_info.is_gpu:
             if f in nvvmlib:
                 continue
 
-        if target.is_cpu:
+        if target_info.is_cpu:
             if f in llvmlib:
                 continue
 
         warnings.warn(f'Skipping {fname} that uses unsupported intrinsic `{f}`')
         return
 
-    make_wrapper(fname, args, return_type, cres, target, verbose=debug)
+    make_wrapper(fname, args, return_type, cres, target_info, verbose=debug)
 
     main_module = main_library._final_module
     for lib in result['libraries']:
@@ -373,83 +267,100 @@ def compile_to_LLVM(functions_and_signatures,
       LLVM module instance. To get the IR string, use `str(module)`.
 
     """
-    target_desc = registry.cpu_target
-
-    typing_context = JITRemoteTypingContext()
-    target_context = JITRemoteTargetContext(typing_context)
+    # avoid circular import error
+    # * remotejit imports irtools
+    # * irtools imports heavydb
+    # * heavydb imports remotejit
+    from rbc.heavydb import JITRemoteTypingContext, JITRemoteTargetContext, \
+        heavydb_cpu_target, heavydb_gpu_target
+
+    device = target_info.name
+    software = target_info.software[0]
+
+    if software == 'HeavyDB':
+        target_name = f'heavydb_{device}'
+        target_desc = heavydb_cpu_target if device == 'cpu' else heavydb_gpu_target
+        typing_context = JITRemoteTypingContext()
+        target_context = JITRemoteTargetContext(typing_context, target_name)
+    else:
+        target_name = 'cpu'
+        target_desc = registry.cpu_target
+        typing_context = typing.Context()
+        target_context = cpu.CPUContext(typing_context, target_name)
 
     # Bring over Array overloads (a hack):
     target_context._defns = target_desc.target_context._defns
 
-    with replace_numba_internals_hack():
-        codegen = target_context.codegen()
-        main_library = codegen.create_library('rbc.irtools.compile_to_IR')
-        main_module = main_library._final_module
-
-        if user_defined_llvm_ir is not None:
-            if isinstance(user_defined_llvm_ir, str):
-                user_defined_llvm_ir = llvm.parse_assembly(user_defined_llvm_ir)
-            assert isinstance(user_defined_llvm_ir, llvm.ModuleRef)
-            main_module.link_in(user_defined_llvm_ir, preserve=True)
-
-        succesful_fids = []
-        function_names = []
-        for func, signatures in functions_and_signatures:
-            for fid, sig in signatures.items():
-                fname = compile_instance(func, sig, target_info, typing_context,
-                                         target_context, pipeline_class,
-                                         main_library,
-                                         debug=debug)
-                if fname is not None:
-                    succesful_fids.append(fid)
-                    function_names.append(fname)
-
-        add_metadata_flag(main_library,
-                          pass_column_arguments_by_value=0,
-                          manage_memory_buffer=1)
-        main_library._optimize_final_module()
+    codegen = target_context.codegen()
+    main_library = codegen.create_library(f'rbc.irtools.compile_to_IR_{software}_{device}')
+    main_module = main_library._final_module
 
-        # Remove unused defined functions and declarations
-        used_symbols = defaultdict(set)
-        for fname in function_names:
-            for k, v in get_called_functions(main_library, fname).items():
-                used_symbols[k].update(v)
+    if user_defined_llvm_ir is not None:
+        if isinstance(user_defined_llvm_ir, str):
+            user_defined_llvm_ir = llvm.parse_assembly(user_defined_llvm_ir)
+        assert isinstance(user_defined_llvm_ir, llvm.ModuleRef)
+        main_module.link_in(user_defined_llvm_ir, preserve=True)
+
+    succesful_fids = []
+    function_names = []
+    for func, signatures in functions_and_signatures:
+        for fid, sig in signatures.items():
+            with switch_target(target_name):
+                with target_extension.target_override(target_name):
+                    fname = compile_instance(func, sig, target_info, typing_context,
+                                             target_context, pipeline_class,
+                                             main_library,
+                                             debug=debug)
+                    if fname is not None:
+                        succesful_fids.append(fid)
+                        function_names.append(fname)
+
+    add_metadata_flag(main_library,
+                      pass_column_arguments_by_value=0,
+                      manage_memory_buffer=1)
+    main_library._optimize_final_module()
+
+    # Remove unused defined functions and declarations
+    used_symbols = defaultdict(set)
+    for fname in function_names:
+        for k, v in get_called_functions(main_library, fname).items():
+            used_symbols[k].update(v)
+
+    all_symbols = get_called_functions(main_library)
+
+    unused_symbols = defaultdict(set)
+    for k, lst in all_symbols.items():
+        if k == 'libraries':
+            continue
+        for fn in lst:
+            if fn not in used_symbols[k]:
+                unused_symbols[k].add(fn)
+
+    changed = False
+    for f in main_module.functions:
+        fn = f.name
+        if fn.startswith('llvm.'):
+            if f.name in unused_symbols['intrinsics']:
+                f.linkage = llvm.Linkage.external
+                changed = True
+        elif f.is_declaration:
+            if f.name in unused_symbols['declarations']:
+                f.linkage = llvm.Linkage.external
+                changed = True
+        else:
+            if f.name in unused_symbols['defined']:
+                f.linkage = llvm.Linkage.private
+                changed = True
 
-        all_symbols = get_called_functions(main_library)
+    # TODO: determine unused global_variables and struct_types
 
-        unused_symbols = defaultdict(set)
-        for k, lst in all_symbols.items():
-            if k == 'libraries':
-                continue
-            for fn in lst:
-                if fn not in used_symbols[k]:
-                    unused_symbols[k].add(fn)
-
-        changed = False
-        for f in main_module.functions:
-            fn = f.name
-            if fn.startswith('llvm.'):
-                if f.name in unused_symbols['intrinsics']:
-                    f.linkage = llvm.Linkage.external
-                    changed = True
-            elif f.is_declaration:
-                if f.name in unused_symbols['declarations']:
-                    f.linkage = llvm.Linkage.external
-                    changed = True
-            else:
-                if f.name in unused_symbols['defined']:
-                    f.linkage = llvm.Linkage.private
-                    changed = True
-
-        # TODO: determine unused global_variables and struct_types
-
-        if changed:
-            main_library._optimize_final_module()
-
-        main_module.verify()
-        main_library._finalized = True
-        main_module.triple = target_info.triple
-        main_module.data_layout = target_info.datalayout
+    if changed:
+        main_library._optimize_final_module()
+
+    main_module.verify()
+    main_library._finalized = True
+    main_module.triple = target_info.triple
+    main_module.data_layout = target_info.datalayout
 
     return main_module, succesful_fids
 
diff --git a/rbc/remotejit.py b/rbc/remotejit.py
index ee76aec5..4bf08816 100644
--- a/rbc/remotejit.py
+++ b/rbc/remotejit.py
@@ -949,6 +949,7 @@ def targets(self) -> dict:
         target_info = TargetInfo.host()
         target_info.set('has_numba', True)
         target_info.set('has_cpython', True)
+        target_info.set('software', 'remotejit')
         return dict(cpu=target_info.tojson())
 
     @dispatchermethod
diff --git a/rbc/tests/heavydb/test_column_basic.py b/rbc/tests/heavydb/test_column_basic.py
index ddb292ff..299a8a91 100644
--- a/rbc/tests/heavydb/test_column_basic.py
+++ b/rbc/tests/heavydb/test_column_basic.py
@@ -3,6 +3,8 @@
 from collections import defaultdict
 import pytest
 import numpy as np
+import math
+from numba import njit
 
 
 rbc_heavydb = pytest.importorskip('rbc.heavydb')
@@ -822,6 +824,26 @@ def convolve(x, kernel, m, y):
     assert list(result) == expected
 
 
+def test_issue343(heavydb):
+    # Before generating llvm code, the irtools entry point needs
+    # to switch the target context from CPU to GPU, so that functions
+    # are bind to the correct target. In the case below, math.exp
+    # is bind to '@llvm.exp.f64' on CPU and '@__nv_exp' on GPU.
+    if not heavydb.has_cuda:
+        pytest.skip('test requires heavydb build with GPU support')
+
+    @njit
+    def bar(x):
+        return math.exp(x)
+
+    @heavydb('double(double)', devices=['cpu', 'gpu'])
+    def foo(x):
+        return math.exp(x) + bar(x)
+
+    assert '__nv_exp' in str(foo)
+    assert 'llvm.exp.f64' in str(foo)
+
+
 def test_column_dtype(heavydb):
     from numba import types
     table = heavydb.table_name
diff --git a/rbc/tests/heavydb/test_heavydb.py b/rbc/tests/heavydb/test_heavydb.py
index fda80c16..74b10918 100644
--- a/rbc/tests/heavydb/test_heavydb.py
+++ b/rbc/tests/heavydb/test_heavydb.py
@@ -359,7 +359,7 @@ def test_casting(heavydb):
     The following table defines the behavior of applying these UDFs to
     values with different types:
 
-    OmnisciDB version 5.9+
+    HeavyDB version 5.9+
     ----------------------
              | Functions applied to <itype value>
     itype    | i8   | i16  | i32  | i64  | f32  | f64  |
@@ -371,7 +371,7 @@ def test_casting(heavydb):
     float    | FAIL | FAIL | FAIL | FAIL | OK   | OK   |
     double   | FAIL | FAIL | FAIL | FAIL | FAIL | OK   |
 
-    OmnisciDB version 5.8
+    HeavyDB version 5.8
     ----------------------
              | Functions applied to <itype value>
     itype    | i8   | i16  | i32  | i64  | f32  | f64  |
@@ -383,7 +383,7 @@ def test_casting(heavydb):
     float    | FAIL | FAIL | FAIL | FAIL | OK   | OK   |
     double   | FAIL | FAIL | FAIL | FAIL | FAIL | OK   |
 
-    OmnisciDB version 5.7 and older
+    HeavyDB version 5.7 and older
     -------------------------------
              | Functions applied to <itype value>
     itype    | i8   | i16  | i32  | i64  | f32  | f64  |
@@ -838,7 +838,7 @@ def test_reconnect(heavydb):
 
 
 def test_non_admin_user(heavydb):
-    heavydb.require_version((5, 9), 'Requires omniscidb 5.9 or newer')
+    heavydb.require_version((5, 9), 'Requires HeavyDB 5.9 or newer')
 
     user = 'rbc_test_non_admin_user'
     password = 'Xy2kq_3lM'
diff --git a/rbc/tests/heavydb/test_math.py b/rbc/tests/heavydb/test_math.py
index 019c1abb..78ea68a3 100644
--- a/rbc/tests/heavydb/test_math.py
+++ b/rbc/tests/heavydb/test_math.py
@@ -53,7 +53,7 @@ def heavydb():
 
 math_functions = [
     # Number-theoretic and representation functions
-    ('ceil', 'int64(double)'),
+    ('ceil', 'double(double)'),
     ('comb', 'int64(int64, int64)'),
     ('copysign', 'double(double, double)'),
     ('fabs', 'double(double)'),
diff --git a/rbc/tests/test_externals_libdevice.py b/rbc/tests/test_externals_libdevice.py
index 223c6047..71b78459 100644
--- a/rbc/tests/test_externals_libdevice.py
+++ b/rbc/tests/test_externals_libdevice.py
@@ -15,6 +15,9 @@
     funcs.append((fname, str(retty), argtys, has_ptr_arg))
 
 
+fns = {}
+
+
 @pytest.fixture(scope="module")
 def heavydb():
 
@@ -50,6 +53,7 @@ def fn(a, b, c):
 
         fn.__name__ = f"{heavydb.table_name}_{fname[5:]}"
         fn = heavydb(f"{retty}({', '.join(argtypes)})", devices=["gpu"])(fn)
+        fns[fname] = fn
 
     for fname, retty, argtys, has_ptr_arg in funcs:
         if has_ptr_arg:
@@ -84,4 +88,7 @@ def test_externals_libdevice(heavydb, fname, retty, argtys, has_ptr_arg):
         cols = ", ".join(tuple(map(lambda x: cols_dict[x], argtys)))
         query = f"SELECT {func_name}({cols}) FROM {table}"
 
-    _, _ = heavydb.sql_execute(query)
+    _, result = heavydb.sql_execute(query)
+
+    assert fname in str(fns[fname])
+    # to-do: check results
diff --git a/utils/client_ssh_tunnel.conf b/utils/client_ssh_tunnel.conf
index ab32578b..52224310 100644
--- a/utils/client_ssh_tunnel.conf
+++ b/utils/client_ssh_tunnel.conf
@@ -6,7 +6,7 @@
 #   1. Run omnscidb server with ssh port forwarding::
 #
 #      ssh -L 6274:127.0.0.1:16274  <remote host name or IP>
-#      bin/omnisci_server --enable-runtime-udf --enable-table-functions -p 16274 --http-port 16278 --calcite-port 16279
+#      bin/omnisci_server --enable-dev-table-functions --enable-runtime-udf --enable-table-functions -p 16274 --http-port 16278 --calcite-port 16279
 #
 #   2. Relate the omniscidb server to client:
 #

From d6bc48b43209bd42221e073bd296958e18c3cde6 Mon Sep 17 00:00:00 2001
From: Guilherme Leobas <guilhermeleobas@gmail.com>
Date: Mon, 7 Nov 2022 22:41:43 -0300
Subject: [PATCH 2/3] [wip] address some of the reviewer comments

---
 rbc/heavydb/buffer.py                  |   3 +-
 rbc/heavydb/extending.py               |   8 +
 rbc/heavydb/heavydb_compiler.py        |  60 ++++-
 rbc/heavydb/mathimpl.py                | 359 ++++++++++++++-----------
 rbc/heavydb/remoteheavydb.py           |   2 +-
 rbc/tests/__init__.py                  |  31 ++-
 rbc/tests/heavydb/test_array_math.py   |   1 +
 rbc/tests/heavydb/test_column_basic.py |  18 +-
 rbc/tests/heavydb/test_heavydb.py      |  26 +-
 rbc/tests/heavydb/test_math.py         |  56 +---
 utils/client_ssh_tunnel.conf           |   8 +-
 11 files changed, 323 insertions(+), 249 deletions(-)
 create mode 100644 rbc/heavydb/extending.py

diff --git a/rbc/heavydb/buffer.py b/rbc/heavydb/buffer.py
index 40ab747d..bb212b3f 100644
--- a/rbc/heavydb/buffer.py
+++ b/rbc/heavydb/buffer.py
@@ -29,7 +29,8 @@
 import numpy as np
 from rbc import typesystem
 from rbc.targetinfo import TargetInfo
-from numba.core import datamodel, cgutils, extending, types, imputils
+from rbc.heavydb import extending
+from numba.core import datamodel, cgutils, types, imputils
 
 int8_t = ir.IntType(8)
 int32_t = ir.IntType(32)
diff --git a/rbc/heavydb/extending.py b/rbc/heavydb/extending.py
new file mode 100644
index 00000000..7a97b8a0
--- /dev/null
+++ b/rbc/heavydb/extending.py
@@ -0,0 +1,8 @@
+from functools import partial
+from numba.core import extending
+from numba.core.extending import lower_builtin
+
+overload = partial(extending.overload, target='generic')
+overload_method = partial(extending.overload_method, target='generic')
+overload_attribute = partial(extending.overload_attribute, target='generic')
+intrinsic = partial(extending.intrinsic, target='generic')
diff --git a/rbc/heavydb/heavydb_compiler.py b/rbc/heavydb/heavydb_compiler.py
index 3611d3ef..3412edfb 100644
--- a/rbc/heavydb/heavydb_compiler.py
+++ b/rbc/heavydb/heavydb_compiler.py
@@ -1,8 +1,10 @@
+import warnings
+from functools import partial
 from contextlib import contextmanager
 import llvmlite.binding as llvm
 from rbc.targetinfo import TargetInfo
 from numba.np import ufunc_db
-from numba import _dynfunc
+from numba import _dynfunc, njit
 from numba.core import (
     codegen, compiler_lock, typing,
     base, cpu, utils, descriptors,
@@ -12,6 +14,7 @@
     Generic,
     target_registry,
     dispatcher_registry,
+    jit_registry,
 )
 
 
@@ -28,6 +31,16 @@ class HeavyDB_GPU(Generic):
 target_registry['heavydb_cpu'] = HeavyDB_CPU
 target_registry['heavydb_gpu'] = HeavyDB_GPU
 
+
+def custom_jit(*args, target=None, **kwargs):
+    assert 'target' not in kwargs
+    assert '_target' not in kwargs
+    return njit(*args, _target=target, **kwargs)
+
+
+jit_registry[target_registry['heavydb_cpu']] = partial(custom_jit, target='heavydb_cpu')
+jit_registry[target_registry['heavydb_gpu']] = partial(custom_jit, target='heavydb_gpu')
+
 heavydb_cpu_registry = imputils.Registry(name='heavydb_cpu_registry')
 heavydb_gpu_registry = imputils.Registry(name='heavydb_gpu_registry')
 
@@ -177,16 +190,40 @@ def _get_host_cpu_features(self):
 
         # See https://github.com/xnd-project/rbc/issues/45
         remove_features = {
+            (12, 12): [], (11, 11): [], (10, 10): [], (9, 9): [], (8, 8): [],
             (11, 8): ['tsxldtrk', 'amx-tile', 'amx-bf16', 'serialize', 'amx-int8',
                       'avx512vp2intersect', 'tsxldtrk', 'amx-tile', 'amx-bf16',
                       'serialize', 'amx-int8', 'avx512vp2intersect', 'tsxldtrk',
                       'amx-tile', 'amx-bf16', 'serialize', 'amx-int8',
                       'avx512vp2intersect', 'cx8', 'enqcmd', 'avx512bf16'],
             (11, 10): ['tsxldtrk', 'amx-tile', 'amx-bf16', 'serialize', 'amx-int8'],
+            (9, 11): ['sse2', 'cx16', 'sahf', 'tbm', 'avx512ifma', 'sha',
+                      'gfni', 'fma4', 'vpclmulqdq', 'prfchw', 'bmi2', 'cldemote',
+                      'fsgsbase', 'ptwrite', 'xsavec', 'popcnt', 'mpx',
+                      'avx512bitalg', 'movdiri', 'xsaves', 'avx512er',
+                      'avx512vnni', 'avx512vpopcntdq', 'pconfig', 'clwb',
+                      'avx512f', 'clzero', 'pku', 'mmx', 'lwp', 'rdpid', 'xop',
+                      'rdseed', 'waitpkg', 'movdir64b', 'sse4a', 'avx512bw',
+                      'clflushopt', 'xsave', 'avx512vbmi2', '64bit', 'avx512vl',
+                      'invpcid', 'avx512cd', 'avx', 'vaes', 'cx8', 'fma', 'rtm',
+                      'bmi', 'enqcmd', 'rdrnd', 'mwaitx', 'sse4.1', 'sse4.2', 'avx2',
+                      'fxsr', 'wbnoinvd', 'sse', 'lzcnt', 'pclmul', 'prefetchwt1',
+                      'f16c', 'ssse3', 'sgx', 'shstk', 'cmov', 'avx512vbmi',
+                      'avx512bf16', 'movbe', 'xsaveopt', 'avx512dq', 'adx',
+                      'avx512pf', 'sse3'],
             (9, 8): ['cx8', 'enqcmd', 'avx512bf16'],
         }.get((server_llvm_version[0], client_llvm_version[0]), [])
-        for f in remove_features:
-            features = features.replace('+' + f, '').replace('-' + f, '')
+        if remove_features is None:
+            warnings.warn(
+                f'{type(self).__name__}._get_host_cpu_features: `remove_features` dictionary'
+                ' requires an update: detected different LLVM versions in server '
+                f'{server_llvm_version} and client {client_llvm_version}.'
+                f' CPU features: {features}.')
+        else:
+            features += ','
+            for f in remove_features:
+                features = features.replace('+' + f + ',', '').replace('-' + f + ',', '')
+            features.rstrip(',')
         return features
 
     def _customize_tm_options(self, options):
@@ -207,15 +244,15 @@ class JITRemoteTypingContext(typing.Context):
     """JITRemote Typing Context
     """
 
-    def load_additional_registries(self):
-        from . import mathimpl
-        self.install_registry(mathimpl.registry)
-        return super().load_additional_registries()
+    # def load_additional_registries(self):
+    #     from . import mathimpl
+    #     self.install_registry(mathimpl.registry)
+    #     return super().load_additional_registries()
 
 
 class JITRemoteTargetContext(base.BaseContext):
     # Whether dynamic globals (CPU runtime addresses) is allowed
-    allow_dynamic_globals = True
+    allow_dynamic_globals = True  # should this be False?
 
     def __init__(self, typing_context, target):
         if target not in ('heavydb_cpu', 'heavydb_gpu'):
@@ -258,11 +295,12 @@ def load_additional_registries(self):
 
         # uncomment as needed!
         # from numba.core import optional
-        from numba.np import linalg, polynomial, arraymath, arrayobj  # noqa: F401
+        from numba.np import linalg, polynomial
         # from numba.typed import typeddict, dictimpl
         # from numba.typed import typedlist, listobject
         # from numba.experimental import jitclass, function_type
         # from numba.np import npdatetime
+        from numba.np import arraymath, arrayobj  # noqa: F401
 
         # Add target specific implementations
         from numba.np import npyimpl
@@ -273,7 +311,7 @@ def load_additional_registries(self):
         #     jitclassimpl
         # self.install_registry(cmathimpl.registry)
         # self.install_registry(cffiimpl.registry)
-        self.install_registry(mathimpl.registry)
+        # self.install_registry(mathimpl.registry)
         self.install_registry(npyimpl.registry)
         # self.install_registry(printimpl.registry)
         # self.install_registry(randomimpl.registry)
@@ -334,7 +372,7 @@ def get_executable(self, library, fndesc, env):
         doc = "compiled wrapper for %r" % (fndesc.qualname,)
         cfunc = _dynfunc.make_function(
             fndesc.lookup_module(),
-            fndesc.qualname.split(".")[-1],
+            fndesc.qualname.rsplit(".", 1)[-1],
             doc,
             fnptr,
             env,
diff --git a/rbc/heavydb/mathimpl.py b/rbc/heavydb/mathimpl.py
index 3155179f..47841ca1 100644
--- a/rbc/heavydb/mathimpl.py
+++ b/rbc/heavydb/mathimpl.py
@@ -1,162 +1,203 @@
-import math
-from rbc.externals import gen_codegen
-from numba.core.typing.templates import ConcreteTemplate, signature, Registry
-from numba.types import float32, float64, int32, int64, uint64, intp
-from numba.core.intrinsics import INTR_TO_CMATH
-from .heavydb_compiler import heavydb_cpu_registry, heavydb_gpu_registry
-
-
-lower_cpu = heavydb_cpu_registry.lower
-lower_gpu = heavydb_gpu_registry.lower
-
-
-registry = Registry()
-infer_global = registry.register_global
-
-
-# Adding missing cases in Numba
-@infer_global(math.log2)  # noqa: E302
-class Math_unary(ConcreteTemplate):
-    cases = [
-        signature(float64, int64),
-        signature(float64, uint64),
-        signature(float32, float32),
-        signature(float64, float64),
-    ]
-
-
-@infer_global(math.remainder)
-class Math_remainder(ConcreteTemplate):
-    cases = [
-        signature(float32, float32, float32),
-        signature(float64, float64, float64),
-    ]
-
-
-@infer_global(math.floor)
-@infer_global(math.trunc)
-@infer_global(math.ceil)
-class Math_converter(ConcreteTemplate):
-    cases = [
-        signature(intp, intp),
-        signature(int64, int64),
-        signature(uint64, uint64),
-        signature(float32, float32),
-        signature(float64, float64),
-    ]
-
-
-booleans = []
-booleans += [("isnand", "isnanf", math.isnan)]
-booleans += [("isinfd", "isinff", math.isinf)]
-booleans += [("isfinited", "finitef", math.isfinite)]
-
-unarys = []
-unarys += [("ceil", "ceilf", math.ceil)]
-unarys += [("floor", "floorf", math.floor)]
-unarys += [("fabs", "fabsf", math.fabs)]
-unarys += [("exp", "expf", math.exp)]
-unarys += [("expm1", "expm1f", math.expm1)]
-unarys += [("erf", "erff", math.erf)]
-unarys += [("erfc", "erfcf", math.erfc)]
-unarys += [("tgamma", "tgammaf", math.gamma)]
-unarys += [("lgamma", "lgammaf", math.lgamma)]
-unarys += [("sqrt", "sqrtf", math.sqrt)]
-unarys += [("log", "logf", math.log)]
-unarys += [("log2", "log2f", math.log2)]
-unarys += [("log10", "log10f", math.log10)]
-unarys += [("log1p", "log1pf", math.log1p)]
-unarys += [("acosh", "acoshf", math.acosh)]
-unarys += [("acos", "acosf", math.acos)]
-unarys += [("cos", "cosf", math.cos)]
-unarys += [("cosh", "coshf", math.cosh)]
-unarys += [("asinh", "asinhf", math.asinh)]
-unarys += [("asin", "asinf", math.asin)]
-unarys += [("sin", "sinf", math.sin)]
-unarys += [("sinh", "sinhf", math.sinh)]
-unarys += [("atan", "atanf", math.atan)]
-unarys += [("atanh", "atanhf", math.atanh)]
-unarys += [("tan", "tanf", math.tan)]
-unarys += [("tanh", "tanhf", math.tanh)]
-unarys += [("trunc", "truncf", math.trunc)]
-
-binarys = []
-binarys += [("copysign", "copysignf", math.copysign)]
-binarys += [("atan2", "atan2f", math.atan2)]
-binarys += [("fmod", "fmodf", math.fmod)]
-binarys += [("hypot", "hypotf", math.hypot)]
-binarys += [("remainder", "remainderf", math.remainder)]
-
-
-def impl_unary(fname, key, typ):
-    if fname in INTR_TO_CMATH.values():
-        # use llvm intrinsics when possible
-        cpu = gen_codegen(f'llvm.{fname}')
-    else:
-        cpu = gen_codegen(fname)
-    gpu = gen_codegen(f"__nv_{fname}")
-    lower_cpu(key, typ)(cpu)
-    lower_gpu(key, typ)(gpu)
-
-
-def impl_binary(fname, key, typ):
-    if fname in INTR_TO_CMATH.values():
-        # use llvm intrinsics when possible
-        cpu = gen_codegen(f'llvm.{fname}')
-    else:
-        cpu = gen_codegen(fname)
-    gpu = gen_codegen(f"__nv_{fname}")
-    lower_cpu(key, typ, typ)(cpu)
-    lower_gpu(key, typ, typ)(gpu)
-
-
-for fname64, fname32, key in unarys:
-    impl_unary(fname64, key, float64)
-    impl_unary(fname32, key, float32)
-
-
-for fname64, fname32, key in binarys:
-    impl_binary(fname64, key, float64)
-    impl_binary(fname32, key, float32)
-
-
-# manual mapping
-def impl_ldexp():
-    # cpu
-    ldexp_cpu = gen_codegen('ldexp')
-    ldexpf_cpu = gen_codegen('ldexpf')
-    lower_cpu(math.ldexp, float64, int32)(ldexp_cpu)
-    lower_cpu(math.ldexp, float32, int32)(ldexpf_cpu)
-
-    # gpu
-    ldexp_gpu = gen_codegen('__nv_ldexp')
-    ldexpf_gpu = gen_codegen('__nv_ldexpf')
-    lower_gpu(math.ldexp, float64, int32)(ldexp_gpu)
-    lower_gpu(math.ldexp, float32, int32)(ldexpf_gpu)
-
-
-def impl_pow():
-    # cpu
-    pow_cpu = gen_codegen('pow')
-    powf_cpu = gen_codegen('powf')
-    lower_cpu(math.pow, float64, float64)(pow_cpu)
-    lower_cpu(math.pow, float32, float32)(powf_cpu)
-    lower_cpu(math.pow, float64, int32)(pow_cpu)
-    lower_cpu(math.pow, float32, int32)(powf_cpu)
-
-    # gpu
-    pow_gpu = gen_codegen('__nv_pow')
-    powf_gpu = gen_codegen('__nv_powf')
-    powi_gpu = gen_codegen('__nv_powi')
-    powif_gpu = gen_codegen('__nv_powif')
-    lower_gpu(math.pow, float64, float64)(pow_gpu)
-    lower_gpu(math.pow, float32, float32)(powf_gpu)
-    lower_gpu(math.pow, float64, int32)(powi_gpu)
-    lower_gpu(math.pow, float32, int32)(powif_gpu)
-
-
-impl_ldexp()
-impl_pow()
+# import warnings
+# import math
+# from rbc.externals import gen_codegen
+# from numba.core.typing.templates import ConcreteTemplate, signature, Registry
+# from numba.types import float32, float64, int32, int64, uint64, intp
+# from numba.core.intrinsics import INTR_TO_CMATH
+# from .heavydb_compiler import heavydb_cpu_registry, heavydb_gpu_registry
+
+
+# lower_cpu = heavydb_cpu_registry.lower
+# lower_gpu = heavydb_gpu_registry.lower
+
+
+# registry = Registry()
+# infer_global = registry.register_global
+
+
+# # Adding missing cases in Numba
+# @infer_global(math.log2)  # noqa: E302
+# class Math_unary(ConcreteTemplate):
+#     cases = [
+#         signature(float64, int64),
+#         signature(float64, uint64),
+#         signature(float32, float32),
+#         signature(float64, float64),
+#     ]
+
+
+# @infer_global(math.remainder)
+# class Math_remainder(ConcreteTemplate):
+#     cases = [
+#         signature(float32, float32, float32),
+#         signature(float64, float64, float64),
+#     ]
+
+
+# @infer_global(math.floor)
+# @infer_global(math.trunc)
+# @infer_global(math.ceil)
+# class Math_converter(ConcreteTemplate):
+#     cases = [
+#         signature(intp, intp),
+#         signature(int64, int64),
+#         signature(uint64, uint64),
+#         signature(float32, float32),
+#         signature(float64, float64),
+#     ]
+
+
+# booleans = []
+# booleans += [("isnand", "isnanf", math.isnan)]
+# booleans += [("isinfd", "isinff", math.isinf)]
+# booleans += [("isfinited", "finitef", math.isfinite)]
+
+# unarys = []
+# unarys += [("ceil", "ceilf", math.ceil)]
+# unarys += [("floor", "floorf", math.floor)]
+# unarys += [("fabs", "fabsf", math.fabs)]
+# unarys += [("exp", "expf", math.exp)]
+# unarys += [("expm1", "expm1f", math.expm1)]
+# unarys += [("erf", "erff", math.erf)]
+# unarys += [("erfc", "erfcf", math.erfc)]
+# unarys += [("tgamma", "tgammaf", math.gamma)]
+# unarys += [("lgamma", "lgammaf", math.lgamma)]
+# unarys += [("sqrt", "sqrtf", math.sqrt)]
+# unarys += [("log", "logf", math.log)]
+# unarys += [("log2", "log2f", math.log2)]
+# unarys += [("log10", "log10f", math.log10)]
+# unarys += [("log1p", "log1pf", math.log1p)]
+# unarys += [("acosh", "acoshf", math.acosh)]
+# unarys += [("acos", "acosf", math.acos)]
+# unarys += [("cos", "cosf", math.cos)]
+# unarys += [("cosh", "coshf", math.cosh)]
+# unarys += [("asinh", "asinhf", math.asinh)]
+# unarys += [("asin", "asinf", math.asin)]
+# unarys += [("sin", "sinf", math.sin)]
+# unarys += [("sinh", "sinhf", math.sinh)]
+# unarys += [("atan", "atanf", math.atan)]
+# unarys += [("atanh", "atanhf", math.atanh)]
+# unarys += [("tan", "tanf", math.tan)]
+# unarys += [("tanh", "tanhf", math.tanh)]
+# unarys += [("trunc", "truncf", math.trunc)]
+
+# binarys = []
+# binarys += [("copysign", "copysignf", math.copysign)]
+# binarys += [("atan2", "atan2f", math.atan2)]
+# binarys += [("fmod", "fmodf", math.fmod)]
+# binarys += [("hypot", "hypotf", math.hypot)]
+# binarys += [("remainder", "remainderf", math.remainder)]
+
+
+# rbc_INTR_TO_CMATH = {
+#     "powf": "llvm.pow.f32",
+#     "pow": "llvm.pow.f64",
+
+#     "sinf": "llvm.sin.f32",
+#     "sin": "llvm.sin.f64",
+
+#     "cosf": "llvm.cos.f32",
+#     "cos": "llvm.cos.f64",
+
+#     "sqrtf": "llvm.sqrt.f32",
+#     "sqrt": "llvm.sqrt.f64",
+
+#     "expf": "llvm.exp.f32",
+#     "exp": "llvm.exp.f64",
+
+#     "logf": "llvm.log.f32",
+#     "log": "llvm.log.f64",
+
+#     "log10f": "llvm.log10.f32",
+#     "log10": "llvm.log10.f64",
+
+#     "fabsf": "llvm.fabs.f32",
+#     "fabs": "llvm.fabs.f64",
+
+#     "floorf": "llvm.floor.f32",
+#     "floor": "llvm.floor.f64",
+
+#     "ceilf": "llvm.ceil.f32",
+#     "ceil": "llvm.ceil.f64",
+
+#     "truncf": "llvm.trunc.f32",
+#     "trunc": "llvm.trunc.f64",
+# }
+
+
+# if len(rbc_INTR_TO_CMATH) != len(INTR_TO_CMATH):
+#     warnings.warn("List of intrinsics is outdated! Please update!")
+
+
+# def impl_unary(fname, key, typ):
+#     if fname in rbc_INTR_TO_CMATH.keys():
+#         # use llvm intrinsics when possible
+#         cpu = gen_codegen(rbc_INTR_TO_CMATH.get(fname))
+#     else:
+#         cpu = gen_codegen(fname)
+#     gpu = gen_codegen(f"__nv_{fname}")
+#     lower_cpu(key, typ)(cpu)
+#     lower_gpu(key, typ)(gpu)
+
+
+# def impl_binary(fname, key, typ):
+#     if fname in rbc_INTR_TO_CMATH.keys():
+#         # use llvm intrinsics when possible
+#         cpu = gen_codegen(rbc_INTR_TO_CMATH.get(fname))
+#     else:
+#         cpu = gen_codegen(fname)
+#     gpu = gen_codegen(f"__nv_{fname}")
+#     lower_cpu(key, typ, typ)(cpu)
+#     lower_gpu(key, typ, typ)(gpu)
+
+
+# for fname64, fname32, key in unarys:
+#     impl_unary(fname64, key, float64)
+#     impl_unary(fname32, key, float32)
+
+
+# for fname64, fname32, key in binarys:
+#     impl_binary(fname64, key, float64)
+#     impl_binary(fname32, key, float32)
+
+
+# # manual mapping
+# def impl_ldexp():
+#     # cpu
+#     ldexp_cpu = gen_codegen('ldexp')
+#     ldexpf_cpu = gen_codegen('ldexpf')
+#     lower_cpu(math.ldexp, float64, int32)(ldexp_cpu)
+#     lower_cpu(math.ldexp, float32, int32)(ldexpf_cpu)
+
+#     # gpu
+#     ldexp_gpu = gen_codegen('__nv_ldexp')
+#     ldexpf_gpu = gen_codegen('__nv_ldexpf')
+#     lower_gpu(math.ldexp, float64, int32)(ldexp_gpu)
+#     lower_gpu(math.ldexp, float32, int32)(ldexpf_gpu)
+
+
+# def impl_pow():
+#     # cpu
+#     pow_cpu = gen_codegen('pow')
+#     powf_cpu = gen_codegen('powf')
+#     lower_cpu(math.pow, float64, float64)(pow_cpu)
+#     lower_cpu(math.pow, float32, float32)(powf_cpu)
+#     lower_cpu(math.pow, float64, int32)(pow_cpu)
+#     lower_cpu(math.pow, float32, int32)(powf_cpu)
+
+#     # gpu
+#     pow_gpu = gen_codegen('__nv_pow')
+#     powf_gpu = gen_codegen('__nv_powf')
+#     powi_gpu = gen_codegen('__nv_powi')
+#     powif_gpu = gen_codegen('__nv_powif')
+#     lower_gpu(math.pow, float64, float64)(pow_gpu)
+#     lower_gpu(math.pow, float32, float32)(powf_gpu)
+#     lower_gpu(math.pow, float64, int32)(powi_gpu)
+#     lower_gpu(math.pow, float32, int32)(powif_gpu)
+
+
+# impl_ldexp()
+# impl_pow()
 
 
 # CPU only:
diff --git a/rbc/heavydb/remoteheavydb.py b/rbc/heavydb/remoteheavydb.py
index 36c31cd5..34bec742 100644
--- a/rbc/heavydb/remoteheavydb.py
+++ b/rbc/heavydb/remoteheavydb.py
@@ -1530,5 +1530,5 @@ def remote_call(self, func, ftype: typesystem.Type, arguments: tuple, hold=False
 
 
 class RemoteOmnisci(RemoteHeavyDB):
-    """HeavyDB - the previous brand of HeavyAI
+    """Omnisci - the previous brand of HeavyAI
     """
diff --git a/rbc/tests/__init__.py b/rbc/tests/__init__.py
index bdf9615f..49f8c4e6 100644
--- a/rbc/tests/__init__.py
+++ b/rbc/tests/__init__.py
@@ -144,6 +144,30 @@ def values(self):
         }
 
 
+class _mathTestTable(_DefaultTestTable):
+
+    @classmethod
+    def suffix(cls):
+        return "math"
+
+    @property
+    def sqltypes(self):
+        return ('BOOLEAN', 'BOOLEAN', 'DOUBLE', 'DOUBLE', 'DOUBLE', 'INT',
+                'INT')
+
+    @property
+    def values(self):
+        return {
+            'a': [False, False, True, False, False],
+            'b': [False, True, False, True, False],
+            'x': [0.123 + 1/10.0, 0.123 + 2/10.0, 0.123 + 3/10.0, 0.123 + 4/10.0, 0.123 + 5/10.0],
+            'y': [1/6.0, 2/6.0, 3/6.0, 4/6.0, 5/6.0],
+            'z': [2.23, 3.23, 4.23, 5.23, 6.23],
+            'i': [1, 2, 3, 4, 5],
+            'j': [10, 20, 30, 40, 50],
+        }
+
+
 class _arraynullTestTable(_arrayTestTable):
 
     @classmethod
@@ -221,7 +245,7 @@ def values(self):
 
 
 def heavydb_fixture(caller_globals, minimal_version=(0, 0),
-                    suffices=['', '10', 'null', 'array', 'arraynull', 'text', 'timestamp'],
+                    suffices=['', '10', 'null', 'array', 'arraynull', 'text', 'timestamp', 'math'],
                     load_test_data=True, debug=False):
     """Usage from a rbc/tests/test_xyz.py file:
 
@@ -255,6 +279,8 @@ def heavydb():
                                        i1, b with row size 5, contains null
                                        values.
 
+    f'{heavydb.table_name}math' - contains scalar a, b (bool), x, y, z (double), i, j (int).
+
     f'{heavydb.table_name}text' - contains text t4, t2, t1, s, n
                                   where 't' prefix is for text encoding dict
                                   and 'n' is for text encoding none.
@@ -370,7 +396,8 @@ def require_version(version, message=None, label=None):
     # MULTIPOLYGON, See
     # https://docs.heavy.ai/sql/data-definition-ddl/datatypes-and-fixed-encoding
     for cls in (_DefaultTestTable, _10TestTable, _nullTestTable, _arrayTestTable,
-                _arraynullTestTable, _TextTestTable, _TimestampTestTable):
+                _arraynullTestTable, _TextTestTable, _TimestampTestTable,
+                _mathTestTable):
         suffix = cls.suffix()
         if suffix in suffices:
             obj = cls()
diff --git a/rbc/tests/heavydb/test_array_math.py b/rbc/tests/heavydb/test_array_math.py
index a730d76d..1f2f3a9f 100644
--- a/rbc/tests/heavydb/test_array_math.py
+++ b/rbc/tests/heavydb/test_array_math.py
@@ -114,6 +114,7 @@ def test_heavydb_array_binary_math(heavydb, method, signature, columns):
     heavydb.reset()
 
     s = f'def np_{method}(a, b): return array_api.{method}(a, b)'
+    print(s)
     exec(s, globals())
 
     heavydb(signature)(eval('np_{}'.format(method)))
diff --git a/rbc/tests/heavydb/test_column_basic.py b/rbc/tests/heavydb/test_column_basic.py
index 299a8a91..f44dced4 100644
--- a/rbc/tests/heavydb/test_column_basic.py
+++ b/rbc/tests/heavydb/test_column_basic.py
@@ -829,8 +829,8 @@ def test_issue343(heavydb):
     # to switch the target context from CPU to GPU, so that functions
     # are bind to the correct target. In the case below, math.exp
     # is bind to '@llvm.exp.f64' on CPU and '@__nv_exp' on GPU.
-    if not heavydb.has_cuda:
-        pytest.skip('test requires heavydb build with GPU support')
+    if not (heavydb.has_cuda and heavydb.has_libdevice):
+        pytest.skip('test requires heavydb build with GPU support and libdevice')
 
     @njit
     def bar(x):
@@ -892,3 +892,17 @@ def col_enumerate(x, y):
         f'select rowid, i4 from {heavydb.table_name} order by rowid;')
     for (r,), (_, e) in zip(list(result), list(expected_result)):
         assert r == e
+
+
+def test_foo(heavydb):
+    from rbc.externals.heavydb import set_output_row_size
+
+    @heavydb('int32(Column<int32>, OutputColumn<int32>)')
+    def col_enumerate(x, y):
+        sz = len(x)
+        # set_output_row_size(sz)
+        # for i, e in enumerate(x):
+        #     y[i] = e
+        return sz
+
+    heavydb.register()
diff --git a/rbc/tests/heavydb/test_heavydb.py b/rbc/tests/heavydb/test_heavydb.py
index 74b10918..76d8c091 100644
--- a/rbc/tests/heavydb/test_heavydb.py
+++ b/rbc/tests/heavydb/test_heavydb.py
@@ -359,7 +359,7 @@ def test_casting(heavydb):
     The following table defines the behavior of applying these UDFs to
     values with different types:
 
-    HeavyDB version 5.9+
+    OmnisciDB version 5.9+
     ----------------------
              | Functions applied to <itype value>
     itype    | i8   | i16  | i32  | i64  | f32  | f64  |
@@ -371,30 +371,6 @@ def test_casting(heavydb):
     float    | FAIL | FAIL | FAIL | FAIL | OK   | OK   |
     double   | FAIL | FAIL | FAIL | FAIL | FAIL | OK   |
 
-    HeavyDB version 5.8
-    ----------------------
-             | Functions applied to <itype value>
-    itype    | i8   | i16  | i32  | i64  | f32  | f64  |
-    ---------+------+------+------+------+------+------+
-    tinyint  | OK   | OK   | OK   | OK   | OK   | OK   |
-    smallint | FAIL | OK   | OK   | OK   | OK   | OK   |
-    int      | FAIL | FAIL | OK   | OK   | OK   | OK   |
-    bigint   | FAIL | FAIL | FAIL | OK   | FAIL | OK   |
-    float    | FAIL | FAIL | FAIL | FAIL | OK   | OK   |
-    double   | FAIL | FAIL | FAIL | FAIL | FAIL | OK   |
-
-    HeavyDB version 5.7 and older
-    -------------------------------
-             | Functions applied to <itype value>
-    itype    | i8   | i16  | i32  | i64  | f32  | f64  |
-    ---------+------+------+------+------+------+------+
-    tinyint  | OK   | OK   | OK   | OK   | FAIL | FAIL |
-    smallint | FAIL | OK   | OK   | OK   | FAIL | FAIL |
-    int      | FAIL | FAIL | OK   | OK   | FAIL | FAIL |
-    bigint   | FAIL | FAIL | FAIL | OK   | FAIL | FAIL |
-    float    | FAIL | FAIL | FAIL | FAIL | OK   | OK   |
-    double   | FAIL | FAIL | FAIL | FAIL | FAIL | OK   |
-
     test_binding is superior test with respect to successful UDF
     executions but it does not check exception messages.
     """
diff --git a/rbc/tests/heavydb/test_math.py b/rbc/tests/heavydb/test_math.py
index 78ea68a3..7ad019e2 100644
--- a/rbc/tests/heavydb/test_math.py
+++ b/rbc/tests/heavydb/test_math.py
@@ -4,51 +4,19 @@
 import numpy as np
 
 import rbc.heavydb as rbc_heavydb
+from rbc.heavydb.mathimpl import rbc_INTR_TO_CMATH
 from rbc.stdlib import array_api
+from rbc.tests import heavydb_fixture
 
 available_version, reason = rbc_heavydb.is_available()
 pytestmark = pytest.mark.skipif(not available_version, reason=reason)
 
 
-@pytest.fixture(scope='module')
-def nb_version():
-    from rbc.utils import get_version
-    return get_version('numba')
-
-
 @pytest.fixture(scope='module')
 def heavydb():
-    # TODO: use heavydb_fixture from rbc/tests/__init__.py
-    config = rbc_heavydb.get_client_config(debug=not True)
-    m = rbc_heavydb.RemoteHeavyDB(**config)
-    table_name = 'rbc_test_heavydb_math'
-
-    m.sql_execute(f'DROP TABLE IF EXISTS {table_name}')
-
-    m.sql_execute(
-        f'CREATE TABLE IF NOT EXISTS {table_name}'
-        ' (a BOOLEAN, b BOOLEAN, x DOUBLE, y DOUBLE, z DOUBLE, i INT, '
-        'j INT, t INT[], td DOUBLE[], te INT[]);')
-
-    for _i in range(1, 6):
-        a = str((_i % 3) == 0).lower()
-        b = str((_i % 2) == 0).lower()
-        x = 0.123 + _i/10.0
-        y = _i/6.0
-        z = _i + 1.23
-        i = _i
-        j = i * 10
-        t = 'ARRAY[%s]' % (', '.join(str(j + i) for i in range(-i, i+1)))
-        td = 'ARRAY[%s]' % (', '.join(str(j + i/1.0) for i in range(-i, i+1)))
-        te = 'Array[]'
-        m.sql_execute(
-            f'insert into {table_name} values (\'{a}\', \'{b}\', {x}, {y},'
-            f' {z}, {i}, {j}, {t}, {td}, {te})')
-
-    m.table_name = table_name
-    yield m
-
-    m.sql_execute(f'DROP TABLE IF EXISTS {table_name}')
+    for o in heavydb_fixture(globals(), debug=False,
+                             suffices=['math']):
+        yield o
 
 
 math_functions = [
@@ -83,7 +51,7 @@ def heavydb():
     ('log10', 'double(double)'),
     ('pow', 'double(double, double)'),
     ('sqrt', 'double(double)'),
-    # # Trigonometric functions
+    # Trigonometric functions
     ('acos', 'double(double)'),
     ('asin', 'double(double)'),
     ('atan', 'double(double)'),
@@ -95,14 +63,14 @@ def heavydb():
     ('tan', 'double(double)'),
     ('degrees', 'double(double)'),
     ('radians', 'double(double)'),
-    # # Hyperbolic functions
+    # Hyperbolic functions
     ('acosh', 'double(double)'),
     ('asinh', 'double(double)'),
     ('atanh', 'double(double)'),
     ('cosh', 'double(double)'),
     ('sinh', 'double(double)'),
     ('tanh', 'double(double)'),
-    # # Special functions
+    # Special functions
     ('erf', 'double(double)'),
     ('erfc', 'double(double)'),
     ('gamma', 'double(double)'),
@@ -122,7 +90,7 @@ def heavydb():
 @pytest.mark.parametrize("device", devices)
 @pytest.mark.parametrize("fn_name, signature", math_functions,
                          ids=["math." + item[0] for item in math_functions])
-def test_math_function(heavydb, device, nb_version, fn_name, signature):
+def test_math_function(heavydb, device, fn_name, signature):
     heavydb.reset()
 
     if not heavydb.has_cuda and device == 'gpu':
@@ -188,7 +156,7 @@ def test_math_function(heavydb, device, nb_version, fn_name, signature):
     if fn_name in ['ldexp']:
         xs = 'x, i'
 
-    query = f'select {xs}, {fprefix}{fn_name}({xs}) from {heavydb.table_name}'
+    query = f'select {xs}, {fprefix}{fn_name}({xs}) from {heavydb.table_name}math'
     descr, result = heavydb.sql_execute(query)
     for args in list(result):
         result = args[-1]
@@ -321,7 +289,7 @@ def test_math_function(heavydb, device, nb_version, fn_name, signature):
 @pytest.mark.parametrize("device", devices)
 @pytest.mark.parametrize("fn_name, signature, np_func", numpy_functions,
                          ids=["np." + item[0] for item in numpy_functions])
-def test_numpy_function(heavydb, device, nb_version, fn_name, signature, np_func):
+def test_numpy_function(heavydb, device, fn_name, signature, np_func):
     heavydb.reset()
 
     if not heavydb.has_cuda and device == 'gpu':
@@ -381,7 +349,7 @@ def test_numpy_function(heavydb, device, nb_version, fn_name, signature, np_func
     else:
         raise NotImplementedError(kind)
 
-    query = f'select {xs}, {fn_name}({xs}) from {heavydb.table_name}'
+    query = f'select {xs}, {fn_name}({xs}) from {heavydb.table_name}math'
     descr, result = heavydb.sql_execute(query)
     for args in list(result):
         result = args[-1]
diff --git a/utils/client_ssh_tunnel.conf b/utils/client_ssh_tunnel.conf
index 52224310..07e4dd50 100644
--- a/utils/client_ssh_tunnel.conf
+++ b/utils/client_ssh_tunnel.conf
@@ -3,14 +3,14 @@
 # ssh tunneling.
 #
 # Usage:
-#   1. Run omnscidb server with ssh port forwarding::
+#   1. Run heavydb server with ssh port forwarding::
 #
 #      ssh -L 6274:127.0.0.1:16274  <remote host name or IP>
-#      bin/omnisci_server --enable-dev-table-functions --enable-runtime-udf --enable-table-functions -p 16274 --http-port 16278 --calcite-port 16279
+#      bin/heavydb --enable-dev-table-functions --enable-runtime-udf --enable-table-functions -p 16274 --http-port 16278 --calcite-port 16279
 #
-#   2. Relate the omniscidb server to client:
+#   2. Point the client at the heavydb server:
 #
-#      export OMNISCI_CLIENT_CONF=utils/client_ssh_tunnel.conf
+#      export HEAVYDB_CLIENT_CONF=utils/client_ssh_tunnel.conf
 #
 
 [server]

From cb171eda4a7ecb20b2f9fe52372d3398d32a54a4 Mon Sep 17 00:00:00 2001
From: Guilherme Leobas <guilhermeleobas@gmail.com>
Date: Thu, 9 Mar 2023 16:14:45 -0300
Subject: [PATCH 3/3] correctly inherit target from numba CPU or GPU
 implementation

---
 rbc/heavydb/buffer.py                  |   3 +-
 rbc/heavydb/extending.py               |   8 -
 rbc/heavydb/heavydb_compiler.py        |  92 +++---
 rbc/heavydb/mathimpl.py                | 372 ++++++++++++-------------
 rbc/heavydb/remoteheavydb.py           |   2 +-
 rbc/irtools.py                         |  10 +-
 rbc/tests/heavydb/test_column_basic.py |  14 -
 rbc/tests/heavydb/test_math.py         |   9 +-
 8 files changed, 249 insertions(+), 261 deletions(-)
 delete mode 100644 rbc/heavydb/extending.py

diff --git a/rbc/heavydb/buffer.py b/rbc/heavydb/buffer.py
index d567852e..1359d1c1 100644
--- a/rbc/heavydb/buffer.py
+++ b/rbc/heavydb/buffer.py
@@ -30,8 +30,7 @@
 import numpy as np
 from rbc import typesystem
 from rbc.targetinfo import TargetInfo
-from rbc.heavydb import extending
-from numba.core import datamodel, cgutils, types, imputils
+from numba.core import datamodel, cgutils, types, imputils, extending
 
 int8_t = ir.IntType(8)
 int32_t = ir.IntType(32)
diff --git a/rbc/heavydb/extending.py b/rbc/heavydb/extending.py
deleted file mode 100644
index 7a97b8a0..00000000
--- a/rbc/heavydb/extending.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from functools import partial
-from numba.core import extending
-from numba.core.extending import lower_builtin
-
-overload = partial(extending.overload, target='generic')
-overload_method = partial(extending.overload_method, target='generic')
-overload_attribute = partial(extending.overload_attribute, target='generic')
-intrinsic = partial(extending.intrinsic, target='generic')
diff --git a/rbc/heavydb/heavydb_compiler.py b/rbc/heavydb/heavydb_compiler.py
index 3412edfb..5865a51d 100644
--- a/rbc/heavydb/heavydb_compiler.py
+++ b/rbc/heavydb/heavydb_compiler.py
@@ -1,21 +1,18 @@
 import warnings
-from functools import partial
 from contextlib import contextmanager
+from functools import partial
+
 import llvmlite.binding as llvm
-from rbc.targetinfo import TargetInfo
-from numba.np import ufunc_db
 from numba import _dynfunc, njit
-from numba.core import (
-    codegen, compiler_lock, typing,
-    base, cpu, utils, descriptors,
-    dispatcher, callconv, imputils,
-    options,)
-from numba.core.target_extension import (
-    Generic,
-    target_registry,
-    dispatcher_registry,
-    jit_registry,
-)
+from numba.core import (base, callconv, codegen, compiler_lock, cpu,
+                        descriptors, dispatcher, imputils, options, typing,
+                        utils)
+from numba.core.target_extension import (Generic, dispatcher_registry,
+                                         jit_registry, target_registry)
+from numba.cuda.target import CUDATypingContext
+from numba.np import ufunc_db
+
+from rbc.targetinfo import TargetInfo
 
 
 class HeavyDB_CPU(Generic):
@@ -41,8 +38,9 @@ def custom_jit(*args, target=None, **kwargs):
 jit_registry[target_registry['heavydb_cpu']] = partial(custom_jit, target='heavydb_cpu')
 jit_registry[target_registry['heavydb_gpu']] = partial(custom_jit, target='heavydb_gpu')
 
-heavydb_cpu_registry = imputils.Registry(name='heavydb_cpu_registry')
-heavydb_gpu_registry = imputils.Registry(name='heavydb_gpu_registry')
+
+# heavydb_cpu_registry = imputils.Registry(name='heavydb_cpu_registry')
+# heavydb_gpu_registry = imputils.Registry(name='heavydb_gpu_registry')
 
 
 class _NestedContext(object):
@@ -101,7 +99,8 @@ def _toplevel_target_context(self):
     @utils.cached_property
     def _toplevel_typing_context(self):
         # Lazily-initialized top-level typing context, for all threads
-        return JITRemoteTypingContext()
+        return {'heavydb_cpu': JITRemoteCPUTypingContext,
+                'heavydb_gpu': JITRemoteGPUTypingContext}[self._target_name]()
 
     @property
     def target_context(self):
@@ -240,14 +239,14 @@ def set_env(self, env_name, env):
         return None
 
 
-class JITRemoteTypingContext(typing.Context):
+class JITRemoteCPUTypingContext(typing.Context):
     """JITRemote Typing Context
     """
 
-    # def load_additional_registries(self):
-    #     from . import mathimpl
-    #     self.install_registry(mathimpl.registry)
-    #     return super().load_additional_registries()
+
+class JITRemoteGPUTypingContext(CUDATypingContext):
+    """JITRemote GPU Typing Context (extends numba's CUDATypingContext)
+    """
 
 
 class JITRemoteTargetContext(base.BaseContext):
@@ -268,43 +267,53 @@ def init(self):
         self._target_data = llvm.create_target_data(target_info.datalayout)
 
     def refresh(self):
-        if self.target_name == 'heavydb_cpu':
-            registry = heavydb_cpu_registry
-        else:
-            registry = heavydb_gpu_registry
-
-        try:
-            loader = self._registries[registry]
-        except KeyError:
-            loader = imputils.RegistryLoader(registry)
-            self._registries[registry] = loader
-
-        self.install_registry(registry)
+        # if self.target_name == 'heavydb_cpu':
+        #     registry = heavydb_cpu_registry
+        # else:
+        #     registry = heavydb_gpu_registry
+
+        # try:
+        #     loader = self._registries[registry]
+        # except KeyError:
+        #     loader = imputils.RegistryLoader(registry)
+        #     self._registries[registry] = loader
+
+        # self.install_registry(registry)
         # Also refresh typing context, since @overload declarations can
         # affect it.
-        self.typing_context.refresh()
+        # self.typing_context.refresh()
         super().refresh()
 
     def load_additional_registries(self):
         # Add implementations that work via import
-        from numba.cpython import (builtins, charseq, enumimpl, hashing, heapq,  # noqa: F401
-                                   iterators, listobj, numbers, rangeobj,
-                                   setobj, slicing, tupleobj, unicode,)
+        from numba.cpython import (builtins, charseq, enumimpl,  # noqa: F401
+                                   hashing, heapq, iterators, listobj, numbers,
+                                   rangeobj, setobj, slicing, tupleobj,
+                                   unicode)
 
         self.install_registry(imputils.builtin_registry)
 
         # uncomment as needed!
         # from numba.core import optional
-        from numba.np import linalg, polynomial
+        # from numba.np import linalg, polynomial
         # from numba.typed import typeddict, dictimpl
         # from numba.typed import typedlist, listobject
         # from numba.experimental import jitclass, function_type
         # from numba.np import npdatetime
-        from numba.np import arraymath, arrayobj  # noqa: F401
+        # from numba.np import arraymath, arrayobj  # noqa: F401
+
+        # from rbc.heavydb import mathimpl
 
         # Add target specific implementations
-        from numba.np import npyimpl
         from numba.cpython import mathimpl
+        from numba.cuda import mathimpl as cuda_mathimpl
+        from numba.np import npyimpl
+
+        if self.target_name == 'heavydb_cpu':
+            self.install_registry(npyimpl.registry)
+            self.install_registry(mathimpl.registry)
+        else:
+            self.install_registry(cuda_mathimpl.registry)
         # from numba.cpython import cmathimpl, mathimpl, printimpl, randomimpl
         # from numba.misc import cffiimpl
         # from numba.experimental.jitclass.base import ClassBuilder as \
@@ -312,7 +321,6 @@ def load_additional_registries(self):
         # self.install_registry(cmathimpl.registry)
         # self.install_registry(cffiimpl.registry)
         # self.install_registry(mathimpl.registry)
-        self.install_registry(npyimpl.registry)
         # self.install_registry(printimpl.registry)
         # self.install_registry(randomimpl.registry)
         # self.install_registry(jitclassimpl.class_impl_registry)
diff --git a/rbc/heavydb/mathimpl.py b/rbc/heavydb/mathimpl.py
index 47841ca1..ce6f4377 100644
--- a/rbc/heavydb/mathimpl.py
+++ b/rbc/heavydb/mathimpl.py
@@ -1,203 +1,203 @@
-# import warnings
-# import math
-# from rbc.externals import gen_codegen
-# from numba.core.typing.templates import ConcreteTemplate, signature, Registry
-# from numba.types import float32, float64, int32, int64, uint64, intp
-# from numba.core.intrinsics import INTR_TO_CMATH
+import warnings
+import math
+from rbc.externals import gen_codegen
+from numba.core.typing.templates import ConcreteTemplate, signature, infer_global
+from numba.core.types import float32, float64, int32, int64, uint64, intp
+from numba.core.intrinsics import INTR_TO_CMATH
+from numba.core.extending import lower_builtin as lower_cpu
+from numba.cuda.mathimpl import lower as lower_gpu  # noqa: F401
 # from .heavydb_compiler import heavydb_cpu_registry, heavydb_gpu_registry
 
-
 # lower_cpu = heavydb_cpu_registry.lower
 # lower_gpu = heavydb_gpu_registry.lower
 
-
 # registry = Registry()
 # infer_global = registry.register_global
 
 
-# # Adding missing cases in Numba
-# @infer_global(math.log2)  # noqa: E302
-# class Math_unary(ConcreteTemplate):
-#     cases = [
-#         signature(float64, int64),
-#         signature(float64, uint64),
-#         signature(float32, float32),
-#         signature(float64, float64),
-#     ]
-
-
-# @infer_global(math.remainder)
-# class Math_remainder(ConcreteTemplate):
-#     cases = [
-#         signature(float32, float32, float32),
-#         signature(float64, float64, float64),
-#     ]
-
-
-# @infer_global(math.floor)
-# @infer_global(math.trunc)
-# @infer_global(math.ceil)
-# class Math_converter(ConcreteTemplate):
-#     cases = [
-#         signature(intp, intp),
-#         signature(int64, int64),
-#         signature(uint64, uint64),
-#         signature(float32, float32),
-#         signature(float64, float64),
-#     ]
-
-
-# booleans = []
-# booleans += [("isnand", "isnanf", math.isnan)]
-# booleans += [("isinfd", "isinff", math.isinf)]
-# booleans += [("isfinited", "finitef", math.isfinite)]
-
-# unarys = []
-# unarys += [("ceil", "ceilf", math.ceil)]
-# unarys += [("floor", "floorf", math.floor)]
-# unarys += [("fabs", "fabsf", math.fabs)]
-# unarys += [("exp", "expf", math.exp)]
-# unarys += [("expm1", "expm1f", math.expm1)]
-# unarys += [("erf", "erff", math.erf)]
-# unarys += [("erfc", "erfcf", math.erfc)]
-# unarys += [("tgamma", "tgammaf", math.gamma)]
-# unarys += [("lgamma", "lgammaf", math.lgamma)]
-# unarys += [("sqrt", "sqrtf", math.sqrt)]
-# unarys += [("log", "logf", math.log)]
-# unarys += [("log2", "log2f", math.log2)]
-# unarys += [("log10", "log10f", math.log10)]
-# unarys += [("log1p", "log1pf", math.log1p)]
-# unarys += [("acosh", "acoshf", math.acosh)]
-# unarys += [("acos", "acosf", math.acos)]
-# unarys += [("cos", "cosf", math.cos)]
-# unarys += [("cosh", "coshf", math.cosh)]
-# unarys += [("asinh", "asinhf", math.asinh)]
-# unarys += [("asin", "asinf", math.asin)]
-# unarys += [("sin", "sinf", math.sin)]
-# unarys += [("sinh", "sinhf", math.sinh)]
-# unarys += [("atan", "atanf", math.atan)]
-# unarys += [("atanh", "atanhf", math.atanh)]
-# unarys += [("tan", "tanf", math.tan)]
-# unarys += [("tanh", "tanhf", math.tanh)]
-# unarys += [("trunc", "truncf", math.trunc)]
-
-# binarys = []
-# binarys += [("copysign", "copysignf", math.copysign)]
-# binarys += [("atan2", "atan2f", math.atan2)]
-# binarys += [("fmod", "fmodf", math.fmod)]
-# binarys += [("hypot", "hypotf", math.hypot)]
-# binarys += [("remainder", "remainderf", math.remainder)]
-
-
-# rbc_INTR_TO_CMATH = {
-#     "powf": "llvm.pow.f32",
-#     "pow": "llvm.pow.f64",
-
-#     "sinf": "llvm.sin.f32",
-#     "sin": "llvm.sin.f64",
-
-#     "cosf": "llvm.cos.f32",
-#     "cos": "llvm.cos.f64",
-
-#     "sqrtf": "llvm.sqrt.f32",
-#     "sqrt": "llvm.sqrt.f64",
-
-#     "expf": "llvm.exp.f32",
-#     "exp": "llvm.exp.f64",
-
-#     "logf": "llvm.log.f32",
-#     "log": "llvm.log.f64",
-
-#     "log10f": "llvm.log10.f32",
-#     "log10": "llvm.log10.f64",
-
-#     "fabsf": "llvm.fabs.f32",
-#     "fabs": "llvm.fabs.f64",
-
-#     "floorf": "llvm.floor.f32",
-#     "floor": "llvm.floor.f64",
-
-#     "ceilf": "llvm.ceil.f32",
-#     "ceil": "llvm.ceil.f64",
-
-#     "truncf": "llvm.trunc.f32",
-#     "trunc": "llvm.trunc.f64",
-# }
-
-
-# if len(rbc_INTR_TO_CMATH) != len(INTR_TO_CMATH):
-#     warnings.warn("List of intrinsics is outdated! Please update!")
-
-
-# def impl_unary(fname, key, typ):
-#     if fname in rbc_INTR_TO_CMATH.keys():
-#         # use llvm intrinsics when possible
-#         cpu = gen_codegen(rbc_INTR_TO_CMATH.get(fname))
-#     else:
-#         cpu = gen_codegen(fname)
-#     gpu = gen_codegen(f"__nv_{fname}")
-#     lower_cpu(key, typ)(cpu)
-#     lower_gpu(key, typ)(gpu)
-
-
-# def impl_binary(fname, key, typ):
-#     if fname in rbc_INTR_TO_CMATH.keys():
-#         # use llvm intrinsics when possible
-#         cpu = gen_codegen(rbc_INTR_TO_CMATH.get(fname))
-#     else:
-#         cpu = gen_codegen(fname)
-#     gpu = gen_codegen(f"__nv_{fname}")
-#     lower_cpu(key, typ, typ)(cpu)
-#     lower_gpu(key, typ, typ)(gpu)
-
-
-# for fname64, fname32, key in unarys:
-#     impl_unary(fname64, key, float64)
-#     impl_unary(fname32, key, float32)
-
-
-# for fname64, fname32, key in binarys:
-#     impl_binary(fname64, key, float64)
-#     impl_binary(fname32, key, float32)
-
-
-# # manual mapping
-# def impl_ldexp():
-#     # cpu
-#     ldexp_cpu = gen_codegen('ldexp')
-#     ldexpf_cpu = gen_codegen('ldexpf')
-#     lower_cpu(math.ldexp, float64, int32)(ldexp_cpu)
-#     lower_cpu(math.ldexp, float32, int32)(ldexpf_cpu)
-
-#     # gpu
-#     ldexp_gpu = gen_codegen('__nv_ldexp')
-#     ldexpf_gpu = gen_codegen('__nv_ldexpf')
-#     lower_gpu(math.ldexp, float64, int32)(ldexp_gpu)
-#     lower_gpu(math.ldexp, float32, int32)(ldexpf_gpu)
+# Typing templates for math signatures that are missing in Numba
+@infer_global(math.log2)  # noqa: E302
+class Math_unary(ConcreteTemplate):
+    cases = [
+        signature(float64, int64),
+        signature(float64, uint64),
+        signature(float32, float32),
+        signature(float64, float64),
+    ]
+
+
+@infer_global(math.remainder)
+class Math_remainder(ConcreteTemplate):
+    cases = [
+        signature(float32, float32, float32),
+        signature(float64, float64, float64),
+    ]
+
+
+@infer_global(math.floor)
+@infer_global(math.trunc)
+@infer_global(math.ceil)
+class Math_converter(ConcreteTemplate):
+    cases = [
+        signature(intp, intp),
+        signature(int64, int64),
+        signature(uint64, uint64),
+        signature(float32, float32),
+        signature(float64, float64),
+    ]
+
+
+booleans = []
+booleans += [("isnand", "isnanf", math.isnan)]
+booleans += [("isinfd", "isinff", math.isinf)]
+booleans += [("isfinited", "finitef", math.isfinite)]
+
+unarys = []
+unarys += [("ceil", "ceilf", math.ceil)]
+unarys += [("floor", "floorf", math.floor)]
+unarys += [("fabs", "fabsf", math.fabs)]
+unarys += [("exp", "expf", math.exp)]
+unarys += [("expm1", "expm1f", math.expm1)]
+unarys += [("erf", "erff", math.erf)]
+unarys += [("erfc", "erfcf", math.erfc)]
+unarys += [("tgamma", "tgammaf", math.gamma)]
+unarys += [("lgamma", "lgammaf", math.lgamma)]
+unarys += [("sqrt", "sqrtf", math.sqrt)]
+unarys += [("log", "logf", math.log)]
+unarys += [("log2", "log2f", math.log2)]
+unarys += [("log10", "log10f", math.log10)]
+unarys += [("log1p", "log1pf", math.log1p)]
+unarys += [("acosh", "acoshf", math.acosh)]
+unarys += [("acos", "acosf", math.acos)]
+unarys += [("cos", "cosf", math.cos)]
+unarys += [("cosh", "coshf", math.cosh)]
+unarys += [("asinh", "asinhf", math.asinh)]
+unarys += [("asin", "asinf", math.asin)]
+unarys += [("sin", "sinf", math.sin)]
+unarys += [("sinh", "sinhf", math.sinh)]
+unarys += [("atan", "atanf", math.atan)]
+unarys += [("atanh", "atanhf", math.atanh)]
+unarys += [("tan", "tanf", math.tan)]
+unarys += [("tanh", "tanhf", math.tanh)]
+unarys += [("trunc", "truncf", math.trunc)]
+
+binarys = []
+binarys += [("copysign", "copysignf", math.copysign)]
+binarys += [("atan2", "atan2f", math.atan2)]
+binarys += [("fmod", "fmodf", math.fmod)]
+binarys += [("hypot", "hypotf", math.hypot)]
+binarys += [("remainder", "remainderf", math.remainder)]
+
+
+rbc_INTR_TO_CMATH = {
+    "powf": "llvm.pow.f32",
+    "pow": "llvm.pow.f64",
+
+    "sinf": "llvm.sin.f32",
+    "sin": "llvm.sin.f64",
+
+    "cosf": "llvm.cos.f32",
+    "cos": "llvm.cos.f64",
+
+    "sqrtf": "llvm.sqrt.f32",
+    "sqrt": "llvm.sqrt.f64",
+
+    "expf": "llvm.exp.f32",
+    "exp": "llvm.exp.f64",
+
+    "logf": "llvm.log.f32",
+    "log": "llvm.log.f64",
+
+    "log10f": "llvm.log10.f32",
+    "log10": "llvm.log10.f64",
+
+    "fabsf": "llvm.fabs.f32",
+    "fabs": "llvm.fabs.f64",
+
+    "floorf": "llvm.floor.f32",
+    "floor": "llvm.floor.f64",
+
+    "ceilf": "llvm.ceil.f32",
+    "ceil": "llvm.ceil.f64",
+
+    "truncf": "llvm.trunc.f32",
+    "trunc": "llvm.trunc.f64",
+}
+
+
+if len(rbc_INTR_TO_CMATH) != len(INTR_TO_CMATH):
+    warnings.warn("List of intrinsics is outdated! Please update!")
+
+
+def impl_unary(fname, key, typ):
+    if fname in rbc_INTR_TO_CMATH.keys():
+        # use llvm intrinsics when possible
+        cpu = gen_codegen(rbc_INTR_TO_CMATH.get(fname))
+    else:
+        cpu = gen_codegen(fname)
+    # gpu = gen_codegen(f"__nv_{fname}")
+    lower_cpu(key, typ)(cpu)
+    # lower_gpu(key, typ)(gpu)
+
+
+def impl_binary(fname, key, typ):
+    if fname in rbc_INTR_TO_CMATH.keys():
+        # use llvm intrinsics when possible
+        cpu = gen_codegen(rbc_INTR_TO_CMATH.get(fname))
+    else:
+        cpu = gen_codegen(fname)
+    # gpu = gen_codegen(f"__nv_{fname}")
+    lower_cpu(key, typ, typ)(cpu)
+    # lower_gpu(key, typ, typ)(gpu)
+
+
+for fname64, fname32, key in unarys:
+    impl_unary(fname64, key, float64)
+    impl_unary(fname32, key, float32)
+
+
+for fname64, fname32, key in binarys:
+    impl_binary(fname64, key, float64)
+    impl_binary(fname32, key, float32)
+
+
+# manual mapping
+def impl_ldexp():
+    # cpu
+    ldexp_cpu = gen_codegen('ldexp')
+    ldexpf_cpu = gen_codegen('ldexpf')
+    lower_cpu(math.ldexp, float64, int32)(ldexp_cpu)
+    lower_cpu(math.ldexp, float32, int32)(ldexpf_cpu)
+
+    # gpu
+    # ldexp_gpu = gen_codegen('__nv_ldexp')
+    # ldexpf_gpu = gen_codegen('__nv_ldexpf')
+    # lower_gpu(math.ldexp, float64, int32)(ldexp_gpu)
+    # lower_gpu(math.ldexp, float32, int32)(ldexpf_gpu)
 
 
-# def impl_pow():
-#     # cpu
-#     pow_cpu = gen_codegen('pow')
-#     powf_cpu = gen_codegen('powf')
-#     lower_cpu(math.pow, float64, float64)(pow_cpu)
-#     lower_cpu(math.pow, float32, float32)(powf_cpu)
-#     lower_cpu(math.pow, float64, int32)(pow_cpu)
-#     lower_cpu(math.pow, float32, int32)(powf_cpu)
+def impl_pow():
+    # cpu
+    pow_cpu = gen_codegen('pow')
+    powf_cpu = gen_codegen('powf')
+    lower_cpu(math.pow, float64, float64)(pow_cpu)
+    lower_cpu(math.pow, float32, float32)(powf_cpu)
+    lower_cpu(math.pow, float64, int32)(pow_cpu)
+    lower_cpu(math.pow, float32, int32)(powf_cpu)
 
-#     # gpu
-#     pow_gpu = gen_codegen('__nv_pow')
-#     powf_gpu = gen_codegen('__nv_powf')
-#     powi_gpu = gen_codegen('__nv_powi')
-#     powif_gpu = gen_codegen('__nv_powif')
-#     lower_gpu(math.pow, float64, float64)(pow_gpu)
-#     lower_gpu(math.pow, float32, float32)(powf_gpu)
-#     lower_gpu(math.pow, float64, int32)(powi_gpu)
-#     lower_gpu(math.pow, float32, int32)(powif_gpu)
+    # gpu
+    # pow_gpu = gen_codegen('__nv_pow')
+    # powf_gpu = gen_codegen('__nv_powf')
+    # powi_gpu = gen_codegen('__nv_powi')
+    # powif_gpu = gen_codegen('__nv_powif')
+    # lower_gpu(math.pow, float64, float64)(pow_gpu)
+    # lower_gpu(math.pow, float32, float32)(powf_gpu)
+    # lower_gpu(math.pow, float64, int32)(powi_gpu)
+    # lower_gpu(math.pow, float32, int32)(powif_gpu)
 
 
-# impl_ldexp()
-# impl_pow()
+impl_ldexp()
+impl_pow()
 
 
 # CPU only:
diff --git a/rbc/heavydb/remoteheavydb.py b/rbc/heavydb/remoteheavydb.py
index c4da568d..5b272d86 100644
--- a/rbc/heavydb/remoteheavydb.py
+++ b/rbc/heavydb/remoteheavydb.py
@@ -1102,7 +1102,7 @@ def retrieve_targets(self):
                 target_info.add_library('stdlib')
                 target_info.add_library('heavydb')
             elif target_info.is_gpu:
-                if self.version < (6, 2):
+                if self.version < (6, 4):
                     # BC note: older heavydb versions do not define
                     # has_libdevice and assume that libdevice exists
                     self.has_cuda_libdevice = True
diff --git a/rbc/irtools.py b/rbc/irtools.py
index 881690eb..4701b17c 100644
--- a/rbc/irtools.py
+++ b/rbc/irtools.py
@@ -274,8 +274,11 @@ def compile_to_LLVM(functions_and_signatures,
     # * remotejit imports irtools
     # * irtools import heavydb
     # * heavydb import remotejit
-    from rbc.heavydb import JITRemoteTypingContext, JITRemoteTargetContext, \
-        heavydb_cpu_target, heavydb_gpu_target
+    from rbc.heavydb import (JITRemoteCPUTypingContext,
+                             JITRemoteGPUTypingContext,
+                             JITRemoteTargetContext,
+                             heavydb_cpu_target,
+                             heavydb_gpu_target)
 
     device = target_info.name
     software = target_info.software[0]
@@ -283,7 +286,8 @@ def compile_to_LLVM(functions_and_signatures,
     if software == 'HeavyDB':
         target_name = f'heavydb_{device}'
         target_desc = heavydb_cpu_target if device == 'cpu' else heavydb_gpu_target
-        typing_context = JITRemoteTypingContext()
+        typing_context = JITRemoteCPUTypingContext() if device == 'cpu' else \
+            JITRemoteGPUTypingContext()
         target_context = JITRemoteTargetContext(typing_context, target_name)
     else:
         target_name = 'cpu'
diff --git a/rbc/tests/heavydb/test_column_basic.py b/rbc/tests/heavydb/test_column_basic.py
index f44dced4..b9ef7f2d 100644
--- a/rbc/tests/heavydb/test_column_basic.py
+++ b/rbc/tests/heavydb/test_column_basic.py
@@ -892,17 +892,3 @@ def col_enumerate(x, y):
         f'select rowid, i4 from {heavydb.table_name} order by rowid;')
     for (r,), (_, e) in zip(list(result), list(expected_result)):
         assert r == e
-
-
-def test_foo(heavydb):
-    from rbc.externals.heavydb import set_output_row_size
-
-    @heavydb('int32(Column<int32>, OutputColumn<int32>)')
-    def col_enumerate(x, y):
-        sz = len(x)
-        # set_output_row_size(sz)
-        # for i, e in enumerate(x):
-        #     y[i] = e
-        return sz
-
-    heavydb.register()
diff --git a/rbc/tests/heavydb/test_math.py b/rbc/tests/heavydb/test_math.py
index 7ad019e2..faa07527 100644
--- a/rbc/tests/heavydb/test_math.py
+++ b/rbc/tests/heavydb/test_math.py
@@ -4,7 +4,6 @@
 import numpy as np
 
 import rbc.heavydb as rbc_heavydb
-from rbc.heavydb.mathimpl import rbc_INTR_TO_CMATH
 from rbc.stdlib import array_api
 from rbc.tests import heavydb_fixture
 
@@ -40,7 +39,7 @@ def heavydb():
     ('modf', 'double(double, double)'),
     ('perm', 'int(int, int)'),
     ('prod', 'int64(int64[])'),
-    ('remainder', 'double(double, double)'),
+    # ('remainder', 'double(double, double)'),
     ('trunc', 'double(double)'),
     # Power and logarithmic functions
     ('exp', 'double(double)'),
@@ -233,9 +232,9 @@ def test_math_function(heavydb, device, fn_name, signature):
     ('ldexp', 'double(double, int)', np.ldexp),
     ('frexp0', 'double(double)', lambda x: np.frexp(x)[0]),
     # Rounding functions:
-    ('around', 'double(double)', lambda x: np.around(x)),
-    ('round2',  # round and round_ are not good names
-     'double(double)', lambda x: np.round_(x)),  # force arity to 1
+    # ('around', 'double(double)', lambda x: np.around(x)),
+    # ('round2',  # round and round_ are not good names
+    #  'double(double)', lambda x: np.round_(x)),  # force arity to 1
     ('floor', 'double(double)', np.floor),
     ('ceil', 'double(double)', np.ceil),
     ('trunc', 'double(double)', np.trunc),